From 62ed11a5cabc626c9b9b54df0e28cd713aaf44e2 Mon Sep 17 00:00:00 2001
From: Cristian <cristian@swapps.com>
Date: Thu, 24 Sep 2020 08:37:27 -0500
Subject: [PATCH] fix: Improve headers handling

---
 archivebox/config/__init__.py    |  1 +
 archivebox/extractors/headers.py |  3 ++-
 archivebox/util.py               |  4 ++++
 tests/fixtures.py                |  1 +
 tests/mock_server/server.py      | 22 +++++++++++++++++++++-
 tests/test_extractors.py         | 32 +++++++++++++++++++++++++++++++-
 6 files changed, 60 insertions(+), 3 deletions(-)
diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py
index 37e3c906..4cd78609 100644
--- a/archivebox/config/__init__.py
+++ b/archivebox/config/__init__.py
@@ -85,6 +85,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SAVE_PDF':                 {'type': bool,  'default': True, 'aliases': ('FETCH_PDF',)},
         'SAVE_SCREENSHOT':          {'type': bool,  'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
         'SAVE_DOM':                 {'type': bool,  'default': True, 'aliases': ('FETCH_DOM',)},
+        'SAVE_HEADERS':             {'type': bool,  'default': True, 'aliases': ('FETCH_HEADERS',)},
         'SAVE_WARC':                {'type': bool,  'default': True, 'aliases': ('FETCH_WARC',)},
         'SAVE_GIT':                 {'type': bool,  'default': True, 'aliases': ('FETCH_GIT',)},
         'SAVE_MEDIA':               {'type': bool,  'default': True, 'aliases': ('FETCH_MEDIA',)},
diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py
index 180055d9..533e6bb3 100644
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -16,6 +16,7 @@ from ..config import (
     CURL_USER_AGENT,
     CURL_VERSION,
     CHECK_SSL_VALIDITY,
+    SAVE_HEADERS
 )
 from ..logging_util import TimedProgress
 
@@ -24,7 +25,7 @@ def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or link.link_dir
 
     output = Path(out_dir or link.link_dir) / 'headers.json'
-    return not output.exists()
+    return not output.exists() and SAVE_HEADERS
 
 
 @enforce_types
diff --git a/archivebox/util.py b/archivebox/util.py
index c94b8043..fca3de80 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -186,13 +186,17 @@ def get_headers(url: str, timeout: int=None) -> str:
             headers={'User-Agent': WGET_USER_AGENT},
             verify=CHECK_SSL_VALIDITY,
             timeout=timeout,
+            allow_redirects=True
         )
+        if response.status_code >= 400:
+            raise RequestException
     except RequestException:
         response = requests.get(
             url,
             headers={'User-Agent': WGET_USER_AGENT},
             verify=CHECK_SSL_VALIDITY,
             timeout=timeout,
+            stream=True
         )
     
     return pyjson.dumps(dict(response.headers), indent=4)
diff --git a/tests/fixtures.py b/tests/fixtures.py
index b423c076..6dd4cb28 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -20,6 +20,7 @@ def disable_extractors_dict():
         "SAVE_PDF": "false",
         "SAVE_SCREENSHOT": "false",
         "SAVE_DOM": "false",
+        "SAVE_HEADERS": "false",
         "USE_GIT": "false",
         "SAVE_MEDIA": "false",
         "SAVE_ARCHIVE_DOT_ORG": "false"
diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py
index c819267c..9e5bea05 100644
--- a/tests/mock_server/server.py
+++ b/tests/mock_server/server.py
@@ -2,7 +2,7 @@ from os.path import abspath
 from os import getcwd
 from pathlib import Path
 
-from bottle import route, run, static_file, response
+from bottle import route, run, static_file, response, redirect
 
 @route("/")
 def index():
@@ -30,5 +30,25 @@ def static_path_with_headers(filename):
     response.add_header("Content-Style-Type", "text/css")
     return response
 
+@route("/static/400/<filename>", method="HEAD")
+def static_400(filename):
+    """Answer HEAD requests for a template file with the status forced to 400 and a Status-Code: 400 header."""
+    resp = static_file(filename, root=abspath(getcwd()) / Path("tests/mock_server/templates"))
+    resp.status = 400
+    resp.add_header("Status-Code", "400")
+    return resp
+
+@route("/static/400/<filename>", method="GET")
+def static_200(filename):
+    """Answer GET requests on /static/400/<filename> with the template file plus a Status-Code: 200 header."""
+    resp = static_file(filename, root=abspath(getcwd()) / Path("tests/mock_server/templates"))
+    resp.add_header("Status-Code", "200")
+    return resp
+
+@route("/redirect/headers/<filename>")
+def redirect_to_static(filename):  # mock endpoint: redirects to /static/headers/<filename>
+    redirect(f"/static/headers/{filename}")  # braces (not "$") interpolate the path parameter
+
+
 def start():
     run(host='localhost', port=8080)
\ No newline at end of file
diff --git a/tests/test_extractors.py b/tests/test_extractors.py
index 24b750ed..b467f0e1 100644
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -71,7 +71,15 @@ def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, d
     assert "> singlefile" not in output_str
     assert "> readability" not in output_str
 
-def test_headers(tmp_path, process, disable_extractors_dict):
+def test_headers_ignored(tmp_path, process, disable_extractors_dict):
+    """With the fixture env untouched (SAVE_HEADERS is "false" there), no headers.json may be written."""
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
+                   capture_output=True, env=disable_extractors_dict)
+    archived_item = list(tmp_path.glob("archive/**/*"))[0]
+    assert not (archived_item / "headers.json").exists()
+
+def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
                                   capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
@@ -83,3 +91,25 @@ def test_headers(tmp_path, process, disable_extractors_dict):
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'
     assert headers['Content-Style-Type'] == 'text/css'
+
+def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):
+    """Archiving a /redirect/headers/ URL with SAVE_HEADERS on must store the Content-* headers checked below."""
+    disable_extractors_dict["SAVE_HEADERS"] = "true"
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/redirect/headers/example.com.html'],
+                   capture_output=True, env=disable_extractors_dict)
+    headers_file = list(tmp_path.glob("archive/**/*"))[0] / "headers.json"
+    with open(headers_file) as f:
+        headers = pyjson.load(f)
+    assert headers['Content-Language'] == 'en'
+    assert headers['Content-Script-Type'] == 'text/javascript'
+    assert headers['Content-Style-Type'] == 'text/css'
+
+def test_headers_400_plus(tmp_path, process, disable_extractors_dict):
+    """For /static/400/ (mock HEAD route sets 400, GET route tags 200), headers.json must hold Status-Code 200."""
+    disable_extractors_dict["SAVE_HEADERS"] = "true"
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/400/example.com.html'],
+                   capture_output=True, env=disable_extractors_dict)
+    headers_file = list(tmp_path.glob("archive/**/*"))[0] / "headers.json"
+    with open(headers_file) as f:
+        headers = pyjson.load(f)
+    assert headers["Status-Code"] == "200"
\ No newline at end of file