From 62ed11a5cabc626c9b9b54df0e28cd713aaf44e2 Mon Sep 17 00:00:00 2001
From: Cristian
Date: Thu, 24 Sep 2020 08:37:27 -0500
Subject: [PATCH] fix: Improve headers handling

---
Review notes (text between "---" and the diffstat is ignored by `git am`):
- SAVE_HEADERS (alias FETCH_HEADERS, default True) now gates
  should_save_headers().
- get_headers(): the first request follows redirects; a status >= 400 raises
  RequestException so the existing fallback GET (now streamed) is used.
- Mock server: the new routes declare the <filename> wildcard that their
  handlers take as an argument, and redirect_to_static() interpolates
  {filename} (it previously redirected to the literal "$filename").
- Whitespace was reconstructed from a collapsed copy of this patch; apply with
  `git apply --ignore-whitespace` if context alignment differs. The index line
  for tests/mock_server/server.py predates the {filename} fix.

 archivebox/config/__init__.py    |  1 +
 archivebox/extractors/headers.py |  3 ++-
 archivebox/util.py               |  4 ++++
 tests/fixtures.py                |  1 +
 tests/mock_server/server.py      | 22 +++++++++++++++++++++-
 tests/test_extractors.py         | 32 +++++++++++++++++++++++++++++++-
 6 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py
index 37e3c906..4cd78609 100644
--- a/archivebox/config/__init__.py
+++ b/archivebox/config/__init__.py
@@ -85,6 +85,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
         'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
         'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
+        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
         'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
         'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
         'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py
index 180055d9..533e6bb3 100644
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -16,6 +16,7 @@ from ..config import (
     CURL_USER_AGENT,
     CURL_VERSION,
     CHECK_SSL_VALIDITY,
+    SAVE_HEADERS
 )
 
 from ..logging_util import TimedProgress
@@ -24,7 +25,7 @@ def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or link.link_dir
 
     output = Path(out_dir or link.link_dir) / 'headers.json'
-    return not output.exists()
+    return not output.exists() and SAVE_HEADERS
 
 
 @enforce_types
diff --git a/archivebox/util.py b/archivebox/util.py
index c94b8043..fca3de80 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -186,13 +186,17 @@ def get_headers(url: str, timeout: int=None) -> str:
             headers={'User-Agent': WGET_USER_AGENT},
             verify=CHECK_SSL_VALIDITY,
             timeout=timeout,
+            allow_redirects=True
         )
+        if response.status_code >= 400:
+            raise RequestException
     except RequestException:
         response = requests.get(
             url,
             headers={'User-Agent': WGET_USER_AGENT},
             verify=CHECK_SSL_VALIDITY,
             timeout=timeout,
+            stream=True
         )
 
     return pyjson.dumps(dict(response.headers), indent=4)
diff --git a/tests/fixtures.py b/tests/fixtures.py
index b423c076..6dd4cb28 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -20,6 +20,7 @@ def disable_extractors_dict():
         "SAVE_PDF": "false",
         "SAVE_SCREENSHOT": "false",
         "SAVE_DOM": "false",
+        "SAVE_HEADERS": "false",
         "USE_GIT": "false",
         "SAVE_MEDIA": "false",
         "SAVE_ARCHIVE_DOT_ORG": "false"
diff --git a/tests/mock_server/server.py b/tests/mock_server/server.py
index c819267c..9e5bea05 100644
--- a/tests/mock_server/server.py
+++ b/tests/mock_server/server.py
@@ -2,7 +2,7 @@ from os.path import abspath
 from os import getcwd
 from pathlib import Path
 
-from bottle import route, run, static_file, response
+from bottle import route, run, static_file, response, redirect
 
 @route("/")
 def index():
@@ -30,5 +30,25 @@ def static_path_with_headers(filename):
     response.add_header("Content-Style-Type", "text/css")
     return response
 
+@route("/static/400/<filename>", method="HEAD")
+def static_400(filename):
+    template_path = abspath(getcwd()) / Path("tests/mock_server/templates")
+    response = static_file(filename, root=template_path)
+    response.status = 400
+    response.add_header("Status-Code", "400")
+    return response
+
+@route("/static/400/<filename>", method="GET")
+def static_200(filename):
+    template_path = abspath(getcwd()) / Path("tests/mock_server/templates")
+    response = static_file(filename, root=template_path)
+    response.add_header("Status-Code", "200")
+    return response
+
+@route("/redirect/headers/<filename>")
+def redirect_to_static(filename):
+    redirect(f"/static/headers/{filename}")
+
+
 def start():
     run(host='localhost', port=8080)
\ No newline at end of file
diff --git a/tests/test_extractors.py b/tests/test_extractors.py
index 24b750ed..b467f0e1 100644
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -71,7 +71,15 @@ def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, d
     assert "> singlefile" not in output_str
     assert "> readability" not in output_str
 
-def test_headers(tmp_path, process, disable_extractors_dict):
+def test_headers_ignored(tmp_path, process, disable_extractors_dict):
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "headers.json"
+    assert not output_file.exists()
+
+def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
                                   capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
@@ -83,3 +91,25 @@ def test_headers(tmp_path, process, disable_extractors_dict):
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'
     assert headers['Content-Style-Type'] == 'text/css'
+
+def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/redirect/headers/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "headers.json"
+    with open(output_file) as f:
+        headers = pyjson.load(f)
+    assert headers['Content-Language'] == 'en'
+    assert headers['Content-Script-Type'] == 'text/javascript'
+    assert headers['Content-Style-Type'] == 'text/css'
+
+def test_headers_400_plus(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/400/example.com.html'],
+                                  capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "headers.json"
+    with open(output_file) as f:
+        headers = pyjson.load(f)
+    assert headers["Status-Code"] == "200"
\ No newline at end of file