From 23e6803f0252a06aaa197b97e3b6e3bae4cd29d7 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 17 Jul 2020 16:55:56 -0500 Subject: [PATCH] fix: Add change to calculate wget folder when there is a port present --- archivebox/extractors/wget.py | 4 +--- tests/mock_server/templates/example.com.html | 4 ++-- tests/test_args.py | 16 ++++++++-------- tests/test_extractors.py | 5 +++++ tests/test_init.py | 4 ++-- tests/test_util.py | 2 +- 6 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 tests/test_extractors.py diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 503c3bad..d7133dcb 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -136,7 +136,6 @@ def wget_output_path(link: Link) -> Optional[str]: See docs on wget --adjust-extension (-E) """ - if is_static_file(link.url): return without_scheme(without_fragment(link.url)) @@ -174,10 +173,9 @@ def wget_output_path(link: Link) -> Optional[str]: full_path = without_fragment(without_query(path(link.url))).strip('/') search_dir = os.path.join( link.link_dir, - domain(link.url), + domain(link.url).replace(":", "+"), urldecode(full_path), ) - for _ in range(4): if os.path.exists(search_dir): if os.path.isdir(search_dir): diff --git a/tests/mock_server/templates/example.com.html b/tests/mock_server/templates/example.com.html index 8492e858..8469956c 100644 --- a/tests/mock_server/templates/example.com.html +++ b/tests/mock_server/templates/example.com.html @@ -40,9 +40,9 @@

Example Domain

This domain is for use in illustrative examples in documents. You may use this - domain in literature without prior coordination or asking for permission.

+ domain in literature without prior coordination or asking for permission.

- More information... + More information...

diff --git a/tests/test_args.py b/tests/test_args.py index f52626fb..ed132524 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -4,25 +4,25 @@ import json from .fixtures import * def test_depth_flag_is_accepted(process): - arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=0"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True) assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8") def test_depth_flag_fails_if_it_is_not_0_or_1(process): - arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=5"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=5"], capture_output=True) assert 'invalid choice' in arg_process.stderr.decode("utf-8") - arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=-1"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=-1"], capture_output=True) assert 'invalid choice' in arg_process.stderr.decode("utf-8") def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process): - arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=0"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=0"], capture_output=True) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) - assert output_json["base_url"] == "localhost:8080/static/example.com.html" + assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html" def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process): - arg_process = subprocess.run(["archivebox", "add", "http://localhost:8080/static/example.com.html", "--depth=1"], capture_output=True) + arg_process = subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--depth=1"], capture_output=True) with open(tmp_path / "index.json", "r") as f: archive_file = f.read() - assert "http://localhost:8080/static/example.com.html" in archive_file - assert "http://localhost:8080/static/iana.org.html" in archive_file + assert "http://127.0.0.1:8080/static/example.com.html" in archive_file + assert "http://127.0.0.1:8080/static/iana.org.html" in archive_file diff --git a/tests/test_extractors.py b/tests/test_extractors.py new file mode 100644 index 00000000..203f6701 --- /dev/null +++ b/tests/test_extractors.py @@ -0,0 +1,5 @@ +from .fixtures import * + +def test_wget_broken_pipe(tmp_path, process): + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) + assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8") \ No newline at end of file diff --git a/tests/test_init.py b/tests/test_init.py index 24d3ed52..6a15612a 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -18,7 +18,7 @@ def test_update(tmp_path, process): def test_add_link(tmp_path, process): os.chdir(tmp_path) - add_process = subprocess.run(['archivebox', 'add', 'http://localhost:8080/static/example.com.html'], capture_output=True) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] assert "index.json" in [x.name for x in archived_item_path.iterdir()] @@ -34,7 +34,7 @@ def test_add_link(tmp_path, process): def test_add_link_support_stdin(tmp_path, process): os.chdir(tmp_path) stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - stdin_process.communicate(input="http://localhost:8080/static/example.com.html".encode()) + stdin_process.communicate(input="http://127.0.0.1:8080/static/example.com.html".encode()) archived_item_path = list(tmp_path.glob('archive/**/*'))[0] assert "index.json" in [x.name for x in archived_item_path.iterdir()] diff --git a/tests/test_util.py b/tests/test_util.py index 0a076344..e2ad8240 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,5 +1,5 @@ from archivebox import util def test_download_url_downloads_content(): - text = util.download_url("http://localhost:8080/static/example.com.html") + text = util.download_url("http://127.0.0.1:8080/static/example.com.html") assert "Example Domain" in text \ No newline at end of file