Patch summary: move the default extractor list into get_default_archive_methods(),
add ignore_methods(), and have `oneshot` pass ignore_methods(['title']) to
archive_link() as `methods`.

Review notes:
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of this patch. Blank context lines were placed so every hunk matches
    its @@ counts; whitespace inside context/removed lines is a best guess --
    verify against the target tree (or apply with whitespace tolerance).
  * The `index` blob ids are those of the original patch and do not reflect
    the edits made to added lines here (docstrings, comprehension in
    ignore_methods, stronger test assertion, `not in`, final newlines).

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 99a79278..284ce569 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -32,22 +32,40 @@ from .git import should_save_git, save_git
 from .media import should_save_media, save_media
 from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 
+def get_default_archive_methods():
+    """Return the default archive methods as (name, should_save_func, save_func) tuples."""
+    return [
+        ('title', should_save_title, save_title),
+        ('favicon', should_save_favicon, save_favicon),
+        ('wget', should_save_wget, save_wget),
+        ('pdf', should_save_pdf, save_pdf),
+        ('screenshot', should_save_screenshot, save_screenshot),
+        ('dom', should_save_dom, save_dom),
+        ('git', should_save_git, save_git),
+        ('media', should_save_media, save_media),
+        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
+    ]
+
+@enforce_types
+def ignore_methods(to_ignore: List[str]):
+    """Return the should_save functions of the default methods not named in to_ignore.
+
+    NOTE(review): archive_link() annotates `methods` as Iterable[str], while
+    this returns callables (element 1 of each tuple) -- confirm that
+    archive_link() filters on the same tuple element.
+    """
+    return [
+        should_save
+        for name, should_save, _save in get_default_archive_methods()
+        if name not in to_ignore
+    ]
 
 @enforce_types
 def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None, skip_index: bool=False) -> Link:
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
-    ARCHIVE_METHODS = [
-        ('title', should_save_title, save_title),
-        ('favicon', should_save_favicon, save_favicon),
-        ('wget', should_save_wget, save_wget),
-        ('pdf', should_save_pdf, save_pdf),
-        ('screenshot', should_save_screenshot, save_screenshot),
-        ('dom', should_save_dom, save_dom),
-        ('git', should_save_git, save_git),
-        ('media', should_save_media, save_media),
-        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
-    ]
+    ARCHIVE_METHODS = get_default_archive_methods()
+
     if methods is not None:
         ARCHIVE_METHODS = [
             method for method in ARCHIVE_METHODS
diff --git a/archivebox/main.py b/archivebox/main.py
index 8a4c4971..cd49d68b 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -52,7 +52,7 @@ from .index.sql import (
     remove_from_sql_main_index,
 )
 from .index.html import parse_html_main_index
-from .extractors import archive_links, archive_link
+from .extractors import archive_links, archive_link, ignore_methods
 from .config import (
     stderr,
     ConfigDict,
@@ -503,7 +503,8 @@ def oneshot(url: str, out_dir: str=OUTPUT_DIR):
             color='red'
         )
         raise SystemExit(2)
-    archive_link(oneshot_link[0], out_dir=out_dir, skip_index=True)
+    methods = ignore_methods(['title'])
+    archive_link(oneshot_link[0], out_dir=out_dir, methods=methods, skip_index=True)
     return oneshot_link
 
 @enforce_types
diff --git a/tests/test_extractors.py b/tests/test_extractors.py
index 203f6701..c7aaaeaf 100644
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -1,5 +1,14 @@
 from .fixtures import *
+from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title
 
 def test_wget_broken_pipe(tmp_path, process):
     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True)
-    assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")
\ No newline at end of file
+    assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")
+
+def test_ignore_methods():
+    """
+    ignore_methods leaves out the should_save function of each named method and keeps the rest
+    """
+    ignored = ignore_methods(['title'])
+    assert should_save_title not in ignored
+    assert len(ignored) == len(get_default_archive_methods()) - 1
diff --git a/tests/test_oneshot.py b/tests/test_oneshot.py
index 5d53a942..7ff9867f 100644
--- a/tests/test_oneshot.py
+++ b/tests/test_oneshot.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 from .fixtures import *
 
 def test_oneshot_command_exists(tmp_path):
@@ -8,5 +10,7 @@ def test_oneshot_command_exists(tmp_path):
 def test_oneshot_commad_saves_page_in_right_folder(tmp_path):
     process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], capture_output=True)
     items = ' '.join([str(x) for x in tmp_path.iterdir()])
+    current_path = ' '.join([str(x) for x in Path.cwd().iterdir()])
 
     assert "index.json" in items
+    assert "index.sqlite3" not in current_path