diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index a4acef0b..15968097 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -102,7 +102,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s if method_name not in link.history: link.history[method_name] = [] - if should_run(link, out_dir) or overwrite: + if should_run(link, out_dir, overwrite): log_archive_method_started(method_name) result = method_function(link=link, out_dir=out_dir) diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index f5598d6f..1f382190 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -25,12 +25,12 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None) -> bool: - out_dir = out_dir or Path(link.link_dir) +def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False - if (out_dir / "archive.org.txt").exists(): + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'archive.org.txt').exists(): # if open(path, 'r').read().strip() != 'None': return False diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index babbe71c..ec2df073 100644 --- a/archivebox/extractors/dom.py +++ b/archivebox/extractors/dom.py @@ -20,16 +20,16 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_dom(link: Link, out_dir: Optional[Path]=None) -> bool: - out_dir = out_dir or Path(link.link_dir) +def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False - - if (out_dir / 'output.html').exists(): + + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'output.html').exists(): return False return SAVE_DOM - + @enforce_types def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: """print HTML of site to file using chrome --dump-html""" diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 5e7c1fb0..3a4aeea7 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -20,13 +20,13 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool: - out_dir = out_dir or link.link_dir - if (Path(out_dir) / 'favicon.ico').exists(): +def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'favicon.ico').exists(): return False return SAVE_FAVICON - + @enforce_types def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: """download site favicon from google's favicon api""" diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py index fd20d4b6..efef37c2 100644 --- a/archivebox/extractors/git.py +++ b/archivebox/extractors/git.py @@ -28,12 +28,12 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_git(link: Link, out_dir: Optional[Path]=None) -> bool: - out_dir = out_dir or link.link_dir +def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False - if (out_dir / "git").exists(): + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'git').exists(): return False is_clonable_url = ( diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 4e69dec1..91dcb8e3 100644 --- a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -22,11 +22,12 @@ from ..config import ( from ..logging_util import TimedProgress @enforce_types -def should_save_headers(link: Link, out_dir: Optional[str]=None) -> bool: - out_dir = out_dir or link.link_dir +def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'headers.json').exists(): + return False - output = Path(out_dir or link.link_dir) / 'headers.json' - return not output.exists() and SAVE_HEADERS + return SAVE_HEADERS @enforce_types diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 3792fd2a..1c0a21ba 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -21,13 +21,12 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_media(link: Link, out_dir: Optional[Path]=None) -> bool: - out_dir = out_dir or link.link_dir - +def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False - if (out_dir / "media").exists(): + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'media').exists(): return False return SAVE_MEDIA diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index 07c02420..d9e32c0a 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -37,13 +37,15 @@ def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> Archi @enforce_types -def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool: - out_dir = out_dir or link.link_dir +def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False - output = Path(out_dir or link.link_dir) / 'mercury' - return SAVE_MERCURY and MERCURY_VERSION and (not output.exists()) + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'mercury').exists(): + return False + + return SAVE_MERCURY @enforce_types diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index 1b0201e3..7138206c 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -19,12 +19,12 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_pdf(link: Link, out_dir: Optional[Path]=None) -> bool: - out_dir = out_dir or Path(link.link_dir) +def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False - - if (out_dir / "output.pdf").exists(): + + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'output.pdf').exists(): return False return SAVE_PDF diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 9da620b4..6e48cd9a 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -46,13 +46,15 @@ def get_html(link: Link, path: Path) -> str: return document @enforce_types -def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool: - out_dir = out_dir or link.link_dir +def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False - output = Path(out_dir or link.link_dir) / 'readability' - return SAVE_READABILITY and READABILITY_VERSION and (not output.exists()) + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'readability').exists(): + return False + + return SAVE_READABILITY @enforce_types diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index 325584eb..cc748bf6 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -20,12 +20,12 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_screenshot(link: Link, out_dir: Optional[Path]=None) -> bool: - out_dir = out_dir or Path(link.link_dir) +def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False - - if (out_dir / "screenshot.png").exists(): + + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'screenshot.png').exists(): return False return SAVE_SCREENSHOT diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 2e5c3896..8d9b36be 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -23,13 +23,15 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_singlefile(link: Link, out_dir: Optional[Path]=None) -> bool: - out_dir = out_dir or Path(link.link_dir) +def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: if is_static_file(link.url): return False - output = out_dir / 'singlefile.html' - return SAVE_SINGLEFILE and SINGLEFILE_VERSION and (not output.exists()) + out_dir = out_dir or Path(link.link_dir) + if not overwrite and (out_dir / 'singlefile.html').exists(): + return False + + return SAVE_SINGLEFILE @enforce_types diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 28cb128f..816c0484 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -61,12 +61,12 @@ class TitleParser(HTMLParser): @enforce_types -def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool: - # if link already has valid title, skip it - if link.title and not link.title.lower().startswith('http'): +def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: + if is_static_file(link.url): return False - if is_static_file(link.url): + # if link already has valid title, skip it + if not overwrite and link.title and not link.title.lower().startswith('http'): return False return SAVE_TITLE diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index ec252123..ee8744b2 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -36,10 +36,10 @@ from ..logging_util import TimedProgress @enforce_types -def should_save_wget(link: Link, out_dir: Optional[Path]=None) -> bool: +def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: output_path = wget_output_path(link) out_dir = out_dir or Path(link.link_dir) - if output_path and (out_dir / output_path).exists(): + if not overwrite and output_path and (out_dir / output_path).exists(): return False return SAVE_WGET