diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 9d483362..aa8cae1b 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -102,6 +102,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional default=None, help='Update only URLs matching these filter patterns.' ) + parser.add_argument( + "--extract", + type=str, + help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \ + This does not take precedence over the configuration", + default="" + ) command = parser.parse_args(args or ()) filter_patterns_str = accept_stdin(stdin) @@ -117,6 +124,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional after=command.after, before=command.before, out_dir=pwd or OUTPUT_DIR, + extractors=command.extract, ) diff --git a/archivebox/main.py b/archivebox/main.py index 2d36e1f2..e744234f 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -680,6 +680,7 @@ def update(resume: Optional[float]=None, status: Optional[str]=None, after: Optional[str]=None, before: Optional[str]=None, + extractors: str="", out_dir: Path=OUTPUT_DIR) -> List[Link]: """Import any new links from subscriptions and retry any previously failed/skipped links""" @@ -687,6 +688,8 @@ def update(resume: Optional[float]=None, check_dependencies() new_links: List[Link] = [] # TODO: Remove input argument: only_new + extractors = extractors.split(",") if extractors else [] + # Step 1: Filter for selected_links matching_snapshots = list_links( filter_patterns=filter_patterns, @@ -717,7 +720,13 @@ def update(resume: Optional[float]=None, stderr(f'[√] Nothing found to resume after {resume}', color='green') return all_links - archive_links(to_archive, overwrite=overwrite, out_dir=out_dir) + archive_kwargs = { + "out_dir": out_dir, + } + if extractors: + archive_kwargs["methods"] = extractors + + archive_links(to_archive, overwrite=overwrite, **archive_kwargs) # Step 4: Re-write links index with updated titles, icons, and resources all_links = load_main_index(out_dir=out_dir)