
Merge pull request #535 from cdvv7788/extractors-flag

commit fdd4effc92
Author: Nick Sweeting, 2020-11-13 14:53:17 -05:00 (committed by GitHub)
3 changed files with 31 additions and 6 deletions
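
In practice, the new flag takes a comma-separated list of extractor names, e.g. archivebox add --extract "wget,title" https://example.com (the extractor names here are illustrative; per the help text added below, method names that are not recognized are simply ignored).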


@@ -62,10 +62,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Re-archive URLs from scratch, overwriting any existing files"
     )
     parser.add_argument(
-        '--init', #'-i',
+        "--init", #'-i',
         action='store_true',
         help="Init/upgrade the curent data directory before adding",
     )
+    parser.add_argument(
+        "--extract",
+        type=str,
+        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
+             This does not take precedence over the configuration",
+        default=""
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
     stdin_urls = accept_stdin(stdin)
@@ -83,6 +90,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         overwrite=command.overwrite,
         init=command.init,
         out_dir=pwd or OUTPUT_DIR,
+        extractors=command.extract,
     )
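
As a standalone illustration of what the parser change above produces, here is a minimal sketch (only the new option in isolation, not ArchiveBox's full parser): --extract defaults to an empty string and otherwise carries the raw comma-separated value through to add() as command.extract.

    import argparse

    # Minimal sketch: just the new option, not ArchiveBox's full parser.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--extract",
        type=str,
        default="",
        help="Comma-separated list of extractor names",
    )

    print(repr(parser.parse_args([]).extract))                           # '' (default)
    print(repr(parser.parse_args(["--extract", "wget,title"]).extract))  # 'wget,title'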


@@ -525,11 +525,14 @@ def add(urls: Union[str, List[str]],
         index_only: bool=False,
         overwrite: bool=False,
         init: bool=False,
-        out_dir: Path=OUTPUT_DIR) -> List[Link]:
+        out_dir: Path=OUTPUT_DIR,
+        extractors: str="") -> List[Link]:
     """Add a new URL or list of URLs to your archive"""

     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

+    extractors = extractors.split(",") if extractors else []
+
     if init:
         run_subcommand('init', stdin=None, pwd=out_dir)
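
The split on the new extractors argument is plain string handling; the following is a self-contained restatement of that one added line, with the edge cases it implies (the helper name is just for illustration):

    def parse_extractors(extractors: str) -> list:
        # Same expression as the line added above: an empty string means "no override".
        return extractors.split(",") if extractors else []

    assert parse_extractors("") == []
    assert parse_extractors("wget") == ["wget"]
    assert parse_extractors("wget,singlefile") == ["wget", "singlefile"]
    assert parse_extractors("wget,") == ["wget", ""]  # a stray comma yields an empty name
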
@@ -567,12 +570,17 @@ def add(urls: Union[str, List[str]],
         return all_links

     # Run the archive methods for each link
+    archive_kwargs = {
+        "out_dir": out_dir,
+    }
+    if extractors:
+        archive_kwargs["methods"] = extractors
     if update_all:
-        archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
+        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
     elif overwrite:
-        archive_links(imported_links, overwrite=True, out_dir=out_dir)
+        archive_links(imported_links, overwrite=True, **archive_kwargs)
     elif new_links:
-        archive_links(new_links, overwrite=False, out_dir=out_dir)
+        archive_links(new_links, overwrite=False, **archive_kwargs)

     return all_links
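
How archive_links() consumes the forwarded "methods" list is not part of this diff; the sketch below is only a guess at the general shape, with hypothetical names throughout, showing why passing a list of names is enough to restrict which extractor functions run:

    # Hypothetical sketch -- not ArchiveBox's actual archive_links() implementation.
    # It only illustrates how a "methods" list of names can select extractors.
    EXTRACTORS = {
        "title":      lambda link, out_dir: print("title      ->", link),
        "wget":       lambda link, out_dir: print("wget       ->", link),
        "singlefile": lambda link, out_dir: print("singlefile ->", link),
    }

    def archive_one(link, out_dir, methods=None):
        # No explicit list means "run every configured extractor".
        for name in (methods or EXTRACTORS):
            extractor = EXTRACTORS.get(name)
            if extractor is None:
                continue  # unknown names are skipped, matching the flag's help text
            extractor(link, out_dir)

    archive_one("https://example.com", "archive/", methods=["wget"])  # runs only wget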


@@ -82,3 +82,12 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
     assert output_json["history"] != {}
+
+def test_extract_input_uses_only_passed_extractors(tmp_path, process):
+    subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--extract", "wget"],
+                   capture_output=True)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+
+    assert (archived_item_path / "warc").exists()
+    assert not (archived_item_path / "singlefile.html").exists()
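
Assuming the repository's normal test setup (including whatever serves the http://127.0.0.1:8080 fixture content used above), the new test can be selected by name with pytest's filter, e.g. pytest -k test_extract_input_uses_only_passed_extractors.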