feat: Add extract flag to add command
parent 4372cb6eec
commit 44eede96e5
3 changed files with 28 additions and 6 deletions
@@ -62,10 +62,16 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Re-archive URLs from scratch, overwriting any existing files"
     )
     parser.add_argument(
-        '--init', #'-i',
+        "--init", #'-i',
         action='store_true',
         help="Init/upgrade the curent data directory before adding",
     )
+    parser.add_argument(
+        "--extract",
+        nargs="+",
+        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
+              This does not take precedence over the configuration"
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
     stdin_urls = accept_stdin(stdin)
@@ -83,6 +89,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         overwrite=command.overwrite,
         init=command.init,
         out_dir=pwd or OUTPUT_DIR,
+        extractors = command.extract or [],
     )
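For reference, the new option relies on argparse's nargs="+": extractor names are given space-separated after --extract and collected into a list, and when the flag is omitted the parsed attribute is None, which the `command.extract or []` call above normalizes to an empty list. A minimal, standalone sketch of just that behavior (only this one option is reproduced; the rest of the real parser is left out):

import argparse

# Reproduce only the new --extract option in isolation.
parser = argparse.ArgumentParser(prog="archivebox add")
parser.add_argument(
    "--extract",
    nargs="+",
    help="Pass a list of the extractors to be used.",
)

print(parser.parse_args(["--extract", "wget", "singlefile"]).extract)  # ['wget', 'singlefile']
print(parser.parse_args([]).extract)  # None -> `command.extract or []` becomes []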
@@ -525,7 +525,8 @@ def add(urls: Union[str, List[str]],
         index_only: bool=False,
         overwrite: bool=False,
         init: bool=False,
-        out_dir: Path=OUTPUT_DIR) -> List[Link]:
+        out_dir: Path=OUTPUT_DIR,
+        extractors: list=[]) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""

     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
@@ -567,12 +568,17 @@ def add(urls: Union[str, List[str]],
         return all_links

     # Run the archive methods for each link
+    archive_kwargs = {
+        "out_dir": out_dir,
+    }
+    if extractors:
+        archive_kwargs["methods"] = extractors
     if update_all:
-        archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
+        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
     elif overwrite:
-        archive_links(imported_links, overwrite=True, out_dir=out_dir)
+        archive_links(imported_links, overwrite=True, **archive_kwargs)
     elif new_links:
-        archive_links(new_links, overwrite=False, out_dir=out_dir)
+        archive_links(new_links, overwrite=False, **archive_kwargs)

     return all_links
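The archive_kwargs dict above makes `methods` an optional forward: it is only passed to archive_links() when the user supplied --extract, so archive_links keeps its own default extractor set otherwise. A runnable sketch of that routing with a stand-in archive_links (the real function is not part of this diff; the stand-in's signature and the default list below are assumptions made only for illustration):

# Stand-in for archive_links(), just to make the kwargs routing observable.
def archive_links(links, overwrite=False, methods=None, out_dir="."):
    chosen = methods if methods is not None else ["favicon", "wget", "singlefile"]  # hypothetical default set
    print(f"archiving {len(links)} link(s) with {chosen} into {out_dir!r} (overwrite={overwrite})")

extractors = ["wget"]                  # e.g. the result of `command.extract or []`
archive_kwargs = {"out_dir": "/data"}
if extractors:                         # only include `methods` when --extract was given
    archive_kwargs["methods"] = extractors

archive_links(["https://example.com"], overwrite=False, **archive_kwargs)
# -> archiving 1 link(s) with ['wget'] into '/data' (overwrite=False)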
@@ -82,3 +82,12 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
     assert output_json["history"] != {}
+
+def test_extract_input_uses_only_passed_extractors(tmp_path, process):
+    subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--extract", "wget"],
+                   capture_output=True)
+
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+
+    assert (archived_item_path / "warc").exists()
+    assert not (archived_item_path / "singlefile.html").exists()
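The test exercises the flag through the CLI; the same behavior is also reachable in Python through the new extractors parameter on add(). A hypothetical sketch, assuming add() is importable from archivebox.main (the import path is not shown in this diff) and that the current working directory is an already-initialized archive:

from archivebox.main import add  # assumed import path for the add() changed above

# Python equivalent of `archivebox add --extract wget <url>`:
add(
    ["http://127.0.0.1:8080/static/example.com.html"],
    extractors=["wget"],  # per the test, only wget output (warc/) should appear, not singlefile.html
)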