Merge pull request #356 from cdvv7788/depth-flag

commit 5b571aa166

7 changed files with 90 additions and 41 deletions
@@ -38,18 +38,38 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         type=str,
         default=None,
         help=(
-            'URL or path to local file containing a list of links to import. e.g.:\n'
+            'URL or path to local file to start the archiving process from. e.g.:\n'
             '    https://getpocket.com/users/USERNAME/feed/all\n'
             '    https://example.com/some/rss/feed.xml\n'
+            '    https://example.com\n'
             '    ~/Downloads/firefox_bookmarks_export.html\n'
             '    ~/Desktop/sites_list.csv\n'
         )
     )
+    parser.add_argument(
+        "--depth",
+        action="store",
+        default=0,
+        choices=[0,1],
+        type=int,
+        help="Recursively archive all linked pages up to this many hops away"
+    )
     command = parser.parse_args(args or ())
-    import_str = accept_stdin(stdin)
+    import_string = accept_stdin(stdin)
+    if import_string and command.import_path:
+        stderr(
+            '[X] You should pass an import path or a page url as an argument or in stdin but not both\n',
+            color='red',
+        )
+        raise SystemExit(2)
+    elif import_string:
+        import_path = import_string
+    else:
+        import_path = command.import_path
+
     add(
-        import_str=import_str,
-        import_path=command.import_path,
+        url=import_path,
+        depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,
         out_dir=pwd or OUTPUT_DIR,
@@ -63,12 +83,6 @@ if __name__ == '__main__':
 # TODO: Implement these
 #
 # parser.add_argument(
-#     '--depth', #'-d',
-#     type=int,
-#     help='Recursively archive all linked pages up to this many hops away',
-#     default=0,
-# )
-# parser.add_argument(
 #     '--mirror', #'-m',
 #     action='store_true',
 #     help='Archive an entire site (finding all linked pages below it on the same domain)',
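Taken together, the two hunks above give the CLI a single url/path argument plus a constrained --depth option. Representative invocations, copied from the new tests in tests/test_args.py further down (the stdin case comes from the updated existing test file):

    archivebox add https://example.com --depth=0     # archive only the given page (the default)
    archivebox add https://example.com --depth=1     # also archive every page it links to
    archivebox add https://example.com --depth=5     # rejected by argparse: invalid choice, only 0 or 1
    echo 'http://example.com' | archivebox add       # the URL may come from stdin instead of an argument, but not both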
@@ -66,12 +66,10 @@ class AddLinks(View):
         if form.is_valid():
             url = form.cleaned_data["url"]
             print(f'[+] Adding URL: {url}')
-            if form.cleaned_data["source"] == "url":
-                key = "import_str"
-            else:
-                key = "import_path"
+            depth = 0 if form.cleaned_data["source"] == "url" else 1
             input_kwargs = {
-                key: url,
+                "url": url,
+                "depth": depth,
                 "update_all": False,
                 "out_dir": OUTPUT_DIR,
             }
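The view change mirrors the new add() signature introduced in the next hunk: instead of choosing between an import_str and an import_path keyword, the form's source field only decides the crawl depth, and the URL is always passed as url. A minimal sketch of the equivalent direct call, assuming the unchanged context below this hunk still forwards the kwargs (e.g. via add(**input_kwargs), which is not visible in the diff):

    # sketch only; the actual forwarding line sits outside the visible hunk
    add(url=url, depth=depth, update_all=False, out_dir=OUTPUT_DIR)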
@@ -496,8 +496,8 @@ def status(out_dir: str=OUTPUT_DIR) -> None:


 @enforce_types
-def add(import_str: Optional[str]=None,
-        import_path: Optional[str]=None,
+def add(url: str,
+        depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
         out_dir: str=OUTPUT_DIR) -> List[Link]:
@@ -505,18 +505,9 @@ def add(import_str: Optional[str]=None,

     check_data_folder(out_dir=out_dir)

-    if (import_str and import_path) or (not import_str and not import_path):
-        stderr(
-            '[X] You should pass either an import path as an argument, '
-            'or pass a list of links via stdin, but not both.\n',
-            color='red',
-        )
-        raise SystemExit(2)
-    elif import_str:
-        import_path = save_stdin_to_sources(import_str, out_dir=out_dir)
-    elif import_path:
-        import_path = save_file_to_sources(import_path, out_dir=out_dir)
-
+    base_path = save_stdin_to_sources(url, out_dir=out_dir)
+    if depth == 1:
+        depth_path = save_file_to_sources(url, out_dir=out_dir)
     check_dependencies()

     # Step 1: Load list of links from the existing index
@@ -524,8 +515,11 @@ def add(import_str: Optional[str]=None,
     all_links: List[Link] = []
     new_links: List[Link] = []
     all_links = load_main_index(out_dir=out_dir)
-    if import_path:
-        all_links, new_links = import_new_links(all_links, import_path, out_dir=out_dir)
+    all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir)
+    if depth == 1:
+        all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir)
+        new_links = new_links + new_links_depth
+

     # Step 2: Write updated index with deduped old and new links back to disk
     write_main_index(links=all_links, out_dir=out_dir)
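Read together, the two hunks above change add() from a "stdin or import file, pick one" model to "always one url (from the argument or stdin), optionally crawled one hop deeper". A consolidated outline of the new flow, assembled only from the lines visible in this diff (the docstring and other unchanged context are elided, so this is a sketch rather than runnable code):

    def add(url: str,
            depth: int=0,
            update_all: bool=not ONLY_NEW,
            index_only: bool=False,
            out_dir: str=OUTPUT_DIR) -> List[Link]:
        check_data_folder(out_dir=out_dir)

        # the url argument (already populated from stdin by the CLI when needed) is saved as a source
        base_path = save_stdin_to_sources(url, out_dir=out_dir)
        if depth == 1:
            # at depth 1 the page behind the url is also saved, so its outgoing links can be imported below
            depth_path = save_file_to_sources(url, out_dir=out_dir)
        check_dependencies()

        all_links = load_main_index(out_dir=out_dir)
        all_links, new_links = import_new_links(all_links, base_path, out_dir=out_dir)
        if depth == 1:
            all_links, new_links_depth = import_new_links(all_links, depth_path, out_dir=out_dir)
            new_links = new_links + new_links_depth

        write_main_index(links=all_links, out_dir=out_dir)
        ...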
tests/__init__.py (new file, 0 lines)
tests/fixtures.py (new file, 10 lines)
@@ -0,0 +1,10 @@
+import os
+import subprocess
+
+import pytest
+
+@pytest.fixture
+def process(tmp_path):
+    os.chdir(tmp_path)
+    process = subprocess.run(['archivebox', 'init'], capture_output=True)
+    return process
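Note that the fixture runs archivebox init inside a throwaway tmp_path and, because it uses capture_output=True, the returned CompletedProcess exposes stdout/stderr as bytes; that is why the tests below call .decode("utf-8") before asserting on the text. A minimal (hypothetical) test consuming the fixture would look like:

    def test_init_output(process):
        # process is the CompletedProcess from `archivebox init` in a fresh tmp dir
        assert b'ArchiveBox' in process.stdout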
tests/test_args.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+import subprocess
+import json
+
+from .fixtures import *
+
+def test_depth_flag_is_accepted(process):
+    arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True)
+    assert 'unrecognized arguments: --depth' not in arg_process.stderr.decode("utf-8")
+
+def test_depth_flag_fails_if_it_is_not_0_or_1(process):
+    arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=5"], capture_output=True)
+    assert 'invalid choice' in arg_process.stderr.decode("utf-8")
+    arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=-1"], capture_output=True)
+    assert 'invalid choice' in arg_process.stderr.decode("utf-8")
+
+def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process):
+    arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=0"], capture_output=True)
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+    with open(archived_item_path / "index.json", "r") as f:
+        output_json = json.load(f)
+    assert output_json["base_url"] == "example.com"
+
+def test_depth_flag_1_crawls_the_page_AND_links(tmp_path, process):
+    arg_process = subprocess.run(["archivebox", "add", "https://example.com", "--depth=1"], capture_output=True)
+    with open(tmp_path / "index.json", "r") as f:
+        archive_file = f.read()
+    assert "https://example.com" in archive_file
+    assert "https://www.iana.org/domains/example" in archive_file
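A quick note on running these: the process fixture shells out to the real archivebox command and the depth tests fetch https://example.com, so they assume archivebox is installed on PATH and the network is reachable. With that in place, something like this should do it:

    pytest tests/test_args.py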
@@ -6,14 +6,7 @@ import subprocess
 from pathlib import Path
 import json

-import pytest
-
-@pytest.fixture
-def process(tmp_path):
-    os.chdir(tmp_path)
-    process = subprocess.run(['archivebox', 'init'], capture_output=True)
-    return process
-
+from .fixtures import *

 def test_init(tmp_path, process):
     assert "Initializing a new ArchiveBox collection in this folder..." in process.stdout.decode("utf-8")
@@ -32,9 +25,21 @@ def test_add_link(tmp_path, process):

     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
-    assert "IANA — IANA-managed Reserved Domains" == output_json['history']['title'][0]['output']
+    assert "Example Domain" == output_json['history']['title'][0]['output']

     with open(tmp_path / "index.html", "r") as f:
         output_html = f.read()
-    assert "IANA — IANA-managed Reserved Domains" in output_html
+    assert "Example Domain" in output_html

+def test_add_link_support_stdin(tmp_path, process):
+    os.chdir(tmp_path)
+    stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    stdin_process.communicate(input="http://example.com".encode())
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+
+    assert "index.json" in [x.name for x in archived_item_path.iterdir()]
+
+    with open(archived_item_path / "index.json", "r") as f:
+        output_json = json.load(f)
+    assert "Example Domain" == output_json['history']['title'][0]['output']
+