diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index ac6c85d6..c2c774c2 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -392,45 +392,50 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
     return snapshots.filter(q_filter)
 
 
-def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
+    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in links
     }
 
-def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
+    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
     }
 
-def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
+    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
     }
 
-def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(_snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that actually exist in the archive/ folder"""
+
     all_folders = {}
 
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
+    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+        if entry.is_dir():
             link = None
             try:
                 link = parse_json_link_details(entry.path)
             except Exception:
                 pass
-            all_folders[entry.path] = link
+            all_folders[entry.name] = link
 
     return all_folders
 
-def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
+    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)
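Note: the folder-status helpers above now accept a Snapshot QuerySet and build Link objects lazily via `.as_link()`, instead of receiving a pre-built list of links. A rough standalone sketch of that calling convention follows; `FakeSnapshot` and `FakeQuerySet` are illustrative stand-ins, not ArchiveBox or Django classes.

```python
# Illustrative sketch only: stand-ins for Django's QuerySet and core.models.Snapshot,
# showing the "convert rows to links inside the helper" pattern used above.
from dataclasses import dataclass
from typing import Dict, Iterable


@dataclass
class FakeSnapshot:
    url: str
    timestamp: str

    def as_link(self) -> dict:
        # the real Snapshot.as_link() returns an archivebox.index.schema.Link
        return {"url": self.url, "timestamp": self.timestamp}


class FakeQuerySet:
    def __init__(self, rows: Iterable[FakeSnapshot]):
        self._rows = list(rows)

    def iterator(self):
        # Django's QuerySet.iterator() streams rows instead of caching the whole result set
        return iter(self._rows)


def get_indexed_folders(snapshots) -> Dict[str, dict]:
    # same shape as the patched helper: build links from the queryset, then key the result
    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
    return {link["timestamp"]: link for link in links}


print(get_indexed_folders(FakeQuerySet([FakeSnapshot("https://example.com", "1600000000.0")])))
```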
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index 13bb7137..33c25c23 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -29,22 +29,28 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) ->
     with transaction.atomic():
         snapshots.delete()
 
+@enforce_types
+def write_link_to_sql_index(link: Link):
+    from core.models import Snapshot
+    info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
+    try:
+        info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
+    except Snapshot.DoesNotExist:
+        while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
+            info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+
+    return Snapshot.objects.update_or_create(url=link.url, defaults=info)[0]
+
+
 @enforce_types
 def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
-    from core.models import Snapshot
     from django.db import transaction
 
     with transaction.atomic():
         for link in links:
-            info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-            try:
-                info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
-            except Snapshot.DoesNotExist:
-                while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
-                    info["timestamp"] = str(float(info["timestamp"]) + 1.0)
-
-            Snapshot.objects.update_or_create(url=link.url, defaults=info)
+            write_link_to_sql_index(link)
+
 
 @enforce_types
 def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
@@ -53,7 +59,10 @@ def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
     from django.db import transaction
 
     with transaction.atomic():
-        snap = Snapshot.objects.get(url=link.url)
+        try:
+            snap = Snapshot.objects.get(url=link.url)
+        except Snapshot.DoesNotExist:
+            snap = write_link_to_sql_index(link)
         snap.title = link.title
         snap.tags = link.tags
         snap.save()
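Note: when the URL is not already in the index, the new `write_link_to_sql_index()` above bumps a colliding timestamp by 1.0 until it is unused, then hands the result to `update_or_create()`. A minimal sketch of just that dedup step, with a plain set standing in for `Snapshot.objects` so it runs without Django:

```python
# Stand-in for Snapshot.objects.filter(timestamp=...).exists() -- just a set lookup here.
existing_timestamps = {"1600000000.0", "1600000001.0"}

def dedupe_timestamp(timestamp: str) -> str:
    # mirror the while-loop in the patch: bump by 1.0 until the timestamp is unused
    while timestamp in existing_timestamps:
        timestamp = str(float(timestamp) + 1.0)
    return timestamp

assert dedupe_timestamp("1600000000.0") == "1600000002.0"   # collides twice, lands on a free slot
assert dedupe_timestamp("1234567890.0") == "1234567890.0"   # no collision, returned unchanged
```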
diff --git a/archivebox/main.py b/archivebox/main.py
index 6a7fa02a..6daebc37 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -659,24 +659,18 @@ def update(resume: Optional[float]=None,
     check_data_folder(out_dir=out_dir)
     check_dependencies()
+    new_links: List[Link] = [] # TODO: Remove input argument: only_new
 
-    # Step 1: Load list of links from the existing index
-    # merge in and dedupe new links from import_path
-    new_links: List[Link] = []
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-
-    # Step 2: Write updated index with deduped old and new links back to disk
-    # write_main_index(links=list(all_links), out_dir=out_dir)
-
-    # Step 3: Filter for selected_links
-    matching_links = list_links(
+    # Step 1: Filter for selected_links
+    matching_snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
         after=after,
     )
+
     matching_folders = list_folders(
-        links=list(matching_links),
+        links=matching_snapshots,
         status=status,
         out_dir=out_dir,
     )
 
@@ -685,7 +679,7 @@
     if index_only:
         return all_links
 
-    # Step 3: Run the archive methods for each link
+    # Step 2: Run the archive methods for each link
     to_archive = new_links if only_new else all_links
     if resume:
         to_archive = [
@@ -700,8 +694,8 @@
     archive_links(to_archive, overwrite=overwrite, out_dir=out_dir)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    all_links = load_main_index(out_dir=out_dir)
+    write_static_index([link.as_link() for link in all_links], out_dir=out_dir)
     return all_links
 
 @enforce_types
@@ -743,7 +737,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
     # snapshots = sorted(links, key=lambda link: getattr(link, sort))
 
     folders = list_folders(
-        links=[snapshot.as_link() for snapshot in snapshots],
+        links=snapshots,
         status=status,
         out_dir=out_dir,
     )
@@ -782,30 +776,23 @@ def list_folders(links: List[Link],
 
     check_data_folder(out_dir=out_dir)
 
-    if status == 'indexed':
-        return get_indexed_folders(links, out_dir=out_dir)
-    elif status == 'archived':
-        return get_archived_folders(links, out_dir=out_dir)
-    elif status == 'unarchived':
-        return get_unarchived_folders(links, out_dir=out_dir)
+    STATUS_FUNCTIONS = {
+        "indexed": get_indexed_folders,
+        "archived": get_archived_folders,
+        "unarchived": get_unarchived_folders,
+        "present": get_present_folders,
+        "valid": get_valid_folders,
+        "invalid": get_invalid_folders,
+        "duplicate": get_duplicate_folders,
+        "orphaned": get_orphaned_folders,
+        "corrupted": get_corrupted_folders,
+        "unrecognized": get_unrecognized_folders,
+    }
 
-    elif status == 'present':
-        return get_present_folders(links, out_dir=out_dir)
-    elif status == 'valid':
-        return get_valid_folders(links, out_dir=out_dir)
-    elif status == 'invalid':
-        return get_invalid_folders(links, out_dir=out_dir)
-
-    elif status == 'duplicate':
-        return get_duplicate_folders(links, out_dir=out_dir)
-    elif status == 'orphaned':
-        return get_orphaned_folders(links, out_dir=out_dir)
-    elif status == 'corrupted':
-        return get_corrupted_folders(links, out_dir=out_dir)
-    elif status == 'unrecognized':
-        return get_unrecognized_folders(links, out_dir=out_dir)
-
-    raise ValueError('Status not recognized.')
+    try:
+        return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
+    except KeyError:
+        raise ValueError('Status not recognized.')
 
 
 @enforce_types
diff --git a/tests/test_update.py b/tests/test_update.py
new file mode 100644
index 00000000..238a92d9
--- /dev/null
+++ b/tests/test_update.py
@@ -0,0 +1,27 @@
+import sqlite3
+
+from .fixtures import *
+
+def test_update_status_invalid(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
+
+    conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
+    c = conn.cursor()
+    link = c.execute("SELECT * FROM core_snapshot").fetchone()
+    conn.commit()
+    conn.close()
+
+    assert link is None
+
+    update_process = subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict)
+
+    conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
+    c = conn.cursor()
+    url = c.execute("SELECT * FROM core_snapshot").fetchone()[1]
+    conn.commit()
+    conn.close()
+
+    assert url == 'http://127.0.0.1:8080/static/example.com.html'
\ No newline at end of file
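Note on the `list_folders()` change in archivebox/main.py above: the if/elif chain becomes a dispatch table, and an unknown status still surfaces as the same ValueError. A minimal standalone sketch of that pattern follows; the two stand-in `get_*_folders` functions and their return values are illustrative only, not the real helpers.

```python
# Illustrative stand-ins for two of the real get_*_folders helpers.
def get_indexed_folders(links, out_dir):
    return {"status": "indexed", "count": len(links)}

def get_archived_folders(links, out_dir):
    return {"status": "archived", "count": len(links)}

# Same dispatch-table shape as the patched list_folders().
STATUS_FUNCTIONS = {
    "indexed": get_indexed_folders,
    "archived": get_archived_folders,
}

def list_folders(links, status, out_dir="."):
    try:
        # an unknown status raises KeyError, re-raised as the same ValueError as before
        return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
    except KeyError:
        raise ValueError('Status not recognized.')

print(list_folders([1, 2, 3], status="indexed"))   # -> {'status': 'indexed', 'count': 3}
```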