
feat: Add tests to refactored init command

Cristian 2020-08-20 16:47:02 -05:00 committed by Cristian Vargas
parent 404f333e17
commit be0dff8126
3 changed files with 81 additions and 12 deletions

View file

@@ -261,6 +261,11 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
     log_indexing_process_finished()
 
+def get_empty_snapshot_queryset(out_dir: str=OUTPUT_DIR):
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
+    return Snapshot.objects.none()
+
 @enforce_types
 def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
@@ -432,23 +437,19 @@ def get_duplicate_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optio
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
-    indexed_folders = set()
-    for snapshot in snapshots.iterator():
-        link = snapshot.as_link()
-        by_url[link.url] = 0
-        by_timestamp[link.timestamp] = 0
-        indexed_folders.update([link.link_dir])
-
     duplicate_folders = {}
+
     data_folders = (
-        entry.path
+        str(entry)
         for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir()
-            if entry.is_dir() and str(entry) not in indexed_folders
+            if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
-    for path in chain(sorted(indexed_folders), sorted(data_folders)):
+    for path in chain(snapshots.iterator(), data_folders):
         link = None
+        if type(path) is not str:
+            path = path.as_link().link_dir
         try:
             link = parse_json_link_details(path)
         except Exception:
@@ -464,7 +465,6 @@ def get_duplicate_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optio
             by_url[link.url] = by_url.get(link.url, 0) + 1
             if by_url[link.url] > 1:
                 duplicate_folders[path] = link
-
     return duplicate_folders
 
 def get_orphaned_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
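
For readers skimming the hunk above: the refactored get_duplicate_folders() no longer pre-builds an indexed_folders set; it walks the database snapshots and the leftover on-disk folders in a single pass and flags a folder the second time its URL or timestamp is seen. A minimal standalone sketch of that counting idea (the names below are illustrative only, not ArchiveBox APIs):

from typing import Dict, Iterable, Tuple

def find_duplicate_folders(entries: Iterable[Tuple[str, str, str]]) -> Dict[str, str]:
    """entries are (folder_path, url, timestamp) triples; return folders whose url or timestamp repeats."""
    by_url: Dict[str, int] = {}
    by_timestamp: Dict[str, int] = {}
    duplicates: Dict[str, str] = {}
    for path, url, timestamp in entries:
        # count every occurrence; anything seen more than once is a conflict
        by_timestamp[timestamp] = by_timestamp.get(timestamp, 0) + 1
        by_url[url] = by_url.get(url, 0) + 1
        if by_timestamp[timestamp] > 1 or by_url[url] > 1:
            duplicates[path] = url
    return duplicates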

View file

@@ -26,6 +26,7 @@ from .util import enforce_types # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     load_main_index,
+    get_empty_snapshot_queryset,
     parse_links_from_source,
     dedupe_links,
     write_main_index,
@@ -317,7 +318,7 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     print()
     print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
 
-    all_links: Dict[str, Link] = {}
+    all_links = get_empty_snapshot_queryset()
     pending_links: Dict[str, Link] = {}
 
     if existing_index:
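
The switch from all_links: Dict[str, Link] = {} to an empty Snapshot queryset keeps the rest of init() uniform: the same queryset API works whether or not an index exists yet. A hedged usage sketch (it assumes an initialized ArchiveBox data directory, since get_empty_snapshot_queryset() calls setup_django() with check_db=True):

from archivebox.index import get_empty_snapshot_queryset

all_links = get_empty_snapshot_queryset()   # an empty QuerySet, not a plain dict
print(all_links.count())                    # -> 0, same call works on a populated index
for snapshot in all_links.iterator():       # iterating an empty queryset is simply a no-op
    print(snapshot.as_link().url)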

View file

@@ -5,6 +5,7 @@ import os
 import subprocess
 from pathlib import Path
 import json
+import sqlite3
 
 from archivebox.config import OUTPUT_PERMISSIONS
@@ -63,4 +64,71 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
     for path in archived_item_path.iterdir():
         assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
 
+
+def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                   env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True,
+                   env=disable_extractors_dict)
+    archive_folders = [x.name for x in (tmp_path / "archive").iterdir()]
+    first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
+    json_index = str(first_archive / "index.json")
+    with open(json_index, "r") as f:
+        link_details = json.loads(f.read())
+    link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html"
+    with open(json_index, "w") as f:
+        json.dump(link_details, f)
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    # 1 from duplicated url, 1 from corrupted index
+    assert "Skipped adding 2 invalid link data directories" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
+
+
+def test_collision_timestamps_different_urls(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                   env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True,
+                   env=disable_extractors_dict)
+    archive_folders = [x.name for x in (tmp_path / "archive").iterdir()]
+    first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
+    archive_folders.remove(first_archive.name)
+    json_index = str(first_archive / "index.json")
+    with open(json_index, "r") as f:
+        link_details = json.loads(f.read())
+    link_details["timestamp"] = archive_folders[0]
+    with open(json_index, "w") as f:
+        json.dump(link_details, f)
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
+
+
+def test_orphaned_folders(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                   env=disable_extractors_dict)
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    c.execute("DELETE from core_snapshot")
+    conn.commit()
+    conn.close()
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert "Added 1 orphaned links from existing JSON index" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
+
+
+def test_unrecognized_folders(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                   env=disable_extractors_dict)
+    (tmp_path / "archive" / "some_random_folder").mkdir()
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
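
The new tests follow the existing suite's conventions: the process and disable_extractors_dict fixtures and the static fixture server on 127.0.0.1:8080 are provided by the suite's conftest. Assuming that setup and that this file sits with the rest of the tests under tests/, a hedged example of running only the new cases:

python -m pytest tests/ -k "collision or orphaned or unrecognized" -x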