diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 567e1bf3..d10d3ab1 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -9,7 +9,6 @@ from ..index.schema import Link
 from ..index import (
     load_link_details,
     write_link_details,
-    write_main_index,
 )
 from ..util import enforce_types
 from ..logging_util import (
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 5b3803ea..ac6c85d6 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -1,6 +1,5 @@
 __package__ = 'archivebox.index'
 
-import re
 import os
 import shutil
 import json as pyjson
@@ -373,7 +372,7 @@ LINK_FILTERS = {
     'exact': lambda pattern: Q(url=pattern),
     'substring': lambda pattern: Q(url__icontains=pattern),
     'regex': lambda pattern: Q(url__iregex=pattern),
-    'domain': lambda pattern: Q(domain=pattern),
+    'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
 }
 
 @enforce_types
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index eed92697..13bb7137 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -24,7 +24,6 @@ def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
 @enforce_types
 def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
-    from core.models import Snapshot
     from django.db import transaction
 
     with transaction.atomic():
diff --git a/tests/test_remove.py b/tests/test_remove.py
index d26c96bb..fced2da3 100644
--- a/tests/test_remove.py
+++ b/tests/test_remove.py
@@ -1,8 +1,71 @@
+import os
+import sqlite3
+
 from .fixtures import *
 
-def test_remove_leaves_index_in_consistent_state(tmp_path, process, disable_extractors_dict):
+def test_remove_single_page(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
-    remove_process = subprocess.run(['archivebox', 'remove', '127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
-    list_process = subprocess.run(['archivebox', 'list'], capture_output=True)
-    assert "Warning: SQL index does not match JSON index!" not in list_process.stderr.decode("utf-8")
\ No newline at end of file
+    remove_process = subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
+    assert "Found 1 matching URLs to remove" in remove_process.stdout.decode("utf-8")
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    count = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
+    conn.commit()
+    conn.close()
+
+    assert count == 0
+
+
+def test_remove_single_page_filesystem(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes', '--delete'], capture_output=True)
+
+    assert list((tmp_path / "archive").iterdir()) == []
+
+def test_remove_regex(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete'], capture_output=True)
+
+    assert list((tmp_path / "archive").iterdir()) == []
+
+def test_remove_exact(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    remove_process = subprocess.run(['archivebox', 'remove', '--filter-type=exact', 'http://127.0.0.1:8080/static/iana.org.html', '--yes', '--delete'], capture_output=True)
+
+    assert len(list((tmp_path / "archive").iterdir())) == 1
+
+def test_remove_substr(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    subprocess.run(['archivebox', 'remove', '--filter-type=substring', 'example.com', '--yes', '--delete'], capture_output=True)
+
+    assert len(list((tmp_path / "archive").iterdir())) == 1
+
+def test_remove_domain(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    remove_process = subprocess.run(['archivebox', 'remove', '--filter-type=domain', '127.0.0.1', '--yes', '--delete'], capture_output=True)
+
+    assert len(list((tmp_path / "archive").iterdir())) == 0
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    count = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
+    conn.commit()
+    conn.close()
+
+    assert count == 0
\ No newline at end of file