diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
index 529dad80..140810a6 100644
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py
index 8fe717fb..cb073e95 100644
--- a/archivebox/cli/archivebox_remove.py
+++ b/archivebox/cli/archivebox_remove.py
@@ -50,7 +50,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     parser.add_argument(
         '--filter-type',
         type=str,
-        choices=('exact', 'substring', 'domain', 'regex'),
+        choices=('exact', 'substring', 'domain', 'regex', 'tag'),
         default='exact',
         help='Type of pattern matching to use when filtering URLs',
     )
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index ee4bf411..890777c8 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -361,6 +361,7 @@ LINK_FILTERS = {
     'substring': lambda pattern: Q(url__icontains=pattern),
     'regex': lambda pattern: Q(url__iregex=pattern),
     'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
+    'tag': lambda pattern: Q(tags__name=pattern),
 }
 
 @enforce_types
diff --git a/tests/test_remove.py b/tests/test_remove.py
index 0fb16e2a..c9c63385 100644
--- a/tests/test_remove.py
+++ b/tests/test_remove.py
@@ -70,6 +70,29 @@ def test_remove_domain(tmp_path, process, disable_extractors_dict):
 
     assert count == 0
 
+
+def test_remove_tag(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    c.execute("INSERT INTO core_tag (id, name, slug) VALUES (2, 'test-tag', 'test-tag')")
+    snapshot_ids = c.execute("SELECT id from core_snapshot")
+    c.executemany('INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, 2)', list(snapshot_ids))
+    conn.commit()
+
+    remove_process = subprocess.run(['archivebox', 'remove', '--filter-type=tag', 'test-tag', '--yes', '--delete'], capture_output=True)
+
+    assert len(list((tmp_path / "archive").iterdir())) == 0
+
+    count = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
+    conn.commit()
+    conn.close()
+
+    assert count == 0
+
 def test_remove_before(tmp_path, process, disable_extractors_dict):
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict)