diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index ed31c6a0..41679d43 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -50,7 +50,7 @@ def verify_snapshots(modeladmin, request, queryset):
 verify_snapshots.short_description = "Check"

 def delete_snapshots(modeladmin, request, queryset):
-    remove(links=[snapshot.as_link() for snapshot in queryset], yes=True, delete=True, out_dir=OUTPUT_DIR)
+    remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)

 delete_snapshots.short_description = "Delete"

diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 82c07007..5b3803ea 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -11,7 +11,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
 from collections import OrderedDict
 from contextlib import contextmanager
 from urllib.parse import urlparse
-from django.db.models import QuerySet
+from django.db.models import QuerySet, Q

 from ..util import (
     scheme,
@@ -370,19 +370,19 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:


 LINK_FILTERS = {
-    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
-    'substring': lambda link, pattern: pattern in link.url,
-    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
-    'domain': lambda link, pattern: link.domain == pattern,
+    'exact': lambda pattern: Q(url=pattern),
+    'substring': lambda pattern: Q(url__icontains=pattern),
+    'regex': lambda pattern: Q(url__iregex=pattern),
+    'domain': lambda pattern: Q(domain=pattern),
 }

 @enforce_types
-def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
+def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
+    q_filter = Q()
     for pattern in filter_patterns:
         try:
-            if LINK_FILTERS[filter_type](link, pattern):
-                return True
-        except Exception:
+            q_filter = q_filter | LINK_FILTERS[filter_type](pattern)
+        except KeyError:
             stderr()
             stderr(
                 f'[X] Got invalid pattern for --filter-type={filter_type}:',
@@ -390,8 +390,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
             )
             stderr(f'    {pattern}')
             raise SystemExit(2)
-
-    return False
+    return snapshots.filter(q_filter)


 def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index 232de407..eed92697 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -2,6 +2,7 @@ __package__ = 'archivebox.index'

 from io import StringIO
 from typing import List, Tuple, Iterator
+from django.db.models import QuerySet

 from .schema import Link
 from ..util import enforce_types
@@ -21,14 +22,13 @@ def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
     )

 @enforce_types
-def remove_from_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
+def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
     from core.models import Snapshot
     from django.db import transaction

     with transaction.atomic():
-        for link in links:
-            Snapshot.objects.filter(url=link.url).delete()
+        snapshots.delete()

 @enforce_types
 def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
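The index/__init__.py hunk above is the core of the refactor: each filter type now builds a Django Q object instead of testing one Link at a time in Python, and snapshot_filter ORs the patterns together so the database evaluates the whole list in a single WHERE clause. A minimal standalone sketch of that composition (the patterns here are made up; Q and the url field are real):

    from django.db.models import Q

    patterns = ['https://example.com/a', 'https://example.com/b']

    q_filter = Q()                  # an empty Q() is the identity for |
    for pattern in patterns:
        q_filter |= Q(url=pattern)  # the 'exact' filter type

    # snapshots.filter(q_filter)
    # -> SELECT ... WHERE url = '.../a' OR url = '.../b'

Note the behavior changes this encodes: 'exact' no longer also matches link.base_url, 'substring' becomes case-insensitive (icontains), and 'regex' goes from an anchored re.match to an unanchored, case-insensitive iregex.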
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index 5a7cab20..70a62919 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -408,19 +408,18 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
     except (KeyboardInterrupt, EOFError, AssertionError):
         raise SystemExit(0)

-def log_removal_finished(all_links: int, to_keep: int):
+def log_removal_finished(all_links: int, to_remove: int):
     if all_links == 0:
         print()
         print('{red}[X] No matching links found.{reset}'.format(**ANSI))
     else:
-        num_removed = all_links - to_keep
         print()
         print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
-            num_removed,
+            to_remove,
             all_links,
             **ANSI,
         ))
-        print('    Index now contains {} links.'.format(to_keep))
+        print('    Index now contains {} links.'.format(all_links - to_remove))


 def log_shell_welcome_msg():
diff --git a/archivebox/main.py b/archivebox/main.py
index 6f34f91d..6a7fa02a 100644
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -7,6 +7,7 @@ from pathlib import Path
 from typing import Dict, List, Optional, Iterable, IO, Union

 from crontab import CronTab, CronSlices
+from django.db.models import QuerySet

 from .cli import (
     list_subcommands,
@@ -31,7 +32,7 @@ from .index import (
     dedupe_links,
     write_main_index,
     write_static_index,
-    link_matches_filter,
+    snapshot_filter,
     get_indexed_folders,
     get_archived_folders,
     get_unarchived_folders,
@@ -567,7 +568,7 @@ def add(urls: Union[str, List[str]],
 def remove(filter_str: Optional[str]=None,
            filter_patterns: Optional[List[str]]=None,
            filter_type: str='exact',
-           links: Optional[List[Link]]=None,
+           snapshots: Optional[QuerySet]=None,
            after: Optional[float]=None,
            before: Optional[float]=None,
            yes: bool=False,
@@ -577,7 +578,7 @@ def remove(filter_str: Optional[str]=None,

     check_data_folder(out_dir=out_dir)

-    if links is None:
+    if not snapshots:
         if filter_str and filter_patterns:
             stderr(
                 '[X] You should pass either a pattern as an argument, '
@@ -593,60 +594,54 @@ def remove(filter_str: Optional[str]=None,
             )
             stderr()
             hint(('To remove all urls you can run:',
-                  'archivebox remove --filter-type=regex ".*"'))
+                   'archivebox remove --filter-type=regex ".*"'))
             stderr()
             raise SystemExit(2)
         elif filter_str:
             filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]

-    log_list_started(filter_patterns, filter_type)
-    timer = TimedProgress(360, prefix='      ')
-    try:
-        links = list(list_links(
-            filter_patterns=filter_patterns,
-            filter_type=filter_type,
-            after=after,
-            before=before,
-        ))
-    finally:
-        timer.end()
+    list_kwargs = {
+        "filter_patterns": filter_patterns,
+        "filter_type": filter_type,
+        "after": after,
+        "before": before,
+    }
+    if snapshots:
+        list_kwargs["snapshots"] = snapshots
+
+    log_list_started(filter_patterns, filter_type)
+    timer = TimedProgress(360, prefix='      ')
+    try:
+        snapshots = list_links(**list_kwargs)
+    finally:
+        timer.end()

-    if not len(links):
+    if not snapshots.exists():
         log_removal_finished(0, 0)
         raise SystemExit(1)

-    log_list_finished(links)
-    log_removal_started(links, yes=yes, delete=delete)
+    log_links = [link.as_link() for link in snapshots]
+    log_list_finished(log_links)
+    log_removal_started(log_links, yes=yes, delete=delete)

     timer = TimedProgress(360, prefix='      ')
     try:
-        to_keep = []
-        to_delete = []
-        all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-        for link in all_links:
-            should_remove = (
-                (after is not None and float(link.timestamp) < after)
-                or (before is not None and float(link.timestamp) > before)
-                or link_matches_filter(link, filter_patterns or [], filter_type)
-                or link in links
-            )
-            if should_remove:
-                to_delete.append(link)
-
-                if delete:
-                    shutil.rmtree(link.link_dir, ignore_errors=True)
-            else:
-                to_keep.append(link)
+        for snapshot in snapshots:
+            if delete:
+                shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
     finally:
         timer.end()

-    remove_from_sql_main_index(links=to_delete, out_dir=out_dir)
-    write_main_index(links=to_keep, out_dir=out_dir, finished=True)
-    log_removal_finished(len(all_links), len(to_keep))
+    to_remove = snapshots.count()
+
+    remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
+    all_snapshots = load_main_index(out_dir=out_dir)
+    write_static_index([link.as_link() for link in all_snapshots], out_dir=out_dir)
+    log_removal_finished(all_snapshots.count() + to_remove, to_remove)

-    return to_keep
+    return all_snapshots


 @enforce_types
 def update(resume: Optional[float]=None,
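With the new snapshots parameter, the admin bulk action and the CLI share a single code path through remove(). A sketch of both entry points (the data-dir path is a made-up placeholder, and Django is assumed to be set up already):

    from core.models import Snapshot
    from archivebox.main import remove

    # What admin.delete_snapshots does: pass the selected rows straight through.
    remove(snapshots=Snapshot.objects.filter(url__icontains='example.com'),
           yes=True, delete=True, out_dir='/data/archivebox')   # placeholder path

    # What the CLI does: patterns are resolved to a QuerySet via list_links().
    remove(filter_str='example.com', filter_type='substring',
           yes=True, delete=False, out_dir='/data/archivebox')  # placeholder path

Also note that log_removal_finished() is called after the rows are deleted, so it must be handed the pre-deletion total (all_snapshots.count() + to_remove above); passing the post-deletion count would make both printed numbers wrong.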
@@ -737,18 +732,18 @@ def list_all(filter_patterns_str: Optional[str]=None,

         filter_patterns = filter_patterns_str.split('\n')

-    links = list_links(
+    snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
         after=after,
     )

-    if sort:
-        links = sorted(links, key=lambda link: getattr(link, sort))
+    #if sort:
+    #    snapshots = sorted(snapshots, key=lambda snapshot: getattr(snapshot, sort))

     folders = list_folders(
-        links=list(links),
+        links=[snapshot.as_link() for snapshot in snapshots],
         status=status,
         out_dir=out_dir,
     )
@@ -758,7 +753,8 @@ def list_all(filter_patterns_str: Optional[str]=None,


 @enforce_types
-def list_links(filter_patterns: Optional[List[str]]=None,
+def list_links(snapshots: Optional[QuerySet]=None,
+               filter_patterns: Optional[List[str]]=None,
                filter_type: str='exact',
                after: Optional[float]=None,
                before: Optional[float]=None,
@@ -766,19 +762,18 @@ def list_links(filter_patterns: Optional[List[str]]=None,

     check_data_folder(out_dir=out_dir)

-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
+    if snapshots:
+        all_snapshots = snapshots
+    else:
+        all_snapshots = load_main_index(out_dir=out_dir)

-    for link in all_links:
-        if after is not None and float(link.timestamp) < after:
-            continue
-        if before is not None and float(link.timestamp) > before:
-            continue
-
-        if filter_patterns:
-            if link_matches_filter(link, filter_patterns, filter_type):
-                yield link
-        else:
-            yield link
+    if after is not None:
+        all_snapshots = all_snapshots.filter(timestamp__gte=after)
+    if before is not None:
+        all_snapshots = all_snapshots.filter(timestamp__lte=before)
+    if filter_patterns:
+        all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
+    return all_snapshots

 @enforce_types
 def list_folders(links: List[Link],
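The timestamp lookups above mirror the old generator's semantics: the removed code skipped links with timestamp < after or timestamp > before, so the QuerySet must keep timestamp__gte=after and timestamp__lte=before (writing __lt/__gt would invert the filter). And because list_links() now returns a lazy QuerySet rather than a generator of Link objects, callers can keep narrowing the result before any SQL executes; rows are only fetched on iteration. A sketch of what that allows (the filter values are illustrative, and this chaining is not something the diff itself does yet):

    snapshots = list_links(filter_patterns=['example.com'], filter_type='domain')
    snapshots = snapshots.exclude(title__isnull=True)  # still no query executed
    total = snapshots.count()                          # one COUNT(*) round-trip
    links = [s.as_link() for s in snapshots]           # fetch rows, wrap as Links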