1
0
Fork 0

feat: Refactor remove command to use querysets

This commit is contained in:
Cristian 2020-08-21 12:42:08 -05:00 committed by Cristian Vargas
parent be520d137a
commit a8ed72501d
5 changed files with 68 additions and 75 deletions

View file

@@ -50,7 +50,7 @@ def verify_snapshots(modeladmin, request, queryset):
verify_snapshots.short_description = "Check" verify_snapshots.short_description = "Check"
def delete_snapshots(modeladmin, request, queryset): def delete_snapshots(modeladmin, request, queryset):
remove(links=[snapshot.as_link() for snapshot in queryset], yes=True, delete=True, out_dir=OUTPUT_DIR) remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
delete_snapshots.short_description = "Delete" delete_snapshots.short_description = "Delete"

View file

@@ -11,7 +11,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict from collections import OrderedDict
from contextlib import contextmanager from contextlib import contextmanager
from urllib.parse import urlparse from urllib.parse import urlparse
from django.db.models import QuerySet from django.db.models import QuerySet, Q
from ..util import ( from ..util import (
scheme, scheme,
@@ -370,19 +370,19 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
LINK_FILTERS = { LINK_FILTERS = {
'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern), 'exact': lambda pattern: Q(url=pattern),
'substring': lambda link, pattern: pattern in link.url, 'substring': lambda pattern: Q(url__icontains=pattern),
'regex': lambda link, pattern: bool(re.match(pattern, link.url)), 'regex': lambda pattern: Q(url__iregex=pattern),
'domain': lambda link, pattern: link.domain == pattern, 'domain': lambda pattern: Q(domain=pattern),
} }
@enforce_types @enforce_types
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool: def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
q_filter = Q()
for pattern in filter_patterns: for pattern in filter_patterns:
try: try:
if LINK_FILTERS[filter_type](link, pattern): q_filter = q_filter | LINK_FILTERS[filter_type](pattern)
return True except KeyError:
except Exception:
stderr() stderr()
stderr( stderr(
f'[X] Got invalid pattern for --filter-type={filter_type}:', f'[X] Got invalid pattern for --filter-type={filter_type}:',
@@ -390,8 +390,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
) )
stderr(f' {pattern}') stderr(f' {pattern}')
raise SystemExit(2) raise SystemExit(2)
return snapshots.filter(q_filter)
return False
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]: def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:

View file

@@ -2,6 +2,7 @@ __package__ = 'archivebox.index'
from io import StringIO from io import StringIO
from typing import List, Tuple, Iterator from typing import List, Tuple, Iterator
from django.db.models import QuerySet
from .schema import Link from .schema import Link
from ..util import enforce_types from ..util import enforce_types
@@ -21,14 +22,13 @@ def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
) )
@enforce_types @enforce_types
def remove_from_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) -> None:
setup_django(out_dir, check_db=True) setup_django(out_dir, check_db=True)
from core.models import Snapshot from core.models import Snapshot
from django.db import transaction from django.db import transaction
with transaction.atomic(): with transaction.atomic():
for link in links: snapshots.delete()
Snapshot.objects.filter(url=link.url).delete()
@enforce_types @enforce_types
def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None: def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:

View file

@@ -408,19 +408,18 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
except (KeyboardInterrupt, EOFError, AssertionError): except (KeyboardInterrupt, EOFError, AssertionError):
raise SystemExit(0) raise SystemExit(0)
def log_removal_finished(all_links: int, to_keep: int): def log_removal_finished(all_links: int, to_remove: int):
if all_links == 0: if all_links == 0:
print() print()
print('{red}[X] No matching links found.{reset}'.format(**ANSI)) print('{red}[X] No matching links found.{reset}'.format(**ANSI))
else: else:
num_removed = all_links - to_keep
print() print()
print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format( print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
num_removed, to_remove,
all_links, all_links,
**ANSI, **ANSI,
)) ))
print(' Index now contains {} links.'.format(to_keep)) print(' Index now contains {} links.'.format(all_links - to_remove))
def log_shell_welcome_msg(): def log_shell_welcome_msg():

View file

@@ -7,6 +7,7 @@ from pathlib import Path
from typing import Dict, List, Optional, Iterable, IO, Union from typing import Dict, List, Optional, Iterable, IO, Union
from crontab import CronTab, CronSlices from crontab import CronTab, CronSlices
from django.db.models import QuerySet
from .cli import ( from .cli import (
list_subcommands, list_subcommands,
@@ -31,7 +32,7 @@ from .index import (
dedupe_links, dedupe_links,
write_main_index, write_main_index,
write_static_index, write_static_index,
link_matches_filter, snapshot_filter,
get_indexed_folders, get_indexed_folders,
get_archived_folders, get_archived_folders,
get_unarchived_folders, get_unarchived_folders,
@@ -567,7 +568,7 @@ def add(urls: Union[str, List[str]],
def remove(filter_str: Optional[str]=None, def remove(filter_str: Optional[str]=None,
filter_patterns: Optional[List[str]]=None, filter_patterns: Optional[List[str]]=None,
filter_type: str='exact', filter_type: str='exact',
links: Optional[List[Link]]=None, snapshots: Optional[QuerySet]=None,
after: Optional[float]=None, after: Optional[float]=None,
before: Optional[float]=None, before: Optional[float]=None,
yes: bool=False, yes: bool=False,
@@ -577,7 +578,7 @@ def remove(filter_str: Optional[str]=None,
check_data_folder(out_dir=out_dir) check_data_folder(out_dir=out_dir)
if links is None: if not snapshots:
if filter_str and filter_patterns: if filter_str and filter_patterns:
stderr( stderr(
'[X] You should pass either a pattern as an argument, ' '[X] You should pass either a pattern as an argument, '
@@ -593,60 +594,54 @@ def remove(filter_str: Optional[str]=None,
) )
stderr() stderr()
hint(('To remove all urls you can run:', hint(('To remove all urls you can run:',
'archivebox remove --filter-type=regex ".*"')) 'archivebox remove --filter-type=regex ".*"'))
stderr() stderr()
raise SystemExit(2) raise SystemExit(2)
elif filter_str: elif filter_str:
filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')] filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
log_list_started(filter_patterns, filter_type) list_kwargs = {
timer = TimedProgress(360, prefix=' ') "filter_patterns": filter_patterns,
try: "filter_type": filter_type,
links = list(list_links( "after": after,
filter_patterns=filter_patterns, "before": before,
filter_type=filter_type, }
after=after, if snapshots:
before=before, list_kwargs["snapshots"] = snapshots
))
finally: log_list_started(filter_patterns, filter_type)
timer.end() timer = TimedProgress(360, prefix=' ')
try:
snapshots = list_links(**list_kwargs)
finally:
timer.end()
if not len(links): if not snapshots.exists():
log_removal_finished(0, 0) log_removal_finished(0, 0)
raise SystemExit(1) raise SystemExit(1)
log_list_finished(links) log_links = [link.as_link() for link in snapshots]
log_removal_started(links, yes=yes, delete=delete) log_list_finished(log_links)
log_removal_started(log_links, yes=yes, delete=delete)
timer = TimedProgress(360, prefix=' ') timer = TimedProgress(360, prefix=' ')
try: try:
to_keep = [] for snapshot in snapshots:
to_delete = [] if delete:
all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)] shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
for link in all_links:
should_remove = (
(after is not None and float(link.timestamp) < after)
or (before is not None and float(link.timestamp) > before)
or link_matches_filter(link, filter_patterns or [], filter_type)
or link in links
)
if should_remove:
to_delete.append(link)
if delete:
shutil.rmtree(link.link_dir, ignore_errors=True)
else:
to_keep.append(link)
finally: finally:
timer.end() timer.end()
remove_from_sql_main_index(links=to_delete, out_dir=out_dir) to_remove = snapshots.count()
write_main_index(links=to_keep, out_dir=out_dir, finished=True)
log_removal_finished(len(all_links), len(to_keep)) remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
all_snapshots = load_main_index(out_dir=out_dir)
write_static_index([link.as_link() for link in all_snapshots], out_dir=out_dir)
log_removal_finished(all_snapshots.count(), to_remove)
return to_keep return all_snapshots
@enforce_types @enforce_types
def update(resume: Optional[float]=None, def update(resume: Optional[float]=None,
@@ -737,18 +732,18 @@ def list_all(filter_patterns_str: Optional[str]=None,
filter_patterns = filter_patterns_str.split('\n') filter_patterns = filter_patterns_str.split('\n')
links = list_links( snapshots = list_links(
filter_patterns=filter_patterns, filter_patterns=filter_patterns,
filter_type=filter_type, filter_type=filter_type,
before=before, before=before,
after=after, after=after,
) )
if sort: #if sort:
links = sorted(links, key=lambda link: getattr(link, sort)) # snapshots = sorted(links, key=lambda link: getattr(link, sort))
folders = list_folders( folders = list_folders(
links=list(links), links=[snapshot.as_link() for snapshot in snapshots],
status=status, status=status,
out_dir=out_dir, out_dir=out_dir,
) )
@@ -758,7 +753,8 @@ def list_all(filter_patterns_str: Optional[str]=None,
@enforce_types @enforce_types
def list_links(filter_patterns: Optional[List[str]]=None, def list_links(snapshots: Optional[QuerySet]=None,
filter_patterns: Optional[List[str]]=None,
filter_type: str='exact', filter_type: str='exact',
after: Optional[float]=None, after: Optional[float]=None,
before: Optional[float]=None, before: Optional[float]=None,
@@ -766,19 +762,18 @@ def list_links(filter_patterns: Optional[List[str]]=None,
check_data_folder(out_dir=out_dir) check_data_folder(out_dir=out_dir)
all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)] if snapshots:
all_snapshots = snapshots
else:
all_snapshots = load_main_index(out_dir=out_dir)
for link in all_links: if after is not None:
if after is not None and float(link.timestamp) < after: all_snapshots = all_snapshots.filter(timestamp__lt=after)
continue if before is not None:
if before is not None and float(link.timestamp) > before: all_snapshots = all_snapshots.filter(timestamp__gt=before)
continue if filter_patterns:
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
if filter_patterns: return all_snapshots
if link_matches_filter(link, filter_patterns, filter_type):
yield link
else:
yield link
@enforce_types @enforce_types
def list_folders(links: List[Link], def list_folders(links: List[Link],