feat: Refactor remove command to use querysets
This commit is contained in:
parent
be520d137a
commit
a8ed72501d
5 changed files with 68 additions and 75 deletions
|
@ -50,7 +50,7 @@ def verify_snapshots(modeladmin, request, queryset):
|
||||||
verify_snapshots.short_description = "Check"
|
verify_snapshots.short_description = "Check"
|
||||||
|
|
||||||
def delete_snapshots(modeladmin, request, queryset):
|
def delete_snapshots(modeladmin, request, queryset):
|
||||||
remove(links=[snapshot.as_link() for snapshot in queryset], yes=True, delete=True, out_dir=OUTPUT_DIR)
|
remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
|
||||||
|
|
||||||
delete_snapshots.short_description = "Delete"
|
delete_snapshots.short_description = "Delete"
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ from typing import List, Tuple, Dict, Optional, Iterable
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from django.db.models import QuerySet
|
from django.db.models import QuerySet, Q
|
||||||
|
|
||||||
from ..util import (
|
from ..util import (
|
||||||
scheme,
|
scheme,
|
||||||
|
@ -370,19 +370,19 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
|
||||||
|
|
||||||
|
|
||||||
LINK_FILTERS = {
|
LINK_FILTERS = {
|
||||||
'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
|
'exact': lambda pattern: Q(url=pattern),
|
||||||
'substring': lambda link, pattern: pattern in link.url,
|
'substring': lambda pattern: Q(url__icontains=pattern),
|
||||||
'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
|
'regex': lambda pattern: Q(url__iregex=pattern),
|
||||||
'domain': lambda link, pattern: link.domain == pattern,
|
'domain': lambda pattern: Q(domain=pattern),
|
||||||
}
|
}
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
|
def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
|
||||||
|
q_filter = Q()
|
||||||
for pattern in filter_patterns:
|
for pattern in filter_patterns:
|
||||||
try:
|
try:
|
||||||
if LINK_FILTERS[filter_type](link, pattern):
|
q_filter = q_filter | LINK_FILTERS[filter_type](pattern)
|
||||||
return True
|
except KeyError:
|
||||||
except Exception:
|
|
||||||
stderr()
|
stderr()
|
||||||
stderr(
|
stderr(
|
||||||
f'[X] Got invalid pattern for --filter-type={filter_type}:',
|
f'[X] Got invalid pattern for --filter-type={filter_type}:',
|
||||||
|
@ -390,8 +390,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
|
||||||
)
|
)
|
||||||
stderr(f' {pattern}')
|
stderr(f' {pattern}')
|
||||||
raise SystemExit(2)
|
raise SystemExit(2)
|
||||||
|
return snapshots.filter(q_filter)
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
|
||||||
|
|
|
@ -2,6 +2,7 @@ __package__ = 'archivebox.index'
|
||||||
|
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
from typing import List, Tuple, Iterator
|
from typing import List, Tuple, Iterator
|
||||||
|
from django.db.models import QuerySet
|
||||||
|
|
||||||
from .schema import Link
|
from .schema import Link
|
||||||
from ..util import enforce_types
|
from ..util import enforce_types
|
||||||
|
@ -21,14 +22,13 @@ def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
|
||||||
)
|
)
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def remove_from_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
|
def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) -> None:
|
||||||
setup_django(out_dir, check_db=True)
|
setup_django(out_dir, check_db=True)
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
from django.db import transaction
|
from django.db import transaction
|
||||||
|
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
for link in links:
|
snapshots.delete()
|
||||||
Snapshot.objects.filter(url=link.url).delete()
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
|
def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
|
||||||
|
|
|
@ -408,19 +408,18 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
|
||||||
except (KeyboardInterrupt, EOFError, AssertionError):
|
except (KeyboardInterrupt, EOFError, AssertionError):
|
||||||
raise SystemExit(0)
|
raise SystemExit(0)
|
||||||
|
|
||||||
def log_removal_finished(all_links: int, to_keep: int):
|
def log_removal_finished(all_links: int, to_remove: int):
|
||||||
if all_links == 0:
|
if all_links == 0:
|
||||||
print()
|
print()
|
||||||
print('{red}[X] No matching links found.{reset}'.format(**ANSI))
|
print('{red}[X] No matching links found.{reset}'.format(**ANSI))
|
||||||
else:
|
else:
|
||||||
num_removed = all_links - to_keep
|
|
||||||
print()
|
print()
|
||||||
print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
|
print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
|
||||||
num_removed,
|
to_remove,
|
||||||
all_links,
|
all_links,
|
||||||
**ANSI,
|
**ANSI,
|
||||||
))
|
))
|
||||||
print(' Index now contains {} links.'.format(to_keep))
|
print(' Index now contains {} links.'.format(all_links - to_remove))
|
||||||
|
|
||||||
|
|
||||||
def log_shell_welcome_msg():
|
def log_shell_welcome_msg():
|
||||||
|
|
|
@ -7,6 +7,7 @@ from pathlib import Path
|
||||||
|
|
||||||
from typing import Dict, List, Optional, Iterable, IO, Union
|
from typing import Dict, List, Optional, Iterable, IO, Union
|
||||||
from crontab import CronTab, CronSlices
|
from crontab import CronTab, CronSlices
|
||||||
|
from django.db.models import QuerySet
|
||||||
|
|
||||||
from .cli import (
|
from .cli import (
|
||||||
list_subcommands,
|
list_subcommands,
|
||||||
|
@ -31,7 +32,7 @@ from .index import (
|
||||||
dedupe_links,
|
dedupe_links,
|
||||||
write_main_index,
|
write_main_index,
|
||||||
write_static_index,
|
write_static_index,
|
||||||
link_matches_filter,
|
snapshot_filter,
|
||||||
get_indexed_folders,
|
get_indexed_folders,
|
||||||
get_archived_folders,
|
get_archived_folders,
|
||||||
get_unarchived_folders,
|
get_unarchived_folders,
|
||||||
|
@ -567,7 +568,7 @@ def add(urls: Union[str, List[str]],
|
||||||
def remove(filter_str: Optional[str]=None,
|
def remove(filter_str: Optional[str]=None,
|
||||||
filter_patterns: Optional[List[str]]=None,
|
filter_patterns: Optional[List[str]]=None,
|
||||||
filter_type: str='exact',
|
filter_type: str='exact',
|
||||||
links: Optional[List[Link]]=None,
|
snapshots: Optional[QuerySet]=None,
|
||||||
after: Optional[float]=None,
|
after: Optional[float]=None,
|
||||||
before: Optional[float]=None,
|
before: Optional[float]=None,
|
||||||
yes: bool=False,
|
yes: bool=False,
|
||||||
|
@ -577,7 +578,7 @@ def remove(filter_str: Optional[str]=None,
|
||||||
|
|
||||||
check_data_folder(out_dir=out_dir)
|
check_data_folder(out_dir=out_dir)
|
||||||
|
|
||||||
if links is None:
|
if not snapshots:
|
||||||
if filter_str and filter_patterns:
|
if filter_str and filter_patterns:
|
||||||
stderr(
|
stderr(
|
||||||
'[X] You should pass either a pattern as an argument, '
|
'[X] You should pass either a pattern as an argument, '
|
||||||
|
@ -593,60 +594,54 @@ def remove(filter_str: Optional[str]=None,
|
||||||
)
|
)
|
||||||
stderr()
|
stderr()
|
||||||
hint(('To remove all urls you can run:',
|
hint(('To remove all urls you can run:',
|
||||||
'archivebox remove --filter-type=regex ".*"'))
|
'archivebox remove --filter-type=regex ".*"'))
|
||||||
stderr()
|
stderr()
|
||||||
raise SystemExit(2)
|
raise SystemExit(2)
|
||||||
elif filter_str:
|
elif filter_str:
|
||||||
filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
|
filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]
|
||||||
|
|
||||||
log_list_started(filter_patterns, filter_type)
|
list_kwargs = {
|
||||||
timer = TimedProgress(360, prefix=' ')
|
"filter_patterns": filter_patterns,
|
||||||
try:
|
"filter_type": filter_type,
|
||||||
links = list(list_links(
|
"after": after,
|
||||||
filter_patterns=filter_patterns,
|
"before": before,
|
||||||
filter_type=filter_type,
|
}
|
||||||
after=after,
|
if snapshots:
|
||||||
before=before,
|
list_kwargs["snapshots"] = snapshots
|
||||||
))
|
|
||||||
finally:
|
log_list_started(filter_patterns, filter_type)
|
||||||
timer.end()
|
timer = TimedProgress(360, prefix=' ')
|
||||||
|
try:
|
||||||
|
snapshots = list_links(**list_kwargs)
|
||||||
|
finally:
|
||||||
|
timer.end()
|
||||||
|
|
||||||
|
|
||||||
if not len(links):
|
if not snapshots.exists():
|
||||||
log_removal_finished(0, 0)
|
log_removal_finished(0, 0)
|
||||||
raise SystemExit(1)
|
raise SystemExit(1)
|
||||||
|
|
||||||
|
|
||||||
log_list_finished(links)
|
log_links = [link.as_link() for link in snapshots]
|
||||||
log_removal_started(links, yes=yes, delete=delete)
|
log_list_finished(log_links)
|
||||||
|
log_removal_started(log_links, yes=yes, delete=delete)
|
||||||
|
|
||||||
timer = TimedProgress(360, prefix=' ')
|
timer = TimedProgress(360, prefix=' ')
|
||||||
try:
|
try:
|
||||||
to_keep = []
|
for snapshot in snapshots:
|
||||||
to_delete = []
|
if delete:
|
||||||
all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
|
shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
|
||||||
for link in all_links:
|
|
||||||
should_remove = (
|
|
||||||
(after is not None and float(link.timestamp) < after)
|
|
||||||
or (before is not None and float(link.timestamp) > before)
|
|
||||||
or link_matches_filter(link, filter_patterns or [], filter_type)
|
|
||||||
or link in links
|
|
||||||
)
|
|
||||||
if should_remove:
|
|
||||||
to_delete.append(link)
|
|
||||||
|
|
||||||
if delete:
|
|
||||||
shutil.rmtree(link.link_dir, ignore_errors=True)
|
|
||||||
else:
|
|
||||||
to_keep.append(link)
|
|
||||||
finally:
|
finally:
|
||||||
timer.end()
|
timer.end()
|
||||||
|
|
||||||
remove_from_sql_main_index(links=to_delete, out_dir=out_dir)
|
to_remove = snapshots.count()
|
||||||
write_main_index(links=to_keep, out_dir=out_dir, finished=True)
|
|
||||||
log_removal_finished(len(all_links), len(to_keep))
|
remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
|
||||||
|
all_snapshots = load_main_index(out_dir=out_dir)
|
||||||
|
write_static_index([link.as_link() for link in all_snapshots], out_dir=out_dir)
|
||||||
|
log_removal_finished(all_snapshots.count(), to_remove)
|
||||||
|
|
||||||
return to_keep
|
return all_snapshots
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def update(resume: Optional[float]=None,
|
def update(resume: Optional[float]=None,
|
||||||
|
@ -737,18 +732,18 @@ def list_all(filter_patterns_str: Optional[str]=None,
|
||||||
filter_patterns = filter_patterns_str.split('\n')
|
filter_patterns = filter_patterns_str.split('\n')
|
||||||
|
|
||||||
|
|
||||||
links = list_links(
|
snapshots = list_links(
|
||||||
filter_patterns=filter_patterns,
|
filter_patterns=filter_patterns,
|
||||||
filter_type=filter_type,
|
filter_type=filter_type,
|
||||||
before=before,
|
before=before,
|
||||||
after=after,
|
after=after,
|
||||||
)
|
)
|
||||||
|
|
||||||
if sort:
|
#if sort:
|
||||||
links = sorted(links, key=lambda link: getattr(link, sort))
|
# snapshots = sorted(links, key=lambda link: getattr(link, sort))
|
||||||
|
|
||||||
folders = list_folders(
|
folders = list_folders(
|
||||||
links=list(links),
|
links=[snapshot.as_link() for snapshot in snapshots],
|
||||||
status=status,
|
status=status,
|
||||||
out_dir=out_dir,
|
out_dir=out_dir,
|
||||||
)
|
)
|
||||||
|
@ -758,7 +753,8 @@ def list_all(filter_patterns_str: Optional[str]=None,
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def list_links(filter_patterns: Optional[List[str]]=None,
|
def list_links(snapshots: Optional[QuerySet]=None,
|
||||||
|
filter_patterns: Optional[List[str]]=None,
|
||||||
filter_type: str='exact',
|
filter_type: str='exact',
|
||||||
after: Optional[float]=None,
|
after: Optional[float]=None,
|
||||||
before: Optional[float]=None,
|
before: Optional[float]=None,
|
||||||
|
@ -766,19 +762,18 @@ def list_links(filter_patterns: Optional[List[str]]=None,
|
||||||
|
|
||||||
check_data_folder(out_dir=out_dir)
|
check_data_folder(out_dir=out_dir)
|
||||||
|
|
||||||
all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
|
if snapshots:
|
||||||
|
all_snapshots = snapshots
|
||||||
|
else:
|
||||||
|
all_snapshots = load_main_index(out_dir=out_dir)
|
||||||
|
|
||||||
for link in all_links:
|
if after is not None:
|
||||||
if after is not None and float(link.timestamp) < after:
|
all_snapshots = all_snapshots.filter(timestamp__lt=after)
|
||||||
continue
|
if before is not None:
|
||||||
if before is not None and float(link.timestamp) > before:
|
all_snapshots = all_snapshots.filter(timestamp__gt=before)
|
||||||
continue
|
if filter_patterns:
|
||||||
|
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
|
||||||
if filter_patterns:
|
return all_snapshots
|
||||||
if link_matches_filter(link, filter_patterns, filter_type):
|
|
||||||
yield link
|
|
||||||
else:
|
|
||||||
yield link
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def list_folders(links: List[Link],
|
def list_folders(links: List[Link],
|
||||||
|
|
Loading…
Reference in a new issue