feat: Update update command to work with querysets
parent dafa1dd63c
commit f55153eab3

4 changed files with 84 additions and 56 deletions
@@ -392,45 +392,50 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
     return snapshots.filter(q_filter)
 
 
-def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_indexed_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links without checking archive status or data directory validity"""
+    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in links
     }
 
-def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_archived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are archived with a valid data directory"""
+    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_archived, links)
     }
 
-def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_unarchived_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """indexed links that are unarchived with no data directory or an empty data directory"""
+    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_unarchived, links)
     }
 
-def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_present_folders(_snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that actually exist in the archive/ folder"""
 
     all_folders = {}
 
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
-        if entry.is_dir(follow_symlinks=True):
+    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+        if entry.is_dir():
             link = None
             try:
                 link = parse_json_link_details(entry.path)
             except Exception:
                 pass
 
-            all_folders[entry.path] = link
+            all_folders[entry.name] = link
 
     return all_folders
 
-def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
+def get_valid_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs with a valid index matched to the main index and archived content"""
+    links = [snapshot.as_link() for snapshot in snapshots.iterator()]
     return {
         link.link_dir: link
         for link in filter(is_valid, links)
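
For context: the get_*_folders() helpers above now all start by materializing the Snapshot queryset into legacy Link objects. A minimal, illustrative sketch of that pattern (the Snapshot model and its as_link() method are assumed from the diff, not defined here):

def snapshots_to_links(snapshots):
    # Stream rows with .iterator() so a large queryset is not cached in memory,
    # then convert each ORM row into the legacy Link dataclass.
    return [snapshot.as_link() for snapshot in snapshots.iterator()]
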
@@ -29,22 +29,28 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: str=OUTPUT_DIR) ->
     with transaction.atomic():
         snapshots.delete()
 
+@enforce_types
+def write_link_to_sql_index(link: Link):
+    from core.models import Snapshot
+    info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
+    try:
+        info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
+    except Snapshot.DoesNotExist:
+        while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
+            info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+
+    return Snapshot.objects.update_or_create(url=link.url, defaults=info)[0]
+
+
 @enforce_types
 def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     setup_django(out_dir, check_db=True)
-    from core.models import Snapshot
     from django.db import transaction
 
     with transaction.atomic():
         for link in links:
-            info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-            try:
-                info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
-            except Snapshot.DoesNotExist:
-                while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
-                    info["timestamp"] = str(float(info["timestamp"]) + 1.0)
+            write_link_to_sql_index(link)
 
-            Snapshot.objects.update_or_create(url=link.url, defaults=info)
 
 
 @enforce_types
 def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
@@ -53,7 +59,10 @@ def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
     from django.db import transaction
 
     with transaction.atomic():
-        snap = Snapshot.objects.get(url=link.url)
+        try:
+            snap = Snapshot.objects.get(url=link.url)
+        except Snapshot.DoesNotExist:
+            snap = write_link_to_sql_index(link)
         snap.title = link.title
         snap.tags = link.tags
         snap.save()
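
The new write_link_to_sql_index() helper above resolves timestamp collisions by bumping the candidate timestamp one second at a time until it is free. A self-contained sketch of just that logic, using a plain set in place of the Snapshot table purely for illustration:

def next_free_timestamp(candidate: str, taken: set) -> str:
    # Keep adding 1.0 until the stringified timestamp is no longer taken.
    while candidate in taken:
        candidate = str(float(candidate) + 1.0)
    return candidate

# e.g. next_free_timestamp("1600000000.0", {"1600000000.0", "1600000001.0"})
# returns "1600000002.0"
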
@@ -659,24 +659,18 @@ def update(resume: Optional[float]=None,
 
     check_data_folder(out_dir=out_dir)
     check_dependencies()
+    new_links: List[Link] = [] # TODO: Remove input argument: only_new
 
-    # Step 1: Load list of links from the existing index
-    #         merge in and dedupe new links from import_path
-    new_links: List[Link] = []
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-
-    # Step 2: Write updated index with deduped old and new links back to disk
-    # write_main_index(links=list(all_links), out_dir=out_dir)
-
-    # Step 3: Filter for selected_links
-    matching_links = list_links(
+    # Step 1: Filter for selected_links
+    matching_snapshots = list_links(
         filter_patterns=filter_patterns,
         filter_type=filter_type,
         before=before,
         after=after,
     )
 
     matching_folders = list_folders(
-        links=list(matching_links),
+        links=matching_snapshots,
         status=status,
         out_dir=out_dir,
     )
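
list_links() now hands back a Snapshot queryset (see the snapshot_filter() context in the first hunk), so filtering composes lazily in SQL rather than over Python lists. A hypothetical sketch of that kind of Q-object composition; the field names and filter types here are illustrative, not ArchiveBox's exact set:

from django.db.models import Q

def build_url_filter(filter_patterns, filter_type="substring"):
    # OR the patterns together; Django only runs the query once the queryset
    # is iterated, e.g. Snapshot.objects.filter(build_url_filter([...]))
    q_filter = Q()
    for pattern in filter_patterns:
        if filter_type == "exact":
            q_filter |= Q(url=pattern)
        else:
            q_filter |= Q(url__icontains=pattern)
    return q_filter
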
@@ -685,7 +679,7 @@ def update(resume: Optional[float]=None,
     if index_only:
         return all_links
 
-    # Step 3: Run the archive methods for each link
+    # Step 2: Run the archive methods for each link
     to_archive = new_links if only_new else all_links
     if resume:
         to_archive = [
@@ -700,8 +694,8 @@ def update(resume: Optional[float]=None,
     archive_links(to_archive, overwrite=overwrite, out_dir=out_dir)
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links = [x.as_link() for x in load_main_index(out_dir=out_dir)]
-    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    all_links = load_main_index(out_dir=out_dir)
+    write_static_index([link.as_link() for link in all_links], out_dir=out_dir)
     return all_links
 
 
 @enforce_types
@@ -743,7 +737,7 @@ def list_all(filter_patterns_str: Optional[str]=None,
     # snapshots = sorted(links, key=lambda link: getattr(link, sort))
 
     folders = list_folders(
-        links=[snapshot.as_link() for snapshot in snapshots],
+        links=snapshots,
         status=status,
         out_dir=out_dir,
     )
@@ -782,30 +776,23 @@ def list_folders(links: List[Link],
 
     check_data_folder(out_dir=out_dir)
 
-    if status == 'indexed':
-        return get_indexed_folders(links, out_dir=out_dir)
-    elif status == 'archived':
-        return get_archived_folders(links, out_dir=out_dir)
-    elif status == 'unarchived':
-        return get_unarchived_folders(links, out_dir=out_dir)
-
-    elif status == 'present':
-        return get_present_folders(links, out_dir=out_dir)
-    elif status == 'valid':
-        return get_valid_folders(links, out_dir=out_dir)
-    elif status == 'invalid':
-        return get_invalid_folders(links, out_dir=out_dir)
-
-    elif status == 'duplicate':
-        return get_duplicate_folders(links, out_dir=out_dir)
-    elif status == 'orphaned':
-        return get_orphaned_folders(links, out_dir=out_dir)
-    elif status == 'corrupted':
-        return get_corrupted_folders(links, out_dir=out_dir)
-    elif status == 'unrecognized':
-        return get_unrecognized_folders(links, out_dir=out_dir)
-
-    raise ValueError('Status not recognized.')
+    STATUS_FUNCTIONS = {
+        "indexed": get_indexed_folders,
+        "archived": get_archived_folders,
+        "unarchived": get_unarchived_folders,
+        "present": get_present_folders,
+        "valid": get_valid_folders,
+        "invalid": get_invalid_folders,
+        "duplicate": get_duplicate_folders,
+        "orphaned": get_orphaned_folders,
+        "corrupted": get_corrupted_folders,
+        "unrecognized": get_unrecognized_folders,
+    }
+
+    try:
+        return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
+    except KeyError:
+        raise ValueError('Status not recognized.')
 
 
 @enforce_types
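
The list_folders() rewrite above swaps a long if/elif chain for dictionary dispatch, translating a missing key into the same ValueError the old chain raised. A generic sketch of the pattern with dummy handlers (not the real helper functions):

def dispatch_status(status: str):
    handlers = {
        "indexed": lambda: "all indexed folders",
        "archived": lambda: "archived folders only",
    }
    try:
        return handlers[status]()
    except KeyError:
        raise ValueError('Status not recognized.')

# dispatch_status("archived") -> "archived folders only"
# dispatch_status("bogus")    -> ValueError
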
tests/test_update.py (new file, 27 lines added)
@@ -0,0 +1,27 @@
+import sqlite3
+
+from .fixtures import *
+
+def test_update_status_invalid(tmp_path, process, disable_extractors_dict):
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
+    assert list((tmp_path / "archive").iterdir()) != []
+
+    subprocess.run(['archivebox', 'remove', 'http://127.0.0.1:8080/static/example.com.html', '--yes'], capture_output=True)
+
+    conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
+    c = conn.cursor()
+    link = c.execute("SELECT * FROM core_snapshot").fetchone()
+    conn.commit()
+    conn.close()
+
+    assert link is None
+
+    update_process = subprocess.run(['archivebox', 'update', '--status=invalid'], capture_output=True, env=disable_extractors_dict)
+
+    conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
+    c = conn.cursor()
+    url = c.execute("SELECT * FROM core_snapshot").fetchone()[1]
+    conn.commit()
+    conn.close()
+
+    assert url == 'http://127.0.0.1:8080/static/example.com.html'