From e37b1caf7fbd853aaaf5a9a35168a29ab22ca693 Mon Sep 17 00:00:00 2001 From: Alex Kotov Date: Tue, 29 Aug 2023 17:40:50 +0400 Subject: [PATCH] Some refactoring --- archivebox/index/html.py | 33 +++++++++++++++++---------------- archivebox/index/schema.py | 5 ++++- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/archivebox/index/html.py b/archivebox/index/html.py index ba22a1d2..07058ab3 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -34,6 +34,20 @@ MINIMAL_INDEX_TEMPLATE = 'minimal_index.html' LINK_DETAILS_TEMPLATE = 'snapshot.html' TITLE_LOADING_MSG = 'Not yet archived...' +SNAPSHOT_ICONS = { + 'singlefile': '❶', + 'wget': '🆆', + 'dom': '🅷', + 'pdf': '📄', + 'screenshot': '💻', + 'media': '📼', + 'git': '🅶', + 'archive_org': '🏛', + 'readability': '🆁', + 'mercury': '🅼', + 'warc': '📦', +} + ### Main Links Index @@ -134,19 +148,6 @@ def snapshot_icons(snapshot) -> str: canon = link.canonical_outputs() output = "" output_template = '{}  ' - icons = { - "singlefile": "❶", - "wget": "🆆", - "dom": "🅷", - "pdf": "📄", - "screenshot": "💻", - "media": "📼", - "git": "🅶", - "archive_org": "🏛", - "readability": "🆁", - "mercury": "🅼", - "warc": "📦" - } exclude = ["favicon", "title", "headers", "archive_org"] # Missing specific entry for WARC @@ -167,7 +168,7 @@ def snapshot_icons(snapshot) -> str: # elif existing.is_dir(): # existing = any(existing.glob('*.*')) output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)), - extractor, icons.get(extractor, "?")) + extractor, SNAPSHOT_ICONS.get(extractor, "?")) if extractor == "wget": # warc isn't technically it's own extractor, so we have to add it after wget @@ -175,7 +176,7 @@ def snapshot_icons(snapshot) -> str: exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output # get from filesystem (slower but more accurate) # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz")) - output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?")) + output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", SNAPSHOT_ICONS.get("warc", "?")) if extractor == "archive_org": # The check for archive_org is different, so it has to be handled separately @@ -186,7 +187,7 @@ def snapshot_icons(snapshot) -> str: # target_path = Path(path) / "archive.org.txt" # exists = target_path.exists() output += '{} '.format(canon["archive_org_path"], str(exists), - "archive_org", icons.get("archive_org", "?")) + "archive_org", SNAPSHOT_ICONS.get("archive_org", "?")) result = format_html('{}', mark_safe(output)) # end = datetime.now(timezone.utc) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index c44165a9..137d817e 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -22,6 +22,9 @@ from ..system import get_dir_size from ..util import ts_to_date_str, parse_date from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME, FAVICON_PROVIDER +ARCHIVE_DOT_ORG_TEMPLATE = 'https://web.archive.org/web/{}' + + class ArchiveError(Exception): def __init__(self, message, hints=None): super().__init__(message) @@ -432,7 +435,7 @@ class Link: 'pdf_path': 'output.pdf', 'screenshot_path': 'screenshot.png', 'dom_path': 'output.html', - 'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url), + 'archive_org_path': ARCHIVE_DOT_ORG_TEMPLATE.format(self.base_url), 'git_path': 'git/', 'media_path': 'media/', 'headers_path': 'headers.json',