From 51ae634ec98b7dc8ee57ae6f022a87924fb9d912 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 3 Apr 2019 00:27:37 -0400
Subject: [PATCH] working argparse based CLI with most commands implemented

---
 archivebox/__init__.py               |   4 +
 archivebox/__main__.py               |   8 +-
 archivebox/cli/__init__.py           |  27 +++
 archivebox/cli/archivebox.py         |  71 ++++++++
 archivebox/cli/archivebox_add.py     |  84 +++++++++
 archivebox/cli/archivebox_help.py    |  54 ++++++
 archivebox/cli/archivebox_init.py    |  72 ++++++++
 archivebox/cli/archivebox_list.py    |  81 +++++++++
 archivebox/cli/archivebox_update.py  |  45 +++++
 archivebox/cli/archivebox_version.py | 103 ++++++++++++
 archivebox/core/settings.py          |  15 +-
 archivebox/env.py                    |   4 +
 archivebox/legacy/__init__.py        |   5 -
 archivebox/legacy/archive.py         | 243 ---------------------------
 archivebox/legacy/index.py           | 150 ++++++++++++++++-
 archivebox/legacy/links.py           |  93 ----------
 archivebox/legacy/main.py            |  80 +++++++++
 archivebox/legacy/purge.py           |   6 +-
 archivebox/legacy/util.py            |  82 +++------
 bin/archivebox                       |   4 +-
 20 files changed, 807 insertions(+), 424 deletions(-)
 create mode 100644 archivebox/cli/__init__.py
 create mode 100755 archivebox/cli/archivebox.py
 create mode 100644 archivebox/cli/archivebox_add.py
 create mode 100755 archivebox/cli/archivebox_help.py
 create mode 100755 archivebox/cli/archivebox_init.py
 create mode 100644 archivebox/cli/archivebox_list.py
 create mode 100644 archivebox/cli/archivebox_update.py
 create mode 100755 archivebox/cli/archivebox_version.py
 delete mode 100755 archivebox/legacy/archive.py
 delete mode 100644 archivebox/legacy/links.py
 create mode 100644 archivebox/legacy/main.py

diff --git a/archivebox/__init__.py b/archivebox/__init__.py
index e69de29b..26fcd715 100644
--- a/archivebox/__init__.py
+++ b/archivebox/__init__.py
@@ -0,0 +1,4 @@
+
+__AUTHOR__ = 'Nick Sweeting '
+__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
+__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
diff --git a/archivebox/__main__.py b/archivebox/__main__.py
index 8e75ec40..1439b07f 100755
--- a/archivebox/__main__.py
+++ b/archivebox/__main__.py
@@ -1,19 +1,15 @@
 #!/usr/bin/env python3
-"""
-Main ArchiveBox command line application entrypoint.
-"""
-
 __package__ = 'archivebox'
+
 import os
 import sys
 
 PYTHON_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(PYTHON_DIR)
 
-from .env import *
-from .legacy.archive import main
+from .cli.archivebox import main
 
 
 if __name__ == '__main__':
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
new file mode 100644
index 00000000..ea1fcda5
--- /dev/null
+++ b/archivebox/cli/__init__.py
@@ -0,0 +1,27 @@
+__package__ = 'archivebox.cli'
+
+import os
+from importlib import import_module
+
+CLI_DIR = os.path.dirname(os.path.abspath(__file__))
+
+required_attrs = ('__package__', '__command__', '__description__', 'main')
+
+
+def list_subcommands():
+    COMMANDS = {}
+    for filename in os.listdir(CLI_DIR):
+        if filename.startswith('archivebox_') and filename.endswith('.py'):
+            subcommand = filename.replace('archivebox_', '').replace('.py', '')
+            module = import_module('.archivebox_{}'.format(subcommand), __package__)
+
+            assert all(hasattr(module, attr) for attr in required_attrs)
+            assert module.__command__.split(' ')[-1] == subcommand
+            COMMANDS[subcommand] = module.__description__
+
+    return COMMANDS
+
+
+def run_subcommand(subcommand: str, args=None):
+    module = import_module('.archivebox_{}'.format(subcommand), __package__)
+    return module.main(args)    # type: ignore
diff --git a/archivebox/cli/archivebox.py b/archivebox/cli/archivebox.py
new file mode 100755
index 00000000..31cd8b5c
--- /dev/null
+++ b/archivebox/cli/archivebox.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# archivebox [command]
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox'
+__description__ = 'ArchiveBox: The self-hosted internet archive.'
+
+import sys
+import argparse
+
+from . import list_subcommands, run_subcommand
+
+
+def parse_args(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    subcommands = list_subcommands()
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=False,
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '--help', '-h',
+        action='store_true',
+        help=subcommands['help'],
+    )
+    group.add_argument(
+        '--version',
+        action='store_true',
+        help=subcommands['version'],
+    )
+    group.add_argument(
+        "subcommand",
+        type=str,
+        help="The name of the subcommand to run",
+        nargs='?',
+        choices=subcommands.keys(),
+        default=None,
+    )
+    parser.add_argument(
+        "args",
+        help="Arguments for the subcommand",
+        nargs=argparse.REMAINDER,
+    )
+
+    command = parser.parse_args(args)
+
+    if command.help:
+        command.subcommand = 'help'
+    if command.version:
+        command.subcommand = 'version'
+
+    # print('--------------------------------------------')
+    # print('Command:     ', sys.argv[0])
+    # print('Subcommand:  ', command.subcommand)
+    # print('Args to pass:', args[1:])
+    # print('--------------------------------------------')
+
+    return command.subcommand, command.args
+
+
+def main(args=None):
+    subcommand, subcommand_args = parse_args(args)
+    run_subcommand(subcommand, subcommand_args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
new file mode 100644
index 00000000..934907a2
--- /dev/null
+++ b/archivebox/cli/archivebox_add.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox add'
+__description__ = 'Add a new URL or list of URLs to your archive'
+
+import os
+import sys
+import argparse
+
+from ..legacy.util import (
+    handle_stdin_import,
+    handle_file_import,
+)
+from ..legacy.main import update_archive_data
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    # parser.add_argument(
+    #     '--depth', #'-d',
+    #     type=int,
+    #     help='Recursively archive all linked pages up to this many hops away',
+    #     default=0,
+    # )
+    parser.add_argument(
+        '--only-new', #'-n',
+        action='store_true',
+        help="Don't attempt to retry previously skipped/failed links when updating",
+    )
+    parser.add_argument(
+        '--mirror', #'-m',
+        action='store_true',
+        help='Archive an entire site (finding all linked pages below it on the same domain)',
+    )
+    parser.add_argument(
+        '--crawler', #'-r',
+        choices=('depth_first', 'breadth_first'),
+        help='Controls which crawler to use in order to find outlinks in a given page',
+        default=None,
+    )
+    parser.add_argument(
+        'url',
+        nargs='?',
+        type=str,
+        default=None,
+        help='URL of page to archive (or path to local file)'
+    )
+    command = parser.parse_args(args)
+
+    ### Handle ingesting urls piped in through stdin
+    # (e.g. if user does: cat example_urls.txt | ./archive)
+    import_path = None
+    if not sys.stdin.isatty():
+        stdin_raw_text = sys.stdin.read()
+        if stdin_raw_text and command.url:
+            print(
+                '[X] You should pass either a path as an argument, '
+                'or pass a list of links via stdin, but not both.\n'
+            )
+            raise SystemExit(1)
+
+        import_path = handle_stdin_import(stdin_raw_text)
+
+    ### Handle ingesting url from a remote file/feed
+    # (e.g. if an RSS feed URL is used as the import path)
+    elif command.url:
+        import_path = handle_file_import(command.url)
+
+
+    update_archive_data(
+        import_path=import_path,
+        resume=None,
+        only_new=command.only_new,
+    )
+
+if __name__ == '__main__':
+    main()
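
The add command accepts exactly one source per run: either a URL/path argument or a list of
links piped via stdin, never both. Usage, in the same style as the help text below (the URLs
and filename are illustrative):

    echo 'https://example.com/some/page' | archivebox add
    archivebox add https://example.com/some/other/page
    archivebox add ~/Downloads/bookmarks_export.html

Passing an argument and piping stdin at the same time exits with an error, since it would be
ambiguous which source was intended.
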
diff --git a/archivebox/cli/archivebox_help.py b/archivebox/cli/archivebox_help.py
new file mode 100755
index 00000000..7e4f9d87
--- /dev/null
+++ b/archivebox/cli/archivebox_help.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox help'
+__description__ = 'Print the ArchiveBox help message and usage'
+
+import sys
+import argparse
+
+from ..legacy.util import reject_stdin
+from . import list_subcommands
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.parse_args(args)
+    reject_stdin(__command__)
+
+
+    COMMANDS_HELP_TEXT = '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in list_subcommands().items()
+    )
+
+    print(f'''ArchiveBox: The self-hosted internet archive.
+Usage:
+    archivebox [command] [--help] [--version] [...args]
+
+Commands:
+    {COMMANDS_HELP_TEXT}
+
+Example Use:
+    mkdir my-archive; cd my-archive/
+    archivebox init
+
+    echo 'https://example.com/some/page' | archivebox add
+    archivebox add https://example.com/some/other/page
+    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
+    archivebox add --depth=1 https://example.com/feed.rss
+    archivebox update --resume=15109948213.123
+
+Documentation:
+    https://github.com/pirate/ArchiveBox/wiki
+''')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py
new file mode 100755
index 00000000..ddfbd4a1
--- /dev/null
+++ b/archivebox/cli/archivebox_init.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox init'
+__description__ = 'Initialize a new ArchiveBox collection in the current directory'
+
+import os
+import sys
+import argparse
+
+from ..legacy.util import reject_stdin
+from ..legacy.config import (
+    OUTPUT_DIR,
+    SOURCES_DIR,
+    ARCHIVE_DIR,
+    DATABASE_DIR,
+    ANSI,
+)
+
+
+def init(output_dir: str=OUTPUT_DIR):
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
+    is_empty = not len(set(os.listdir(output_dir)) - harmless_files)
+    existing_index = os.path.exists(os.path.join(output_dir, 'index.json'))
+
+    if not is_empty:
+        if existing_index:
+            print('You already have an archive in this folder!')
+            # TODO: import old archivebox version's archive data folder
+
+            raise SystemExit(1)
+        else:
+            print(
+                ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
+                 "\n\n"
+                 "    {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
+                 "    just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
+                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
+                ).format(output_dir, **ANSI)
+            )
+            raise SystemExit(1)
+
+
+    print('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI))
+    os.makedirs(SOURCES_DIR)
+    print(f'    > {SOURCES_DIR}')
+    os.makedirs(ARCHIVE_DIR)
+    print(f'    > {ARCHIVE_DIR}')
+    os.makedirs(DATABASE_DIR)
+    print(f'    > {DATABASE_DIR}')
+    print('{green}[√] Done.{reset}'.format(**ANSI))
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.parse_args(args)
+    reject_stdin(__command__)
+
+    init()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
new file mode 100644
index 00000000..75699d3a
--- /dev/null
+++ b/archivebox/cli/archivebox_list.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox list'
+__description__ = 'List all the URLs currently in the archive.'
+
+import sys
+import json
+import argparse
+
+
+from ..legacy.util import reject_stdin, ExtendedEncoder
+from ..legacy.main import list_archive_data, csv_format
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '--csv', #'-c',
+        type=str,
+        help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
+        default=None,
+    )
+    group.add_argument(
+        '--json', #'-j',
+        action='store_true',
+        help="Print the output in JSON format with all columns included.",
+    )
+    parser.add_argument(
+        '--filter', #'-f',
+        type=str,
+        help="List only URLs matching the given regex pattern.",
+        default=None,
+    )
+    parser.add_argument(
+        '--sort', #'-s',
+        type=str,
+        help="List the links sorted using the given key, e.g. timestamp or updated",
+        default=None,
+    )
+    parser.add_argument(
+        '--before', #'-b',
+        type=float,
+        help="List only URLs bookmarked before the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--after', #'-a',
+        type=float,
+        help="List only URLs bookmarked after the given timestamp.",
+        default=None,
+    )
+    command = parser.parse_args(args)
+    reject_stdin(__command__)
+
+    links = list_archive_data(
+        filter_regex=command.filter,
+        before=command.before,
+        after=command.after,
+    )
+    if command.sort:
+        links = sorted(links, key=lambda link: getattr(link, command.sort))
+
+    if command.csv:
+        print(command.csv)
+        print('\n'.join(csv_format(link, command.csv) for link in links))
+    elif command.json:
+        print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
+    else:
+        print('\n'.join(link.url for link in links))
+
+
+if __name__ == '__main__':
+    main()
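
The list command composes list_archive_data (defined in legacy/main.py further down) with
plain, JSON, or CSV output. A sketch of the CSV path (hypothetical session; the timestamp
and URL are invented for illustration):

    >>> from archivebox.legacy.main import list_archive_data, csv_format
    >>> for link in list_archive_data(filter_regex=r'.*example\.com.*'):
    ...     print(csv_format(link, 'timestamp,url'))
    "15109948214.123","https://example.com/some/page"

csv_format JSON-encodes each column value, so strings come out quoted and embedded commas
stay unambiguous.
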
diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
new file mode 100644
index 00000000..c74fc8b7
--- /dev/null
+++ b/archivebox/cli/archivebox_update.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox update'
+__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
+
+import sys
+import argparse
+
+
+from ..legacy.util import reject_stdin
+from ..legacy.main import update_archive_data
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.add_argument(
+        '--only-new', #'-n',
+        action='store_true',
+        help="Don't attempt to retry previously skipped/failed links when updating",
+    )
+    parser.add_argument(
+        '--resume', #'-r',
+        type=float,
+        help='Resume the update process from a given timestamp',
+        default=None,
+    )
+    command = parser.parse_args(args)
+    reject_stdin(__command__)
+
+    update_archive_data(
+        import_path=None,
+        resume=command.resume,
+        only_new=command.only_new,
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py
new file mode 100755
index 00000000..d5eb7954
--- /dev/null
+++ b/archivebox/cli/archivebox_version.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox version'
+__description__ = 'Print the ArchiveBox version and dependency information'
+
+import sys
+import shutil
+import argparse
+
+from ..legacy.util import reject_stdin
+from ..legacy.config import (
+    VERSION,
+
+    REPO_DIR,
+    PYTHON_DIR,
+    LEGACY_DIR,
+    TEMPLATES_DIR,
+    OUTPUT_DIR,
+    SOURCES_DIR,
+    ARCHIVE_DIR,
+    DATABASE_DIR,
+
+    USE_CURL,
+    USE_WGET,
+    USE_CHROME,
+    FETCH_GIT,
+    FETCH_MEDIA,
+
+    DJANGO_BINARY,
+    CURL_BINARY,
+    GIT_BINARY,
+    WGET_BINARY,
+    YOUTUBEDL_BINARY,
+    CHROME_BINARY,
+
+    DJANGO_VERSION,
+    CURL_VERSION,
+    GIT_VERSION,
+    WGET_VERSION,
+    YOUTUBEDL_VERSION,
+    CHROME_VERSION,
+)
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.parse_args(args)
+    reject_stdin(__command__)
+
+    print('ArchiveBox v{}'.format(VERSION))
+    print()
+    print('[i] Folder locations:')
+    print('    REPO_DIR:      ', REPO_DIR)
+    print('    PYTHON_DIR:    ', PYTHON_DIR)
+    print('    LEGACY_DIR:    ', LEGACY_DIR)
+    print('    TEMPLATES_DIR: ', TEMPLATES_DIR)
+    print()
+    print('    OUTPUT_DIR:    ', OUTPUT_DIR)
+    print('    SOURCES_DIR:   ', SOURCES_DIR)
+    print('    ARCHIVE_DIR:   ', ARCHIVE_DIR)
+    print('    DATABASE_DIR:  ', DATABASE_DIR)
+    print()
+    print(
+        '[√] Django:'.ljust(14),
+        'python3 {} --version\n'.format(DJANGO_BINARY),
+        ' '*13, DJANGO_VERSION, '\n',
+    )
+    print(
+        '[{}] CURL:'.format('√' if USE_CURL else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(CURL_BINARY)),
+        ' '*13, CURL_VERSION, '\n',
+    )
+    print(
+        '[{}] GIT:'.format('√' if FETCH_GIT else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(GIT_BINARY)),
+        ' '*13, GIT_VERSION, '\n',
+    )
+    print(
+        '[{}] WGET:'.format('√' if USE_WGET else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(WGET_BINARY)),
+        ' '*13, WGET_VERSION, '\n',
+    )
+    print(
+        '[{}] YOUTUBEDL:'.format('√' if FETCH_MEDIA else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(YOUTUBEDL_BINARY)),
+        ' '*13, YOUTUBEDL_VERSION, '\n',
+    )
+    print(
+        '[{}] CHROME:'.format('√' if USE_CHROME else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(CHROME_BINARY)),
+        ' '*13, CHROME_VERSION, '\n',
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 0f209b4c..14ba519b 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -13,12 +13,12 @@
 DEBUG = True
 
 INSTALLED_APPS = [
-    'django.contrib.admin',
-    'django.contrib.auth',
-    'django.contrib.contenttypes',
-    'django.contrib.sessions',
-    'django.contrib.messages',
-    'django.contrib.staticfiles',
+    # 'django.contrib.admin',
+    # 'django.contrib.auth',
+    # 'django.contrib.contenttypes',
+    # 'django.contrib.sessions',
+    # 'django.contrib.messages',
+    # 'django.contrib.staticfiles',
 
     'core',
 ]
@@ -53,10 +53,11 @@
 
 WSGI_APPLICATION = 'core.wsgi.application'
 
+DATABASE_FILE = os.path.join(DATABASE_DIR, 'database.sqlite3')
 DATABASES = {
     'default': {
         'ENGINE': 'django.db.backends.sqlite3',
-        'NAME': os.path.join(DATABASE_DIR, 'database.sqlite3'),
+        'NAME': DATABASE_FILE,
     }
 }
diff --git a/archivebox/env.py b/archivebox/env.py
index 3a40fab5..905fa275 100644
--- a/archivebox/env.py
+++ b/archivebox/env.py
@@ -9,3 +9,7 @@
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")
 
 import django
 django.setup()
+
+from django.conf import settings
+
+DATABASE_FILE = settings.DATABASE_FILE
diff --git a/archivebox/legacy/__init__.py b/archivebox/legacy/__init__.py
index ab53f570..e69de29b 100644
--- a/archivebox/legacy/__init__.py
+++ b/archivebox/legacy/__init__.py
@@ -1,5 +0,0 @@
-
-
-#__name__ = 'archivebox'
-#__package__ = 'archivebox'
-
diff --git a/archivebox/legacy/archive.py b/archivebox/legacy/archive.py
deleted file mode 100755
index 82788c47..00000000
--- a/archivebox/legacy/archive.py
+++ /dev/null
@@ -1,243 +0,0 @@
-#!/usr/bin/env python3
-"""
-ArchiveBox command line application.
-
-./archive and ./bin/archivebox both point to this file,
-but you can also run it directly using `python3 archive.py`
-
-Usage & Documentation:
-    https://github.com/pirate/ArchiveBox/Wiki
-"""
-__package__ = 'legacy'
-
-import os
-import sys
-import shutil
-
-from typing import List, Optional
-
-from .schema import Link
-from .links import links_after_timestamp
-from .index import write_links_index, load_links_index
-from .archive_methods import archive_link
-from .config import (
-    ONLY_NEW,
-    VERSION,
-    ANSI,
-
-    REPO_DIR,
-    PYTHON_DIR,
-    LEGACY_DIR,
-    TEMPLATES_DIR,
-    OUTPUT_DIR,
-    SOURCES_DIR,
-    ARCHIVE_DIR,
-    DATABASE_DIR,
-
-    USE_CURL,
-    USE_WGET,
-    USE_CHROME,
-    FETCH_GIT,
-    FETCH_MEDIA,
-
-    DJANGO_BINARY,
-    CURL_BINARY,
-    GIT_BINARY,
-    WGET_BINARY,
-    YOUTUBEDL_BINARY,
-    CHROME_BINARY,
-
-    DJANGO_VERSION,
-    CURL_VERSION,
-    GIT_VERSION,
-    WGET_VERSION,
-    YOUTUBEDL_VERSION,
-    CHROME_VERSION,
-)
-from .util import (
-    enforce_types,
-    handle_stdin_import,
-    handle_file_import,
-)
-from .logs import (
-    log_archiving_started,
-    log_archiving_paused,
-    log_archiving_finished,
-)
-
-__AUTHOR__ = 'Nick Sweeting '
-__VERSION__ = VERSION
-__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
-__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
-
-
-
-def print_help():
-    print('ArchiveBox: The self-hosted internet archive.\n')
-    print("Documentation:")
-    print("    https://github.com/pirate/ArchiveBox/wiki\n")
-    print("UI Usage:")
-    print("    Open output/index.html to view your archive.\n")
-    print("CLI Usage:")
-    print("    mkdir data; cd data/")
-    print("    archivebox init\n")
-    print("    echo 'https://example.com/some/page' | archivebox add")
-    print("    archivebox add https://example.com/some/other/page")
-    print("    archivebox add --depth=1 ~/Downloads/bookmarks_export.html")
-    print("    archivebox add --depth=1 https://example.com/feed.rss")
-    print("    archivebox update --resume=15109948213.123")
-
-
-def print_version():
-    print('ArchiveBox v{}'.format(__VERSION__))
-    print()
-    print('[i] Folder locations:')
-    print('    REPO_DIR:      ', REPO_DIR)
-    print('    PYTHON_DIR:    ', PYTHON_DIR)
-    print('    LEGACY_DIR:    ', LEGACY_DIR)
-    print('    TEMPLATES_DIR: ', TEMPLATES_DIR)
-    print()
-    print('    OUTPUT_DIR:    ', OUTPUT_DIR)
-    print('    SOURCES_DIR:   ', SOURCES_DIR)
-    print('    ARCHIVE_DIR:   ', ARCHIVE_DIR)
-    print('    DATABASE_DIR:  ', DATABASE_DIR)
-    print()
-    print(
-        '[√] Django:'.ljust(14),
-        'python3 {} --version\n'.format(DJANGO_BINARY),
-        ' '*13, DJANGO_VERSION, '\n',
-    )
-    print(
-        '[{}] CURL:'.format('√' if USE_CURL else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(CURL_BINARY)),
-        ' '*13, CURL_VERSION, '\n',
-    )
-    print(
-        '[{}] GIT:'.format('√' if FETCH_GIT else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(GIT_BINARY)),
-        ' '*13, GIT_VERSION, '\n',
-    )
-    print(
-        '[{}] WGET:'.format('√' if USE_WGET else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(WGET_BINARY)),
-        ' '*13, WGET_VERSION, '\n',
-    )
-    print(
-        '[{}] YOUTUBEDL:'.format('√' if FETCH_MEDIA else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(YOUTUBEDL_BINARY)),
-        ' '*13, YOUTUBEDL_VERSION, '\n',
-    )
-    print(
-        '[{}] CHROME:'.format('√' if USE_CHROME else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(CHROME_BINARY)),
-        ' '*13, CHROME_VERSION, '\n',
-    )
-
-
-def main(args=None) -> None:
-    if args is None:
-        args = sys.argv
-
-    if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
-        print_help()
-        raise SystemExit(0)
-
-    if set(args).intersection(('--version', 'version')):
-        print_version()
-        raise SystemExit(0)
-
-    ### Handle CLI arguments
-    #     ./archive bookmarks.html
-    #     ./archive 1523422111.234
-    import_path, resume = None, None
-    if len(args) == 2:
-        # if the argument is a string, it's a import_path file to import
-        # if it's a number, it's a timestamp to resume archiving from
-        if args[1].replace('.', '').isdigit():
-            import_path, resume = None, args[1]
-        else:
-            import_path, resume = args[1], None
-
-    ### Set up output folder
-    if not os.path.exists(OUTPUT_DIR):
-        print('{green}[+] Created a new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
-        os.makedirs(OUTPUT_DIR)
-        os.makedirs(SOURCES_DIR)
-        os.makedirs(ARCHIVE_DIR)
-        os.makedirs(DATABASE_DIR)
-    else:
-        not_empty = len(set(os.listdir(OUTPUT_DIR)) - {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'})
-        index_exists = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
-        if not_empty and not index_exists:
-            print(
-                ("{red}[X] Could not find index.json in the OUTPUT_DIR: {reset}{}\n\n"
-                 "    If you're trying to update an existing archive, you must set OUTPUT_DIR to or run archivebox from inside the archive folder you're trying to update.\n"
-                 "    If you're trying to create a new archive, you must run archivebox inside a completely empty directory."
-                 "\n\n"
-                 "    {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
-                 "    just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
-                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
-                ).format(OUTPUT_DIR, **ANSI)
-            )
-            raise SystemExit(1)
-
-    ### Handle ingesting urls piped in through stdin
-    # (.e.g if user does cat example_urls.txt | ./archive)
-    if not sys.stdin.isatty():
-        stdin_raw_text = sys.stdin.read()
-        if stdin_raw_text and import_path:
-            print(
-                '[X] You should pass either a path as an argument, '
-                'or pass a list of links via stdin, but not both.\n'
-            )
-            print_help()
-            raise SystemExit(1)
-
-        import_path = handle_stdin_import(stdin_raw_text)
-
-    ### Handle ingesting url from a remote file/feed
-    # (e.g. if an RSS feed URL is used as the import path)
-    if import_path:
-        import_path = handle_file_import(import_path)
-
-    ### Run the main archive update process
-    update_archive_data(import_path=import_path, resume=resume)
-
-
-@enforce_types
-def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None) -> List[Link]:
-    """The main ArchiveBox entrancepoint. Everything starts here."""
-
-    # Step 1: Load list of links from the existing index
-    #         merge in and dedupe new links from import_path
-    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
-
-    # Step 2: Write updated index with deduped old and new links back to disk
-    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
-
-    # Step 3: Run the archive methods for each link
-    links = new_links if ONLY_NEW else all_links
-    log_archiving_started(len(links), resume)
-    idx: int = 0
-    link: Optional[Link] = None
-    try:
-        for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, link_dir=link.link_dir)
-
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise
-
-    log_archiving_finished(len(links))
-
-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
-    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
-    return all_links
-
-
-if __name__ == '__main__':
-    main(sys.argv)
diff --git a/archivebox/legacy/index.py b/archivebox/legacy/index.py
index 98d9e3df..a28192b2 100644
--- a/archivebox/legacy/index.py
+++ b/archivebox/legacy/index.py
@@ -3,7 +3,8 @@
 import json
 from datetime import datetime
 from string import Template
-from typing import List, Tuple, Iterator, Optional, Mapping
+from typing import List, Tuple, Iterator, Optional, Mapping, Iterable
+from collections import OrderedDict
 
 from .schema import Link, ArchiveResult
 from .config import (
@@ -13,14 +14,15 @@
     GIT_SHA,
     FOOTER_INFO,
     TIMEOUT,
+    URL_BLACKLIST_PTN,
 )
 from .util import (
+    scheme,
+    fuzzy_url,
     ts_to_date,
-    merge_links,
     urlencode,
     htmlencode,
     urldecode,
-    derived_link_info,
     wget_output_path,
     enforce_types,
     TimedProgress,
@@ -28,7 +30,6 @@
     atomic_write,
 )
 from .parse import parse_links
-from .links import validate_links
 from .logs import (
     log_indexing_process_started,
     log_indexing_started,
@@ -41,6 +42,147 @@
 
 TITLE_LOADING_MSG = 'Not yet archived...'
 
+
+### Link filtering and checking
+
+@enforce_types
+def derived_link_info(link: Link) -> dict:
+    """extend link info with the archive urls and other derived data"""
+
+    info = link._asdict(extended=True)
+    info.update(link.canonical_outputs())
+
+    return info
+
+
+@enforce_types
+def merge_links(a: Link, b: Link) -> Link:
+    """deterministically merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones.
+    """
+    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
+
+    url = a.url if len(a.url) > len(b.url) else b.url
+
+    possible_titles = [
+        title
+        for title in (a.title, b.title)
+        if title and title.strip() and '://' not in title
+    ]
+    title = None
+    if len(possible_titles) == 2:
+        title = max(possible_titles, key=lambda t: len(t))
+    elif len(possible_titles) == 1:
+        title = possible_titles[0]
+
+    timestamp = (
+        a.timestamp
+        if float(a.timestamp or 0) < float(b.timestamp or 0) else
+        b.timestamp
+    )
+
+    tags_set = (
+        set(tag.strip() for tag in (a.tags or '').split(','))
+        | set(tag.strip() for tag in (b.tags or '').split(','))
+    )
+    tags = ','.join(tags_set) or None
+
+    sources = list(set(a.sources + b.sources))
+
+    all_methods = set(list(a.history.keys()) + list(b.history.keys()))
+    history = {
+        method: (a.history.get(method) or []) + (b.history.get(method) or [])
+        for method in all_methods
+    }
+
+    return Link(
+        url=url,
+        timestamp=timestamp,
+        title=title,
+        tags=tags,
+        sources=sources,
+        history=history,
+    )
+
+def validate_links(links: Iterable[Link]) -> Iterable[Link]:
+    links = archivable_links(links)    # remove chrome://, about:, mailto: etc.
+    links = sorted_links(links)        # deterministically sort the links based on timestamp, url
+    links = uniquefied_links(links)    # merge/dedupe duplicate timestamps & urls
+
+    if not links:
+        print('[X] No links found :(')
+        raise SystemExit(1)
+
+    return links
+
+def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
+    """remove chrome://, about:// or other schemed links that can't be archived"""
+    for link in links:
+        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
+        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
+        if scheme_is_valid and not_blacklisted:
+            yield link
+
+
+def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
+    """
+    ensures that all non-duplicate links have monotonically increasing timestamps
+    """
+
+    unique_urls: OrderedDict[str, Link] = OrderedDict()
+
+    for link in sorted_links:
+        fuzzy = fuzzy_url(link.url)
+        if fuzzy in unique_urls:
+            # merge with any other links that share the same url
+            link = merge_links(unique_urls[fuzzy], link)
+        unique_urls[fuzzy] = link
+
+    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
+    for link in unique_urls.values():
+        new_link = link.overwrite(
+            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
+        )
+        unique_timestamps[new_link.timestamp] = new_link
+
+    return unique_timestamps.values()
+
+
+def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
+    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
+    return sorted(links, key=sort_func, reverse=True)
+
+
+def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
+    if not resume:
+        yield from links
+        return
+
+    for link in links:
+        try:
+            if float(link.timestamp) <= resume:
+                yield link
+        except (ValueError, TypeError):
+            print('Resume value and all timestamp values must be valid numbers.')
+
+
+def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
+    """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
+
+    timestamp = timestamp.split('.')[0]
+    nonce = 0
+
+    # first try 152323423 before 152323423.0
+    if timestamp not in used_timestamps:
+        return timestamp
+
+    new_timestamp = '{}.{}'.format(timestamp, nonce)
+    while new_timestamp in used_timestamps:
+        nonce += 1
+        new_timestamp = '{}.{}'.format(timestamp, nonce)
+
+    return new_timestamp
+
+
 ### Homepage index for all the links
diff --git a/archivebox/legacy/links.py b/archivebox/legacy/links.py
deleted file mode 100644
index 914c3575..00000000
--- a/archivebox/legacy/links.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from typing import Iterable
-from collections import OrderedDict
-
-from .schema import Link
-from .util import (
-    scheme,
-    fuzzy_url,
-    merge_links,
-)
-
-from .config import URL_BLACKLIST_PTN
-
-
-def validate_links(links: Iterable[Link]) -> Iterable[Link]:
-    links = archivable_links(links)    # remove chrome://, about:, mailto: etc.
-    links = sorted_links(links)        # deterministically sort the links based on timstamp, url
-    links = uniquefied_links(links)    # merge/dedupe duplicate timestamps & urls
-
-    if not links:
-        print('[X] No links found :(')
-        raise SystemExit(1)
-
-    return links
-
-def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
-    """remove chrome://, about:// or other schemed links that cant be archived"""
-    for link in links:
-        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
-        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
-        if scheme_is_valid and not_blacklisted:
-            yield link
-
-
-def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
-    """
-    ensures that all non-duplicate links have monotonically increasing timestamps
-    """
-
-    unique_urls: OrderedDict[str, Link] = OrderedDict()
-
-    for link in sorted_links:
-        fuzzy = fuzzy_url(link.url)
-        if fuzzy in unique_urls:
-            # merge with any other links that share the same url
-            link = merge_links(unique_urls[fuzzy], link)
-        unique_urls[fuzzy] = link
-
-    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    for link in unique_urls.values():
-        new_link = link.overwrite(
-            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
-        )
-        unique_timestamps[new_link.timestamp] = new_link
-
-    return unique_timestamps.values()
-
-
-def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
-    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
-    return sorted(links, key=sort_func, reverse=True)
-
-
-def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
-    if not resume:
-        yield from links
-        return
-
-    for link in links:
-        try:
-            if float(link.timestamp) <= resume:
-                yield link
-        except (ValueError, TypeError):
-            print('Resume value and all timestamp values must be valid numbers.')
-
-
-def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
-    """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
-
-    timestamp = timestamp.split('.')[0]
-    nonce = 0
-
-    # first try 152323423 before 152323423.0
-    if timestamp not in used_timestamps:
-        return timestamp
-
-    new_timestamp = '{}.{}'.format(timestamp, nonce)
-    while new_timestamp in used_timestamps:
-        nonce += 1
-        new_timestamp = '{}.{}'.format(timestamp, nonce)
-
-    return new_timestamp
-
-
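
The link-merging helpers that moved into index.py above dedupe by fuzzy URL and then keep
index keys unique via lowest_uniq_timestamp. Its collision behavior, sketched (hypothetical
session; the dict values would normally be Link objects):

    >>> from collections import OrderedDict
    >>> from archivebox.legacy.index import lowest_uniq_timestamp
    >>> used = OrderedDict()
    >>> lowest_uniq_timestamp(used, '152323423')        # bare stamp is still free
    '152323423'
    >>> used['152323423'] = None
    >>> lowest_uniq_timestamp(used, '152323423.456')    # integer part collides
    '152323423.0'
    >>> used['152323423.0'] = None
    >>> lowest_uniq_timestamp(used, '152323423')
    '152323423.1'
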
diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py
new file mode 100644
index 00000000..12680f5b
--- /dev/null
+++ b/archivebox/legacy/main.py
@@ -0,0 +1,80 @@
+import re
+import json
+
+from typing import List, Optional, Iterable
+
+from .schema import Link
+from .util import enforce_types, ExtendedEncoder
+from .index import (
+    links_after_timestamp,
+    load_links_index,
+    write_links_index,
+)
+from .archive_methods import archive_link
+from .config import (
+    ONLY_NEW,
+    OUTPUT_DIR,
+)
+from .logs import (
+    log_archiving_started,
+    log_archiving_paused,
+    log_archiving_finished,
+)
+
+
+@enforce_types
+def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
+    """The main ArchiveBox entrypoint. Everything starts here."""
+
+    # Step 1: Load list of links from the existing index
+    #         merge in and dedupe new links from import_path
+    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
+
+    # Step 2: Write updated index with deduped old and new links back to disk
+    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
+
+    # Step 3: Run the archive methods for each link
+    links = new_links if (only_new or ONLY_NEW) else all_links
+    log_archiving_started(len(links), resume)
+    idx: int = 0
+    link: Optional[Link] = None
+    try:
+        for idx, link in enumerate(links_after_timestamp(links, resume)):
+            archive_link(link, link_dir=link.link_dir)
+
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
+        raise SystemExit(0)
+
+    except:
+        print()
+        raise
+
+    log_archiving_finished(len(links))
+
+    # Step 4: Re-write links index with updated titles, icons, and resources
+    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
+    return all_links
+
+
+@enforce_types
+def list_archive_data(filter_regex: Optional[str]=None, after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
+
+    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+
+    pattern = re.compile(filter_regex, re.IGNORECASE) if filter_regex else None
+
+    for link in all_links:
+        if pattern and not pattern.match(link.url):
+            continue
+        if after is not None and float(link.timestamp) < after:
+            continue
+        if before is not None and float(link.timestamp) > before:
+            continue
+
+        yield link
+
+
+def csv_format(link: Link, csv_cols: str) -> str:
+    return ','.join(json.dumps(getattr(link, col), cls=ExtendedEncoder) for col in csv_cols.split(','))
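
update_archive_data is now the single archiving entrypoint shared by archivebox add and
archivebox update; the two commands differ only in the arguments they pass. Roughly (the
import path below is a made-up example; handle_stdin_import/handle_file_import produce the
real one):

    from archivebox.legacy.main import update_archive_data

    # archivebox add <url>: ingest a freshly saved sources file, then archive it
    update_archive_data(import_path='output/sources/example.com-1554264000.txt',
                        resume=None, only_new=False)

    # archivebox update --resume=15109948213.123: no import, continue where we left off
    update_archive_data(import_path=None, resume=15109948213.123, only_new=False)
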
- """ - assert a.base_url == b.base_url, 'Cannot merge two links with different URLs' - - url = a.url if len(a.url) > len(b.url) else b.url - - possible_titles = [ - title - for title in (a.title, b.title) - if title and title.strip() and '://' not in title - ] - title = None - if len(possible_titles) == 2: - title = max(possible_titles, key=lambda t: len(t)) - elif len(possible_titles) == 1: - title = possible_titles[0] - - timestamp = ( - a.timestamp - if float(a.timestamp or 0) < float(b.timestamp or 0) else - b.timestamp - ) - - tags_set = ( - set(tag.strip() for tag in (a.tags or '').split(',')) - | set(tag.strip() for tag in (b.tags or '').split(',')) - ) - tags = ','.join(tags_set) or None - - sources = list(set(a.sources + b.sources)) - - all_methods = set(list(a.history.keys()) + list(a.history.keys())) - history = { - method: (a.history.get(method) or []) + (b.history.get(method) or []) - for method in all_methods - } - - return Link( - url=url, - timestamp=timestamp, - title=title, - tags=tags, - sources=sources, - history=history, - ) - - @enforce_types def is_static_file(url: str) -> bool: """Certain URLs just point to a single static file, and @@ -467,16 +414,6 @@ def is_static_file(url: str) -> bool: return extension(url) in STATICFILE_EXTENSIONS -@enforce_types -def derived_link_info(link: Link) -> dict: - """extend link info with the archive urls and other derived data""" - - info = link._asdict(extended=True) - info.update(link.canonical_outputs()) - - return info - - ### Python / System Helpers @@ -696,3 +633,22 @@ def atomic_write(contents: Union[dict, str], path: str) -> None: finally: if os.path.exists(tmp_file): os.remove(tmp_file) + + +def reject_stdin(caller: str) -> None: + """Tell the user they passed stdin to a command that doesn't accept it""" + + if not sys.stdin.isatty(): + stdin_raw_text = sys.stdin.read().strip() + if stdin_raw_text: + print( + '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format( + caller, + **ANSI, + ) + ) + print(' Run archivebox "{} --help" to see usage and examples.'.format( + caller, + )) + print() + raise SystemExit(1) diff --git a/bin/archivebox b/bin/archivebox index 601d4c25..02c45790 100755 --- a/bin/archivebox +++ b/bin/archivebox @@ -8,8 +8,8 @@ BIN_DIR = os.path.dirname(os.path.abspath(__file__)) REPO_DIR = os.path.abspath(os.path.join(BIN_DIR, os.pardir)) sys.path.append(REPO_DIR) -from archivebox.__main__ import main +from archivebox.cli.archivebox import main if __name__ == '__main__': - main(sys.argv) + main()