From 51ae634ec98b7dc8ee57ae6f022a87924fb9d912 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 3 Apr 2019 00:27:37 -0400
Subject: [PATCH] working argparse based CLI with most commands implemented

---
 archivebox/__init__.py               |   4 +
 archivebox/__main__.py               |   8 +-
 archivebox/cli/__init__.py           |  27 +++
 archivebox/cli/archivebox.py         |  71 ++++++++
 archivebox/cli/archivebox_add.py     |  84 +++++++++
 archivebox/cli/archivebox_help.py    |  54 ++++++
 archivebox/cli/archivebox_init.py    |  72 ++++++++
 archivebox/cli/archivebox_list.py    |  81 +++++++++
 archivebox/cli/archivebox_update.py  |  45 +++++
 archivebox/cli/archivebox_version.py | 103 ++++++++++++
 archivebox/core/settings.py          |  15 +-
 archivebox/env.py                    |   4 +
 archivebox/legacy/__init__.py        |   5 -
 archivebox/legacy/archive.py         | 243 ---------------------------
 archivebox/legacy/index.py           | 150 ++++++++++++++++-
 archivebox/legacy/links.py           |  93 ----------
 archivebox/legacy/main.py            |  80 +++++++++
 archivebox/legacy/purge.py           |   6 +-
 archivebox/legacy/util.py            |  82 +++------
 bin/archivebox                       |   4 +-
 20 files changed, 807 insertions(+), 424 deletions(-)
 create mode 100644 archivebox/cli/__init__.py
 create mode 100755 archivebox/cli/archivebox.py
 create mode 100644 archivebox/cli/archivebox_add.py
 create mode 100755 archivebox/cli/archivebox_help.py
 create mode 100755 archivebox/cli/archivebox_init.py
 create mode 100644 archivebox/cli/archivebox_list.py
 create mode 100644 archivebox/cli/archivebox_update.py
 create mode 100755 archivebox/cli/archivebox_version.py
 delete mode 100755 archivebox/legacy/archive.py
 delete mode 100644 archivebox/legacy/links.py
 create mode 100644 archivebox/legacy/main.py

diff --git a/archivebox/__init__.py b/archivebox/__init__.py
index e69de29b..26fcd715 100644
--- a/archivebox/__init__.py
+++ b/archivebox/__init__.py
@@ -0,0 +1,4 @@
+
+__AUTHOR__ = 'Nick Sweeting '
+__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
+__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
diff --git a/archivebox/__main__.py b/archivebox/__main__.py
index 8e75ec40..1439b07f 100755
--- a/archivebox/__main__.py
+++ b/archivebox/__main__.py
@@ -1,19 +1,15 @@
 #!/usr/bin/env python3
-"""
-Main ArchiveBox command line application entrypoint.
-"""
-
 __package__ = 'archivebox'
+
 import os
 import sys
 
 PYTHON_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(PYTHON_DIR)
 
-from .env import *
-from .legacy.archive import main
+from .cli.archivebox import main
 
 
 if __name__ == '__main__':
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
new file mode 100644
index 00000000..ea1fcda5
--- /dev/null
+++ b/archivebox/cli/__init__.py
@@ -0,0 +1,27 @@
+__package__ = 'archivebox.cli'
+
+import os
+from importlib import import_module
+
+CLI_DIR = os.path.dirname(os.path.abspath(__file__))
+
+required_attrs = ('__package__', '__command__', '__description__', 'main')
+
+
+def list_subcommands():
+    COMMANDS = {}
+    for filename in os.listdir(CLI_DIR):
+        if filename.startswith('archivebox_') and filename.endswith('.py'):
+            subcommand = filename.replace('archivebox_', '').replace('.py', '')
+            module = import_module('.archivebox_{}'.format(subcommand), __package__)
+
+            assert all(hasattr(module, attr) for attr in required_attrs)
+            assert module.__command__.split(' ')[-1] == subcommand
+            COMMANDS[subcommand] = module.__description__
+
+    return COMMANDS
+
+
+def run_subcommand(subcommand: str, args=None):
+    module = import_module('.archivebox_{}'.format(subcommand), __package__)
+    return module.main(args)    # type: ignore
diff --git a/archivebox/cli/archivebox.py b/archivebox/cli/archivebox.py
new file mode 100755
index 00000000..31cd8b5c
--- /dev/null
+++ b/archivebox/cli/archivebox.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# archivebox [command]
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox'
+__description__ = 'ArchiveBox: The self-hosted internet archive.'
+
+import sys
+import argparse
+
+from . import list_subcommands, run_subcommand
+
+
+def parse_args(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    subcommands = list_subcommands()
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=False,
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '--help', '-h',
+        action='store_true',
+        help=subcommands['help'],
+    )
+    group.add_argument(
+        '--version',
+        action='store_true',
+        help=subcommands['version'],
+    )
+    group.add_argument(
+        "subcommand",
+        type=str,
+        help="The name of the subcommand to run",
+        nargs='?',
+        choices=subcommands.keys(),
+        default=None,
+    )
+    parser.add_argument(
+        "args",
+        help="Arguments for the subcommand",
+        nargs=argparse.REMAINDER,
+    )
+
+    command = parser.parse_args(args)
+
+    if command.help:
+        command.subcommand = 'help'
+    if command.version:
+        command.subcommand = 'version'
+
+    # print('--------------------------------------------')
+    # print('Command:     ', sys.argv[0])
+    # print('Subcommand:  ', command.subcommand)
+    # print('Args to pass:', args[1:])
+    # print('--------------------------------------------')
+
+    return command.subcommand, command.args
+
+
+def main(args=None):
+    subcommand, subcommand_args = parse_args(args)
+    run_subcommand(subcommand, subcommand_args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
new file mode 100644
index 00000000..934907a2
--- /dev/null
+++ b/archivebox/cli/archivebox_add.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox add'
+__description__ = 'Add a new URL or list of URLs to your archive'
+
+import os
+import sys
+import argparse
+
+from ..legacy.util import (
+    handle_stdin_import,
+    handle_file_import,
+)
+from ..legacy.main import update_archive_data
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    # parser.add_argument(
+    #     '--depth', #'-d',
+    #     type=int,
+    #     help='Recursively archive all linked pages up to this many hops away',
+    #     default=0,
+    # )
+    parser.add_argument(
+        '--only-new', #'-n',
+        action='store_true',
+        help="Don't attempt to retry previously skipped/failed links when updating",
+    )
+    parser.add_argument(
+        '--mirror', #'-m',
+        action='store_true',
+        help='Archive an entire site (finding all linked pages below it on the same domain)',
+    )
+    parser.add_argument(
+        '--crawler', #'-r',
+        choices=('depth_first', 'breadth_first'),
+        help='Controls which crawler to use in order to find outlinks in a given page',
+        default=None,
+    )
+    parser.add_argument(
+        'url',
+        nargs='?',
+        type=str,
+        default=None,
+        help='URL of page to archive (or path to local file)'
+    )
+    command = parser.parse_args(args)
+
+    ### Handle ingesting urls piped in through stdin
+    # (e.g. if user does: cat example_urls.txt | ./archive)
+    import_path = None
+    if not sys.stdin.isatty():
+        stdin_raw_text = sys.stdin.read()
+        if stdin_raw_text and command.url:
+            print(
+                '[X] You should pass either a path as an argument, '
+                'or pass a list of links via stdin, but not both.\n'
+            )
+            raise SystemExit(1)
+
+        import_path = handle_stdin_import(stdin_raw_text)
+
+    ### Handle ingesting url from a remote file/feed
+    # (e.g. if an RSS feed URL is used as the import path)
+    elif command.url:
+        import_path = handle_file_import(command.url)
+
+
+    update_archive_data(
+        import_path=import_path,
+        resume=None,
+        only_new=command.only_new,
+    )
+
+if __name__ == '__main__':
+    main()
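
The add command accepts exactly one source per run: either a URL/path argument or a list of
links piped via stdin, never both. Usage, in the same style as the help text below (the URLs
and filename are illustrative):

    echo 'https://example.com/some/page' | archivebox add
    archivebox add https://example.com/some/other/page
    archivebox add ~/Downloads/bookmarks_export.html

Passing an argument and piping stdin at the same time exits with an error, since it would be
ambiguous which source was intended.
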
diff --git a/archivebox/cli/archivebox_help.py b/archivebox/cli/archivebox_help.py
new file mode 100755
index 00000000..7e4f9d87
--- /dev/null
+++ b/archivebox/cli/archivebox_help.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox help'
+__description__ = 'Print the ArchiveBox help message and usage'
+
+import sys
+import argparse
+
+from ..legacy.util import reject_stdin
+from . import list_subcommands
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.parse_args(args)
+    reject_stdin(__command__)
+
+
+    COMMANDS_HELP_TEXT = '\n    '.join(
+        f'{cmd.ljust(20)} {summary}'
+        for cmd, summary in list_subcommands().items()
+    )
+
+    print(f'''ArchiveBox: The self-hosted internet archive.
+Usage:
+    archivebox [command] [--help] [--version] [...args]
+
+Commands:
+    {COMMANDS_HELP_TEXT}
+
+Example Use:
+    mkdir my-archive; cd my-archive/
+    archivebox init
+
+    echo 'https://example.com/some/page' | archivebox add
+    archivebox add https://example.com/some/other/page
+    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
+    archivebox add --depth=1 https://example.com/feed.rss
+    archivebox update --resume=15109948213.123
+
+Documentation:
+    https://github.com/pirate/ArchiveBox/wiki
+''')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py
new file mode 100755
index 00000000..ddfbd4a1
--- /dev/null
+++ b/archivebox/cli/archivebox_init.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox init'
+__description__ = 'Initialize a new ArchiveBox collection in the current directory'
+
+import os
+import sys
+import argparse
+
+from ..legacy.util import reject_stdin
+from ..legacy.config import (
+    OUTPUT_DIR,
+    SOURCES_DIR,
+    ARCHIVE_DIR,
+    DATABASE_DIR,
+    ANSI,
+)
+
+
+def init(output_dir: str=OUTPUT_DIR):
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
+    is_empty = not len(set(os.listdir(output_dir)) - harmless_files)
+    existing_index = os.path.exists(os.path.join(output_dir, 'index.json'))
+
+    if not is_empty:
+        if existing_index:
+            print('You already have an archive in this folder!')
+            # TODO: import old archivebox version's archive data folder
+
+            raise SystemExit(1)
+        else:
+            print(
+                ("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
+                 "\n\n"
+                 "    {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
+                 "    just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
+                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
+                ).format(output_dir, **ANSI)
+            )
+            raise SystemExit(1)
+
+
+    print('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI))
+    os.makedirs(SOURCES_DIR)
+    print(f'    > {SOURCES_DIR}')
+    os.makedirs(ARCHIVE_DIR)
+    print(f'    > {ARCHIVE_DIR}')
+    os.makedirs(DATABASE_DIR)
+    print(f'    > {DATABASE_DIR}')
+    print('{green}[√] Done.{reset}'.format(**ANSI))
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.parse_args(args)
+    reject_stdin(__command__)
+
+    init()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py
new file mode 100644
index 00000000..75699d3a
--- /dev/null
+++ b/archivebox/cli/archivebox_list.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox list'
+__description__ = 'List all the URLs currently in the archive.'
+
+import sys
+import json
+import argparse
+
+
+from ..legacy.util import reject_stdin, ExtendedEncoder
+from ..legacy.main import list_archive_data, csv_format
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '--csv', #'-c',
+        type=str,
+        help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
+        default=None,
+    )
+    group.add_argument(
+        '--json', #'-j',
+        action='store_true',
+        help="Print the output in JSON format with all columns included.",
+    )
+    parser.add_argument(
+        '--filter', #'-f',
+        type=str,
+        help="List only URLs matching the given regex pattern.",
+        default=None,
+    )
+    parser.add_argument(
+        '--sort', #'-s',
+        type=str,
+        help="List the links sorted using the given key, e.g. timestamp or updated",
+        default=None,
+    )
+    parser.add_argument(
+        '--before', #'-b',
+        type=float,
+        help="List only URLs bookmarked before the given timestamp.",
+        default=None,
+    )
+    parser.add_argument(
+        '--after', #'-a',
+        type=float,
+        help="List only URLs bookmarked after the given timestamp.",
+        default=None,
+    )
+    command = parser.parse_args(args)
+    reject_stdin(__command__)
+
+    links = list_archive_data(
+        filter_regex=command.filter,
+        before=command.before,
+        after=command.after,
+    )
+    if command.sort:
+        links = sorted(links, key=lambda link: getattr(link, command.sort))
+
+    if command.csv:
+        print(command.csv)
+        print('\n'.join(csv_format(link, command.csv) for link in links))
+    elif command.json:
+        print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
+    else:
+        print('\n'.join(link.url for link in links))
+
+
+if __name__ == '__main__':
+    main()
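
The list command composes list_archive_data (defined in legacy/main.py further down) with
plain, JSON, or CSV output. A sketch of the CSV path (hypothetical session; the timestamp
and URL are invented for illustration):

    >>> from archivebox.legacy.main import list_archive_data, csv_format
    >>> for link in list_archive_data(filter_regex=r'.*example\.com.*'):
    ...     print(csv_format(link, 'timestamp,url'))
    "15109948214.123","https://example.com/some/page"

csv_format JSON-encodes each column value, so strings come out quoted and embedded commas
stay unambiguous.
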
diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
new file mode 100644
index 00000000..c74fc8b7
--- /dev/null
+++ b/archivebox/cli/archivebox_update.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox update'
+__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
+
+import sys
+import argparse
+
+
+from ..legacy.util import reject_stdin
+from ..legacy.main import update_archive_data
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.add_argument(
+        '--only-new', #'-n',
+        action='store_true',
+        help="Don't attempt to retry previously skipped/failed links when updating",
+    )
+    parser.add_argument(
+        '--resume', #'-r',
+        type=float,
+        help='Resume the update process from a given timestamp',
+        default=None,
+    )
+    command = parser.parse_args(args)
+    reject_stdin(__command__)
+
+    update_archive_data(
+        import_path=None,
+        resume=command.resume,
+        only_new=command.only_new,
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py
new file mode 100755
index 00000000..d5eb7954
--- /dev/null
+++ b/archivebox/cli/archivebox_version.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox version'
+__description__ = 'Print the ArchiveBox version and dependency information'
+
+import sys
+import shutil
+import argparse
+
+from ..legacy.util import reject_stdin
+from ..legacy.config import (
+    VERSION,
+
+    REPO_DIR,
+    PYTHON_DIR,
+    LEGACY_DIR,
+    TEMPLATES_DIR,
+    OUTPUT_DIR,
+    SOURCES_DIR,
+    ARCHIVE_DIR,
+    DATABASE_DIR,
+
+    USE_CURL,
+    USE_WGET,
+    USE_CHROME,
+    FETCH_GIT,
+    FETCH_MEDIA,
+
+    DJANGO_BINARY,
+    CURL_BINARY,
+    GIT_BINARY,
+    WGET_BINARY,
+    YOUTUBEDL_BINARY,
+    CHROME_BINARY,
+
+    DJANGO_VERSION,
+    CURL_VERSION,
+    GIT_VERSION,
+    WGET_VERSION,
+    YOUTUBEDL_VERSION,
+    CHROME_VERSION,
+)
+
+
+def main(args=None):
+    args = sys.argv[1:] if args is None else args
+
+    parser = argparse.ArgumentParser(
+        prog=__command__,
+        description=__description__,
+        add_help=True,
+    )
+    parser.parse_args(args)
+    reject_stdin(__command__)
+
+    print('ArchiveBox v{}'.format(VERSION))
+    print()
+    print('[i] Folder locations:')
+    print('    REPO_DIR:      ', REPO_DIR)
+    print('    PYTHON_DIR:    ', PYTHON_DIR)
+    print('    LEGACY_DIR:    ', LEGACY_DIR)
+    print('    TEMPLATES_DIR: ', TEMPLATES_DIR)
+    print()
+    print('    OUTPUT_DIR:    ', OUTPUT_DIR)
+    print('    SOURCES_DIR:   ', SOURCES_DIR)
+    print('    ARCHIVE_DIR:   ', ARCHIVE_DIR)
+    print('    DATABASE_DIR:  ', DATABASE_DIR)
+    print()
+    print(
+        '[√] Django:'.ljust(14),
+        'python3 {} --version\n'.format(DJANGO_BINARY),
+        ' '*13, DJANGO_VERSION, '\n',
+    )
+    print(
+        '[{}] CURL:'.format('√' if USE_CURL else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(CURL_BINARY)),
+        ' '*13, CURL_VERSION, '\n',
+    )
+    print(
+        '[{}] GIT:'.format('√' if FETCH_GIT else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(GIT_BINARY)),
+        ' '*13, GIT_VERSION, '\n',
+    )
+    print(
+        '[{}] WGET:'.format('√' if USE_WGET else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(WGET_BINARY)),
+        ' '*13, WGET_VERSION, '\n',
+    )
+    print(
+        '[{}] YOUTUBEDL:'.format('√' if FETCH_MEDIA else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(YOUTUBEDL_BINARY)),
+        ' '*13, YOUTUBEDL_VERSION, '\n',
+    )
+    print(
+        '[{}] CHROME:'.format('√' if USE_CHROME else 'X').ljust(14),
+        '{} --version\n'.format(shutil.which(CHROME_BINARY)),
+        ' '*13, CHROME_VERSION, '\n',
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 0f209b4c..14ba519b 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -13,12 +13,12 @@
 DEBUG = True
 
 INSTALLED_APPS = [
-    'django.contrib.admin',
-    'django.contrib.auth',
-    'django.contrib.contenttypes',
-    'django.contrib.sessions',
-    'django.contrib.messages',
-    'django.contrib.staticfiles',
+    # 'django.contrib.admin',
+    # 'django.contrib.auth',
+    # 'django.contrib.contenttypes',
+    # 'django.contrib.sessions',
+    # 'django.contrib.messages',
+    # 'django.contrib.staticfiles',
 
     'core',
 ]
@@ -53,10 +53,11 @@
 
 WSGI_APPLICATION = 'core.wsgi.application'
 
+DATABASE_FILE = os.path.join(DATABASE_DIR, 'database.sqlite3')
 DATABASES = {
     'default': {
         'ENGINE': 'django.db.backends.sqlite3',
-        'NAME': os.path.join(DATABASE_DIR, 'database.sqlite3'),
+        'NAME': DATABASE_FILE,
     }
 }
diff --git a/archivebox/env.py b/archivebox/env.py
index 3a40fab5..905fa275 100644
--- a/archivebox/env.py
+++ b/archivebox/env.py
@@ -9,3 +9,7 @@
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")
 
 import django
 django.setup()
+
+from django.conf import settings
+
+DATABASE_FILE = settings.DATABASE_FILE
diff --git a/archivebox/legacy/__init__.py b/archivebox/legacy/__init__.py
index ab53f570..e69de29b 100644
--- a/archivebox/legacy/__init__.py
+++ b/archivebox/legacy/__init__.py
@@ -1,5 +0,0 @@
-
-
-#__name__ = 'archivebox'
-#__package__ = 'archivebox'
-
diff --git a/archivebox/legacy/archive.py b/archivebox/legacy/archive.py
deleted file mode 100755
index 82788c47..00000000
--- a/archivebox/legacy/archive.py
+++ /dev/null
@@ -1,243 +0,0 @@
-#!/usr/bin/env python3
-"""
-ArchiveBox command line application.
-
-./archive and ./bin/archivebox both point to this file,
-but you can also run it directly using `python3 archive.py`
-
-Usage & Documentation:
-    https://github.com/pirate/ArchiveBox/Wiki
-"""
-__package__ = 'legacy'
-
-import os
-import sys
-import shutil
-
-from typing import List, Optional
-
-from .schema import Link
-from .links import links_after_timestamp
-from .index import write_links_index, load_links_index
-from .archive_methods import archive_link
-from .config import (
-    ONLY_NEW,
-    VERSION,
-    ANSI,
-
-    REPO_DIR,
-    PYTHON_DIR,
-    LEGACY_DIR,
-    TEMPLATES_DIR,
-    OUTPUT_DIR,
-    SOURCES_DIR,
-    ARCHIVE_DIR,
-    DATABASE_DIR,
-
-    USE_CURL,
-    USE_WGET,
-    USE_CHROME,
-    FETCH_GIT,
-    FETCH_MEDIA,
-
-    DJANGO_BINARY,
-    CURL_BINARY,
-    GIT_BINARY,
-    WGET_BINARY,
-    YOUTUBEDL_BINARY,
-    CHROME_BINARY,
-
-    DJANGO_VERSION,
-    CURL_VERSION,
-    GIT_VERSION,
-    WGET_VERSION,
-    YOUTUBEDL_VERSION,
-    CHROME_VERSION,
-)
-from .util import (
-    enforce_types,
-    handle_stdin_import,
-    handle_file_import,
-)
-from .logs import (
-    log_archiving_started,
-    log_archiving_paused,
-    log_archiving_finished,
-)
-
-__AUTHOR__ = 'Nick Sweeting '
-__VERSION__ = VERSION
-__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
-__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
-
-
-
-def print_help():
-    print('ArchiveBox: The self-hosted internet archive.\n')
-    print("Documentation:")
-    print("    https://github.com/pirate/ArchiveBox/wiki\n")
-    print("UI Usage:")
-    print("    Open output/index.html to view your archive.\n")
-    print("CLI Usage:")
-    print("    mkdir data; cd data/")
-    print("    archivebox init\n")
-    print("    echo 'https://example.com/some/page' | archivebox add")
-    print("    archivebox add https://example.com/some/other/page")
-    print("    archivebox add --depth=1 ~/Downloads/bookmarks_export.html")
-    print("    archivebox add --depth=1 https://example.com/feed.rss")
-    print("    archivebox update --resume=15109948213.123")
-
-
-def print_version():
-    print('ArchiveBox v{}'.format(__VERSION__))
-    print()
-    print('[i] Folder locations:')
-    print('    REPO_DIR:      ', REPO_DIR)
-    print('    PYTHON_DIR:    ', PYTHON_DIR)
-    print('    LEGACY_DIR:    ', LEGACY_DIR)
-    print('    TEMPLATES_DIR: ', TEMPLATES_DIR)
-    print()
-    print('    OUTPUT_DIR:    ', OUTPUT_DIR)
-    print('    SOURCES_DIR:   ', SOURCES_DIR)
-    print('    ARCHIVE_DIR:   ', ARCHIVE_DIR)
-    print('    DATABASE_DIR:  ', DATABASE_DIR)
-    print()
-    print(
-        '[√] Django:'.ljust(14),
-        'python3 {} --version\n'.format(DJANGO_BINARY),
-        ' '*13, DJANGO_VERSION, '\n',
-    )
-    print(
-        '[{}] CURL:'.format('√' if USE_CURL else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(CURL_BINARY)),
-        ' '*13, CURL_VERSION, '\n',
-    )
-    print(
-        '[{}] GIT:'.format('√' if FETCH_GIT else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(GIT_BINARY)),
-        ' '*13, GIT_VERSION, '\n',
-    )
-    print(
-        '[{}] WGET:'.format('√' if USE_WGET else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(WGET_BINARY)),
-        ' '*13, WGET_VERSION, '\n',
-    )
-    print(
-        '[{}] YOUTUBEDL:'.format('√' if FETCH_MEDIA else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(YOUTUBEDL_BINARY)),
-        ' '*13, YOUTUBEDL_VERSION, '\n',
-    )
-    print(
-        '[{}] CHROME:'.format('√' if USE_CHROME else 'X').ljust(14),
-        '{} --version\n'.format(shutil.which(CHROME_BINARY)),
-        ' '*13, CHROME_VERSION, '\n',
-    )
-
-
-def main(args=None) -> None:
-    if args is None:
-        args = sys.argv
-
-    if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
-        print_help()
-        raise SystemExit(0)
-
-    if set(args).intersection(('--version', 'version')):
-        print_version()
-        raise SystemExit(0)
-
-    ### Handle CLI arguments
-    #     ./archive bookmarks.html
-    #     ./archive 1523422111.234
-    import_path, resume = None, None
-    if len(args) == 2:
-        # if the argument is a string, it's a import_path file to import
-        # if it's a number, it's a timestamp to resume archiving from
-        if args[1].replace('.', '').isdigit():
-            import_path, resume = None, args[1]
-        else:
-            import_path, resume = args[1], None
-
-    ### Set up output folder
-    if not os.path.exists(OUTPUT_DIR):
-        print('{green}[+] Created a new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
-        os.makedirs(OUTPUT_DIR)
-        os.makedirs(SOURCES_DIR)
-        os.makedirs(ARCHIVE_DIR)
-        os.makedirs(DATABASE_DIR)
-    else:
-        not_empty = len(set(os.listdir(OUTPUT_DIR)) - {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'})
-        index_exists = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
-        if not_empty and not index_exists:
-            print(
-                ("{red}[X] Could not find index.json in the OUTPUT_DIR: {reset}{}\n\n"
-                 "    If you're trying to update an existing archive, you must set OUTPUT_DIR to or run archivebox from inside the archive folder you're trying to update.\n"
-                 "    If you're trying to create a new archive, you must run archivebox inside a completely empty directory."
-                 "\n\n"
-                 "    {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
-                 "    just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
-                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
-                ).format(OUTPUT_DIR, **ANSI)
-            )
-            raise SystemExit(1)
-
-    ### Handle ingesting urls piped in through stdin
-    # (.e.g if user does cat example_urls.txt | ./archive)
-    if not sys.stdin.isatty():
-        stdin_raw_text = sys.stdin.read()
-        if stdin_raw_text and import_path:
-            print(
-                '[X] You should pass either a path as an argument, '
-                'or pass a list of links via stdin, but not both.\n'
-            )
-            print_help()
-            raise SystemExit(1)
-
-        import_path = handle_stdin_import(stdin_raw_text)
-
-    ### Handle ingesting url from a remote file/feed
-    # (e.g. if an RSS feed URL is used as the import path)
-    if import_path:
-        import_path = handle_file_import(import_path)
-
-    ### Run the main archive update process
-    update_archive_data(import_path=import_path, resume=resume)
-
-
-@enforce_types
-def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None) -> List[Link]:
-    """The main ArchiveBox entrancepoint. Everything starts here."""
-
-    # Step 1: Load list of links from the existing index
-    #         merge in and dedupe new links from import_path
-    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
-
-    # Step 2: Write updated index with deduped old and new links back to disk
-    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
-
-    # Step 3: Run the archive methods for each link
-    links = new_links if ONLY_NEW else all_links
-    log_archiving_started(len(links), resume)
-    idx: int = 0
-    link: Optional[Link] = None
-    try:
-        for idx, link in enumerate(links_after_timestamp(links, resume)):
-            archive_link(link, link_dir=link.link_dir)
-
-    except KeyboardInterrupt:
-        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
-        raise SystemExit(0)
-
-    except:
-        print()
-        raise
-
-    log_archiving_finished(len(links))
-
-    # Step 4: Re-write links index with updated titles, icons, and resources
-    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
-    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
-    return all_links
-
-
-if __name__ == '__main__':
-    main(sys.argv)
diff --git a/archivebox/legacy/index.py b/archivebox/legacy/index.py
index 98d9e3df..a28192b2 100644
--- a/archivebox/legacy/index.py
+++ b/archivebox/legacy/index.py
@@ -3,7 +3,8 @@
 import json
 from datetime import datetime
 from string import Template
-from typing import List, Tuple, Iterator, Optional, Mapping
+from typing import List, Tuple, Iterator, Optional, Mapping, Iterable
+from collections import OrderedDict
 
 from .schema import Link, ArchiveResult
 from .config import (
@@ -13,14 +14,15 @@
     GIT_SHA,
     FOOTER_INFO,
     TIMEOUT,
+    URL_BLACKLIST_PTN,
 )
 from .util import (
+    scheme,
+    fuzzy_url,
     ts_to_date,
-    merge_links,
     urlencode,
     htmlencode,
     urldecode,
-    derived_link_info,
     wget_output_path,
     enforce_types,
     TimedProgress,
@@ -28,7 +30,6 @@
     atomic_write,
 )
 from .parse import parse_links
-from .links import validate_links
 from .logs import (
     log_indexing_process_started,
     log_indexing_started,
@@ -41,6 +42,147 @@
 
 TITLE_LOADING_MSG = 'Not yet archived...'
 
+
+### Link filtering and checking
+
+@enforce_types
+def derived_link_info(link: Link) -> dict:
+    """extend link info with the archive urls and other derived data"""
+
+    info = link._asdict(extended=True)
+    info.update(link.canonical_outputs())
+
+    return info
+
+
+@enforce_types
+def merge_links(a: Link, b: Link) -> Link:
+    """deterministically merge two links, favoring longer field values over shorter,
+    and "cleaner" values over worse ones.
+    """
+    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
+
+    url = a.url if len(a.url) > len(b.url) else b.url
+
+    possible_titles = [
+        title
+        for title in (a.title, b.title)
+        if title and title.strip() and '://' not in title
+    ]
+    title = None
+    if len(possible_titles) == 2:
+        title = max(possible_titles, key=lambda t: len(t))
+    elif len(possible_titles) == 1:
+        title = possible_titles[0]
+
+    timestamp = (
+        a.timestamp
+        if float(a.timestamp or 0) < float(b.timestamp or 0) else
+        b.timestamp
+    )
+
+    tags_set = (
+        set(tag.strip() for tag in (a.tags or '').split(','))
+        | set(tag.strip() for tag in (b.tags or '').split(','))
+    )
+    tags = ','.join(tags_set) or None
+
+    sources = list(set(a.sources + b.sources))
+
+    all_methods = set(list(a.history.keys()) + list(b.history.keys()))
+    history = {
+        method: (a.history.get(method) or []) + (b.history.get(method) or [])
+        for method in all_methods
+    }
+
+    return Link(
+        url=url,
+        timestamp=timestamp,
+        title=title,
+        tags=tags,
+        sources=sources,
+        history=history,
+    )
+
+def validate_links(links: Iterable[Link]) -> Iterable[Link]:
+    links = archivable_links(links)    # remove chrome://, about:, mailto: etc.
+    links = sorted_links(links)        # deterministically sort the links based on timestamp, url
+    links = uniquefied_links(links)    # merge/dedupe duplicate timestamps & urls
+
+    if not links:
+        print('[X] No links found :(')
+        raise SystemExit(1)
+
+    return links
+
+def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
+    """remove chrome://, about:// or other schemed links that can't be archived"""
+    for link in links:
+        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
+        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
+        if scheme_is_valid and not_blacklisted:
+            yield link
+
+
+def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
+    """
+    ensures that all non-duplicate links have monotonically increasing timestamps
+    """
+
+    unique_urls: OrderedDict[str, Link] = OrderedDict()
+
+    for link in sorted_links:
+        fuzzy = fuzzy_url(link.url)
+        if fuzzy in unique_urls:
+            # merge with any other links that share the same url
+            link = merge_links(unique_urls[fuzzy], link)
+        unique_urls[fuzzy] = link
+
+    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
+    for link in unique_urls.values():
+        new_link = link.overwrite(
+            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
+        )
+        unique_timestamps[new_link.timestamp] = new_link
+
+    return unique_timestamps.values()
+
+
+def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
+    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
+    return sorted(links, key=sort_func, reverse=True)
+
+
+def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
+    if not resume:
+        yield from links
+        return
+
+    for link in links:
+        try:
+            if float(link.timestamp) <= resume:
+                yield link
+        except (ValueError, TypeError):
+            print('Resume value and all timestamp values must be valid numbers.')
+
+
+def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
+    """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
+
+    timestamp = timestamp.split('.')[0]
+    nonce = 0
+
+    # first try 152323423 before 152323423.0
+    if timestamp not in used_timestamps:
+        return timestamp
+
+    new_timestamp = '{}.{}'.format(timestamp, nonce)
+    while new_timestamp in used_timestamps:
+        nonce += 1
+        new_timestamp = '{}.{}'.format(timestamp, nonce)
+
+    return new_timestamp
+
+
 ### Homepage index for all the links
diff --git a/archivebox/legacy/links.py b/archivebox/legacy/links.py
deleted file mode 100644
index 914c3575..00000000
--- a/archivebox/legacy/links.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from typing import Iterable
-from collections import OrderedDict
-
-from .schema import Link
-from .util import (
-    scheme,
-    fuzzy_url,
-    merge_links,
-)
-
-from .config import URL_BLACKLIST_PTN
-
-
-def validate_links(links: Iterable[Link]) -> Iterable[Link]:
-    links = archivable_links(links)    # remove chrome://, about:, mailto: etc.
-    links = sorted_links(links)        # deterministically sort the links based on timstamp, url
-    links = uniquefied_links(links)    # merge/dedupe duplicate timestamps & urls
-
-    if not links:
-        print('[X] No links found :(')
-        raise SystemExit(1)
-
-    return links
-
-def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
-    """remove chrome://, about:// or other schemed links that cant be archived"""
-    for link in links:
-        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
-        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
-        if scheme_is_valid and not_blacklisted:
-            yield link
-
-
-def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
-    """
-    ensures that all non-duplicate links have monotonically increasing timestamps
-    """
-
-    unique_urls: OrderedDict[str, Link] = OrderedDict()
-
-    for link in sorted_links:
-        fuzzy = fuzzy_url(link.url)
-        if fuzzy in unique_urls:
-            # merge with any other links that share the same url
-            link = merge_links(unique_urls[fuzzy], link)
-        unique_urls[fuzzy] = link
-
-    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    for link in unique_urls.values():
-        new_link = link.overwrite(
-            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
-        )
-        unique_timestamps[new_link.timestamp] = new_link
-
-    return unique_timestamps.values()
-
-
-def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
-    sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
-    return sorted(links, key=sort_func, reverse=True)
-
-
-def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
-    if not resume:
-        yield from links
-        return
-
-    for link in links:
-        try:
-            if float(link.timestamp) <= resume:
-                yield link
-        except (ValueError, TypeError):
-            print('Resume value and all timestamp values must be valid numbers.')
-
-
-def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
-    """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
-
-    timestamp = timestamp.split('.')[0]
-    nonce = 0
-
-    # first try 152323423 before 152323423.0
-    if timestamp not in used_timestamps:
-        return timestamp
-
-    new_timestamp = '{}.{}'.format(timestamp, nonce)
-    while new_timestamp in used_timestamps:
-        nonce += 1
-        new_timestamp = '{}.{}'.format(timestamp, nonce)
-
-    return new_timestamp
-
-
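
The link-merging helpers that moved into index.py above dedupe by fuzzy URL and then keep
index keys unique via lowest_uniq_timestamp. Its collision behavior, sketched (hypothetical
session; the dict values would normally be Link objects):

    >>> from collections import OrderedDict
    >>> from archivebox.legacy.index import lowest_uniq_timestamp
    >>> used = OrderedDict()
    >>> lowest_uniq_timestamp(used, '152323423')        # bare stamp is still free
    '152323423'
    >>> used['152323423'] = None
    >>> lowest_uniq_timestamp(used, '152323423.456')    # integer part collides
    '152323423.0'
    >>> used['152323423.0'] = None
    >>> lowest_uniq_timestamp(used, '152323423')
    '152323423.1'
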
diff --git a/archivebox/legacy/main.py b/archivebox/legacy/main.py
new file mode 100644
index 00000000..12680f5b
--- /dev/null
+++ b/archivebox/legacy/main.py
@@ -0,0 +1,80 @@
+import re
+import json
+
+from typing import List, Optional, Iterable
+
+from .schema import Link
+from .util import enforce_types, ExtendedEncoder
+from .index import (
+    links_after_timestamp,
+    load_links_index,
+    write_links_index,
+)
+from .archive_methods import archive_link
+from .config import (
+    ONLY_NEW,
+    OUTPUT_DIR,
+)
+from .logs import (
+    log_archiving_started,
+    log_archiving_paused,
+    log_archiving_finished,
+)
+
+
+@enforce_types
+def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
+    """The main ArchiveBox entrypoint. Everything starts here."""
+
+    # Step 1: Load list of links from the existing index
+    #         merge in and dedupe new links from import_path
+    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
+
+    # Step 2: Write updated index with deduped old and new links back to disk
+    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
+
+    # Step 3: Run the archive methods for each link
+    links = new_links if (only_new or ONLY_NEW) else all_links
+    log_archiving_started(len(links), resume)
+    idx: int = 0
+    link: Optional[Link] = None
+    try:
+        for idx, link in enumerate(links_after_timestamp(links, resume)):
+            archive_link(link, link_dir=link.link_dir)
+
+    except KeyboardInterrupt:
+        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
+        raise SystemExit(0)
+
+    except:
+        print()
+        raise
+
+    log_archiving_finished(len(links))
+
+    # Step 4: Re-write links index with updated titles, icons, and resources
+    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+    write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
+    return all_links
+
+
+@enforce_types
+def list_archive_data(filter_regex: Optional[str]=None, after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
+
+    all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
+
+    pattern = re.compile(filter_regex, re.IGNORECASE) if filter_regex else None
+
+    for link in all_links:
+        if pattern and not pattern.match(link.url):
+            continue
+        if after is not None and float(link.timestamp) < after:
+            continue
+        if before is not None and float(link.timestamp) > before:
+            continue
+
+        yield link
+
+
+def csv_format(link: Link, csv_cols: str) -> str:
+    return ','.join(json.dumps(getattr(link, col), cls=ExtendedEncoder) for col in csv_cols.split(','))
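
update_archive_data is now the single archiving entrypoint shared by archivebox add and
archivebox update; the two commands differ only in the arguments they pass. Roughly (the
import path below is a made-up example; handle_stdin_import/handle_file_import produce the
real one):

    from archivebox.legacy.main import update_archive_data

    # archivebox add <url>: ingest a freshly saved sources file, then archive it
    update_archive_data(import_path='output/sources/example.com-1554264000.txt',
                        resume=None, only_new=False)

    # archivebox update --resume=15109948213.123: no import, continue where we left off
    update_archive_data(import_path=None, resume=15109948213.123, only_new=False)
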
- """ - assert a.base_url == b.base_url, 'Cannot merge two links with different URLs' - - url = a.url if len(a.url) > len(b.url) else b.url - - possible_titles = [ - title - for title in (a.title, b.title) - if title and title.strip() and '://' not in title - ] - title = None - if len(possible_titles) == 2: - title = max(possible_titles, key=lambda t: len(t)) - elif len(possible_titles) == 1: - title = possible_titles[0] - - timestamp = ( - a.timestamp - if float(a.timestamp or 0) < float(b.timestamp or 0) else - b.timestamp - ) - - tags_set = ( - set(tag.strip() for tag in (a.tags or '').split(',')) - | set(tag.strip() for tag in (b.tags or '').split(',')) - ) - tags = ','.join(tags_set) or None - - sources = list(set(a.sources + b.sources)) - - all_methods = set(list(a.history.keys()) + list(a.history.keys())) - history = { - method: (a.history.get(method) or []) + (b.history.get(method) or []) - for method in all_methods - } - - return Link( - url=url, - timestamp=timestamp, - title=title, - tags=tags, - sources=sources, - history=history, - ) - - @enforce_types def is_static_file(url: str) -> bool: """Certain URLs just point to a single static file, and @@ -467,16 +414,6 @@ def is_static_file(url: str) -> bool: return extension(url) in STATICFILE_EXTENSIONS -@enforce_types -def derived_link_info(link: Link) -> dict: - """extend link info with the archive urls and other derived data""" - - info = link._asdict(extended=True) - info.update(link.canonical_outputs()) - - return info - - ### Python / System Helpers @@ -696,3 +633,22 @@ def atomic_write(contents: Union[dict, str], path: str) -> None: finally: if os.path.exists(tmp_file): os.remove(tmp_file) + + +def reject_stdin(caller: str) -> None: + """Tell the user they passed stdin to a command that doesn't accept it""" + + if not sys.stdin.isatty(): + stdin_raw_text = sys.stdin.read().strip() + if stdin_raw_text: + print( + '{red}[X] The "{}" command does not accept stdin.{reset}\n'.format( + caller, + **ANSI, + ) + ) + print(' Run archivebox "{} --help" to see usage and examples.'.format( + caller, + )) + print() + raise SystemExit(1) diff --git a/bin/archivebox b/bin/archivebox index 601d4c25..02c45790 100755 --- a/bin/archivebox +++ b/bin/archivebox @@ -8,8 +8,8 @@ BIN_DIR = os.path.dirname(os.path.abspath(__file__)) REPO_DIR = os.path.abspath(os.path.join(BIN_DIR, os.pardir)) sys.path.append(REPO_DIR) -from archivebox.__main__ import main +from archivebox.cli.archivebox import main if __name__ == '__main__': - main(sys.argv) + main()