move archive.py to archive

2018-04-17 07:00:06 -04:00 · 2018-04-17 07:00:06 -04:00 · c4c8da3deb
commit c4c8da3deb
parent 9ea61bf364
2 changed files with 59 additions and 37 deletions
--- a/README.md
+++ b/README.md
@@ -48,10 +48,10 @@ Follow the links here to find instructions for exporting bookmarks from each ser
 git clone https://github.com/pirate/bookmark-archiver
 cd bookmark-archiver/
 ./setup.sh                                      # install all dependencies
-./archive.py ~/Downloads/bookmark_export.html   # replace with the path to your export file from step 1
+./archive ~/Downloads/bookmark_export.html   # replace with the path to your export file from step 1
 # OR
-./archive.py https://getpocket.com/users/yourusername/feed/all  # url to an RSS, html, or json links file
+./archive https://getpocket.com/users/yourusername/feed/all  # url to an RSS, html, or json links file
 ```
 **3. Done!**
@@ -108,10 +108,10 @@ Those numbers are from running it single-threaded on my i5 machine with 50mbps d
 You can run it in parallel by using the `resume` feature, or by manually splitting export.html into multiple files:
 ```bash
-./archive.py export.html 1498800000 &  # second argument is timestamp to resume downloading from
+./archive export.html 1498800000 &  # second argument is timestamp to resume downloading from
-./archive.py export.html 1498810000 &
+./archive export.html 1498810000 &
-./archive.py export.html 1498820000 &
+./archive export.html 1498820000 &
-./archive.py export.html 1498830000 &
+./archive export.html 1498830000 &
 ```
 Users have reported running it with 50k+ bookmarks with success (though it will take more RAM while running).
@@ -119,7 +119,7 @@ Users have reported running it with 50k+ bookmarks with success (though it will
 You can tweak parameters via environment variables, or by editing `config.py` directly:
 ```bash
-env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./archive ~/Downloads/bookmarks_export.html
 ```
 **Shell Options:**
@@ -158,7 +158,7 @@ The chrome/chromium dependency is _optional_ and only required for screenshots a
 ## Publishing Your Archive
-The archive produced by `./archive.py` is suitable for serving on any provider that can host static html (e.g. github pages!).
+The archive produced by `./archive` is suitable for serving on any provider that can host static html (e.g. github pages!).
 You can also serve it from a home server or VPS by uploading the outputted `html` folder to your web directory, e.g. `/var/www/bookmark-archiver` and configuring your webserver.
@@ -236,7 +236,7 @@ Follow the instruction links above in the "Quickstart" section to download your
 1. Clone this repo `git clone https://github.com/pirate/bookmark-archiver`
 3. `cd bookmark-archiver/`
-4. `./archive.py ~/Downloads/bookmarks_export.html`
+4. `./archive ~/Downloads/bookmarks_export.html`
 You may optionally specify a second argument to `archive.py export.html 153242424324` to resume the archive update at a specific timestamp.
@@ -269,7 +269,7 @@ apt update; apt install google-chrome-beta python3 wget
 2. Set the environment variable `CHROME_BINARY` to `google-chrome` before running:
 ```bash
-env CHROME_BINARY=google-chrome ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=google-chrome ./archive ~/Downloads/bookmarks_export.html
 ```
 If you're having any trouble trying to set up Google Chrome or Chromium, see the Troubleshooting section below.
@@ -292,7 +292,7 @@ If you still need help, [the official Python docs](https://docs.python.org/3.6/u
 defaults to `chromium-browser` but can be manually specified with the environment variable `CHROME_BINARY`:
 ```bash
-env CHROME_BINARY=/usr/local/bin/chromium-browser ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=/usr/local/bin/chromium-browser ./archive ~/Downloads/bookmarks_export.html
 ```
 1. Test to make sure you have Chrome on your `$PATH` with:
@@ -320,7 +320,7 @@ brew cask upgrade chromium-browser
 4. If a version is displayed and it's `>=59`, make sure `archive.py` is running the right one:
 ```bash
-env CHROME_BINARY=/path/from/step/1/chromium-browser ./archive.py bookmarks_export.html   # replace the path with the one you got from step 1
+env CHROME_BINARY=/path/from/step/1/chromium-browser ./archive bookmarks_export.html   # replace the path with the one you got from step 1
 ```
--- a/archive.py
+++ b/archive.py
@@ -25,7 +25,6 @@ from config import (
 )
 from util import (
    download_url,
    check_dependencies,
    progress,
    cleanup_archive,
 )
@@ -40,26 +39,36 @@ def print_help():
    print("    ./archive.py ~/Downloads/bookmarks_export.html\n")
-def get_links(new_links_file_path, archive_path=HTML_FOLDER):
+def merge_links(archive_path=HTML_FOLDER, import_path=None):
    """get new links from file and optionally append them to links in existing archive"""
-    # parse and validate the new_links_file
+    all_links = []
-    raw_links = parse_links(new_links_file_path)
+    if import_path:
-    valid_links = validate_links(raw_links)
+        # parse and validate the import file
        raw_links = parse_links(import_path)
        all_links = validate_links(raw_links)
    # merge existing links in archive_path and new links
    existing_links = []
    if archive_path:
        existing_links = parse_json_links_index(archive_path)
-        valid_links = validate_links(existing_links + valid_links)
+        all_links = validate_links(existing_links + all_links)
-    num_new_links = len(valid_links) - len(existing_links)
+    num_new_links = len(all_links) - len(existing_links)
    if import_path:
        print('[*] [{}] Adding {} new links from {} to index'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            num_new_links,
-        new_links_file_path,
+            import_path,
        ))
    else:
        print('[*] [{}] Running on existing index with {}{}{} links.'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            ANSI['green'],
            len(all_links),
            ANSI['reset'],
        ))
-    return valid_links
+    return all_links
 def update_archive(archive_path, links, source=None, resume=None, append=True):
    """update or create index.html+json given a path to an export file containing new links"""
@@ -91,34 +100,47 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
 if __name__ == '__main__':
    argc = len(sys.argv)
-    if argc < 2 or set(sys.argv).intersection('-h', '--help', 'help'):
+    if set(sys.argv).intersection('-h', '--help', 'help'):
        print_help()
        raise SystemExit(0)
-    source = sys.argv[1]                        # path to export file
+    source = sys.argv[1] if argc > 1 else None  # path of links file to import
    resume = sys.argv[2] if argc > 2 else None  # timestamp to resume dowloading from
    if argc == 1:
        source, resume = None, None
    elif argc == 2:
        if all(d.isdigit() for d in sys.argv[1].split('.')):
            # argv[1] is a resume timestamp
            source, resume = None, sys.argv[1]
        else:
            # argv[1] is a path to a file to import
            source, resume = sys.argv[1].strip(), None
    elif argc == 3:
        source, resume = sys.argv[1].strip(), sys.argv[1]
    else:
        print_help()
        raise SystemExit(1)
    # See if archive folder already exists
-    for out_folder in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
+    for out_dir in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
-        if os.path.exists(out_folder):
+        if os.path.exists(out_dir):
            break
    else:
-        out_folder = HTML_FOLDER
+        out_dir = HTML_FOLDER
    archive_path = os.path.join(out_folder, 'archive')
    # Step 0: Download url to local file (only happens if a URL is specified instead of local path) 
-    if any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source = download_url(source)
    # Step 1: Parse the links and dedupe them with existing archive
-    links = get_links(source, archive_path=archive_path)
+    links = merge_links(archive_path=out_dir, import_path=source)
    # Step 2: Write new index
-    write_links_index(archive_path, links)
+    write_links_index(out_dir=out_dir, links=links)
    # Step 3: Verify folder structure is 1:1 with index
-    # cleanup_archive(archive_path, links)
+    # cleanup_archive(out_dir, links)
    # Step 4: Run the archive methods for each link
-    update_archive(archive_path, links, source=source, resume=resume, append=True)
+    update_archive(out_dir, links, source=source, resume=resume, append=True)