From c4c8da3deb8b9e86b53d036c0df19c332b31696e Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 17 Apr 2018 07:00:06 -0400
Subject: [PATCH] move archive.py to archive

---
 README.md             | 24 +++++++--------
 archive.py => archive | 72 ++++++++++++++++++++++++++++---------------
 2 files changed, 59 insertions(+), 37 deletions(-)
 rename archive.py => archive (59%)

diff --git a/README.md b/README.md
index 94cbfd0d..087a92e7 100644
--- a/README.md
+++ b/README.md
@@ -48,10 +48,10 @@ Follow the links here to find instructions for exporting bookmarks from each ser
 git clone https://github.com/pirate/bookmark-archiver
 cd bookmark-archiver/
 ./setup.sh   # install all dependencies
-./archive.py ~/Downloads/bookmark_export.html   # replace with the path to your export file from step 1
+./archive ~/Downloads/bookmark_export.html      # replace with the path to your export file from step 1
 
 # OR
-./archive.py https://getpocket.com/users/yourusername/feed/all   # url to an RSS, html, or json links file
+./archive https://getpocket.com/users/yourusername/feed/all      # url to an RSS, html, or json links file
 ```
 
 **3. Done!**
@@ -108,10 +108,10 @@ Those numbers are from running it single-threaded on my i5 machine with 50mbps d
 
 You can run it in parallel by using the `resume` feature, or by manually splitting export.html into multiple files:
 ```bash
-./archive.py export.html 1498800000 &  # second argument is timestamp to resume downloading from
-./archive.py export.html 1498810000 &
-./archive.py export.html 1498820000 &
-./archive.py export.html 1498830000 &
+./archive export.html 1498800000 &  # second argument is timestamp to resume downloading from
+./archive export.html 1498810000 &
+./archive export.html 1498820000 &
+./archive export.html 1498830000 &
 ```
 
 Users have reported running it with 50k+ bookmarks with success (though it will take more RAM while running).
@@ -119,7 +119,7 @@ Users have reported running it with 50k+ bookmarks with success (though it will
 
 You can tweak parameters via environment variables, or by editing `config.py` directly:
 ```bash
-env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./archive ~/Downloads/bookmarks_export.html
 ```
 
 **Shell Options:**
@@ -158,7 +158,7 @@ The chrome/chromium dependency is _optional_ and only required for screenshots a
 
 ## Publishing Your Archive
 
-The archive produced by `./archive.py` is suitable for serving on any provider that can host static html (e.g. github pages!).
+The archive produced by `./archive` is suitable for serving on any provider that can host static html (e.g. github pages!).
 
 You can also serve it from a home server or VPS by uploading the outputted `html` folder to your web directory, e.g. `/var/www/bookmark-archiver` and configuring your webserver.
 
@@ -236,7 +236,7 @@ Follow the instruction links above in the "Quickstart" section to download your
 
 1. Clone this repo `git clone https://github.com/pirate/bookmark-archiver`
 3. `cd bookmark-archiver/`
-4. `./archive.py ~/Downloads/bookmarks_export.html`
+4. `./archive ~/Downloads/bookmarks_export.html`
 
 You may optionally specify a second argument to `archive.py export.html 153242424324` to resume the archive update at a specific timestamp.
 
@@ -269,7 +269,7 @@ apt update; apt install google-chrome-beta python3 wget
 ```
 
 2. Set the environment variable `CHROME_BINARY` to `google-chrome` before running:
 ```bash
-env CHROME_BINARY=google-chrome ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=google-chrome ./archive ~/Downloads/bookmarks_export.html
 ```
 
 If you're having any trouble trying to set up Google Chrome or Chromium, see the Troubleshooting section below.
@@ -292,7 +292,7 @@ If you still need help, [the official Python docs](https://docs.python.org/3.6/u
 defaults to `chromium-browser` but can be manually specified with the environment variable `CHROME_BINARY`:
 
 ```bash
-env CHROME_BINARY=/usr/local/bin/chromium-browser ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=/usr/local/bin/chromium-browser ./archive ~/Downloads/bookmarks_export.html
 ```
 
 1. Test to make sure you have Chrome on your `$PATH` with:
@@ -320,7 +320,7 @@ brew cask upgrade chromium-browser
 
 4. If a version is displayed and it's `>=59`, make sure `archive.py` is running the right one:
 
 ```bash
-env CHROME_BINARY=/path/from/step/1/chromium-browser ./archive.py bookmarks_export.html   # replace the path with the one you got from step 1
+env CHROME_BINARY=/path/from/step/1/chromium-browser ./archive bookmarks_export.html      # replace the path with the one you got from step 1
 ```
 
diff --git a/archive.py b/archive
similarity index 59%
rename from archive.py
rename to archive
index 5a2d6d10..3761de4e 100755
--- a/archive.py
+++ b/archive
@@ -25,7 +25,6 @@ from config import (
 )
 from util import (
     download_url,
-    check_dependencies,
     progress,
     cleanup_archive,
 )
@@ -40,26 +39,36 @@ def print_help():
     print("    ./archive.py ~/Downloads/bookmarks_export.html\n")
 
 
-def get_links(new_links_file_path, archive_path=HTML_FOLDER):
+def merge_links(archive_path=HTML_FOLDER, import_path=None):
     """get new links from file and optionally append them to links in existing archive"""
-    # parse and validate the new_links_file
-    raw_links = parse_links(new_links_file_path)
-    valid_links = validate_links(raw_links)
+    all_links = []
+    if import_path:
+        # parse and validate the import file
+        raw_links = parse_links(import_path)
+        all_links = validate_links(raw_links)
 
     # merge existing links in archive_path and new links
     existing_links = []
     if archive_path:
         existing_links = parse_json_links_index(archive_path)
-    valid_links = validate_links(existing_links + valid_links)
+    all_links = validate_links(existing_links + all_links)
 
-    num_new_links = len(valid_links) - len(existing_links)
-    print('[*] [{}] Adding {} new links from {} to index'.format(
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        num_new_links,
-        new_links_file_path,
-    ))
+    num_new_links = len(all_links) - len(existing_links)
+    if import_path:
+        print('[*] [{}] Adding {} new links from {} to index'.format(
+            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            num_new_links,
+            import_path,
+        ))
+    else:
+        print('[*] [{}] Running on existing index with {}{}{} links.'.format(
+            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            ANSI['green'],
+            len(all_links),
+            ANSI['reset'],
+        ))
 
-    return valid_links
+    return all_links
 
 def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""
@@ -91,34 +100,47 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
 if __name__ == '__main__':
     argc = len(sys.argv)
 
-    if argc < 2 or set(sys.argv).intersection('-h', '--help', 'help'):
+    if set(sys.argv).intersection('-h', '--help', 'help'):
         print_help()
         raise SystemExit(0)
 
-    source = sys.argv[1]                          # path to export file
+    source = sys.argv[1] if argc > 1 else None    # path of links file to import
     resume = sys.argv[2] if argc > 2 else None    # timestamp to resume downloading from
 
+    if argc == 1:
+        source, resume = None, None
+    elif argc == 2:
+        if all(d.isdigit() for d in sys.argv[1].split('.')):
+            # argv[1] is a resume timestamp
+            source, resume = None, sys.argv[1]
+        else:
+            # argv[1] is a path to a file to import
+            source, resume = sys.argv[1].strip(), None
+    elif argc == 3:
+        source, resume = sys.argv[1].strip(), sys.argv[2]
+    else:
+        print_help()
+        raise SystemExit(1)
+
     # See if archive folder already exists
-    for out_folder in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
-        if os.path.exists(out_folder):
+    for out_dir in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
+        if os.path.exists(out_dir):
             break
     else:
-        out_folder = HTML_FOLDER
-
-    archive_path = os.path.join(out_folder, 'archive')
+        out_dir = HTML_FOLDER
 
     # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
-    if any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
         source = download_url(source)
 
     # Step 1: Parse the links and dedupe them with existing archive
-    links = get_links(source, archive_path=archive_path)
+    links = merge_links(archive_path=out_dir, import_path=source)
 
     # Step 2: Write new index
-    write_links_index(archive_path, links)
+    write_links_index(out_dir=out_dir, links=links)
 
     # Step 3: Verify folder structure is 1:1 with index
-    # cleanup_archive(archive_path, links)
+    # cleanup_archive(out_dir, links)
 
     # Step 4: Run the archive methods for each link
-    update_archive(archive_path, links, source=source, resume=resume, append=True)
+    update_archive(out_dir, links, source=source, resume=resume, append=True)
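
Reviewer note (not part of the patch): the new argument handling in the `__main__` block can be read as a small standalone routine. The sketch below is illustrative only — the resolve_args name is hypothetical and does not exist in the codebase — but it mirrors how ./archive now decides whether its arguments are an import file, a resume timestamp, or both.

    # Illustrative sketch of the argv interpretation introduced by this patch.
    # "resolve_args" is a hypothetical name used only for this example.
    import sys

    def resolve_args(argv):
        """Return (source, resume) the way ./archive interprets its CLI arguments."""
        if len(argv) == 1:
            return None, None                    # no args: re-run on the existing index
        if len(argv) == 2:
            arg = argv[1].strip()
            if all(part.isdigit() for part in arg.split('.')):
                return None, arg                 # a bare timestamp: resume only
            return arg, None                     # otherwise: a links file or URL to import
        if len(argv) == 3:
            return argv[1].strip(), argv[2]      # import file plus resume timestamp
        raise SystemExit(1)                      # anything else is a usage error

    # Example: resolve_args(['./archive', 'export.html', '1498800000'])
    # returns ('export.html', '1498800000')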