move archive.py to archive
parent 9ea61bf364
commit c4c8da3deb

2 changed files with 59 additions and 37 deletions

README.md (24 changes)
@@ -48,10 +48,10 @@ Follow the links here to find instructions for exporting bookmarks from each ser
 git clone https://github.com/pirate/bookmark-archiver
 cd bookmark-archiver/
 ./setup.sh   # install all dependencies
-./archive.py ~/Downloads/bookmark_export.html   # replace with the path to your export file from step 1
+./archive ~/Downloads/bookmark_export.html   # replace with the path to your export file from step 1
 
 # OR
-./archive.py https://getpocket.com/users/yourusername/feed/all   # url to an RSS, html, or json links file
+./archive https://getpocket.com/users/yourusername/feed/all   # url to an RSS, html, or json links file
 ```
 
 **3. Done!**
@@ -108,10 +108,10 @@ Those numbers are from running it single-threaded on my i5 machine with 50mbps d
 
 You can run it in parallel by using the `resume` feature, or by manually splitting export.html into multiple files:
 ```bash
-./archive.py export.html 1498800000 &  # second argument is timestamp to resume downloading from
-./archive.py export.html 1498810000 &
-./archive.py export.html 1498820000 &
-./archive.py export.html 1498830000 &
+./archive export.html 1498800000 &  # second argument is timestamp to resume downloading from
+./archive export.html 1498810000 &
+./archive export.html 1498820000 &
+./archive export.html 1498830000 &
 ```
 Users have reported running it with 50k+ bookmarks with success (though it will take more RAM while running).
 
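For context on the parallel-run example in the hunk above: each backgrounded `./archive export.html <timestamp>` process resumes from a different timestamp, so the work is sharded by time range. A minimal Python equivalent of that shell fan-out, assuming the renamed `./archive` script sits in the current directory (illustrative only, not part of this commit):

```python
# Launch one archiver process per resume timestamp, mirroring the
# README's `./archive export.html <timestamp> &` shell example.
import subprocess

timestamps = ['1498800000', '1498810000', '1498820000', '1498830000']
procs = [subprocess.Popen(['./archive', 'export.html', ts]) for ts in timestamps]
for proc in procs:
    proc.wait()  # block until every shard finishes, like `wait` in a shell
```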
@@ -119,7 +119,7 @@ Users have reported running it with 50k+ bookmarks with success (though it will
 
 You can tweak parameters via environment variables, or by editing `config.py` directly:
 ```bash
-env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./archive ~/Downloads/bookmarks_export.html
 ```
 
 **Shell Options:**
@@ -158,7 +158,7 @@ The chrome/chromium dependency is _optional_ and only required for screenshots a
 
 ## Publishing Your Archive
 
-The archive produced by `./archive.py` is suitable for serving on any provider that can host static html (e.g. github pages!).
+The archive produced by `./archive` is suitable for serving on any provider that can host static html (e.g. github pages!).
 
 You can also serve it from a home server or VPS by uploading the outputted `html` folder to your web directory, e.g. `/var/www/bookmark-archiver` and configuring your webserver.
 
@@ -236,7 +236,7 @@ Follow the instruction links above in the "Quickstart" section to download your
 
 1. Clone this repo `git clone https://github.com/pirate/bookmark-archiver`
 3. `cd bookmark-archiver/`
-4. `./archive.py ~/Downloads/bookmarks_export.html`
+4. `./archive ~/Downloads/bookmarks_export.html`
 
 You may optionally specify a second argument to `archive.py export.html 153242424324` to resume the archive update at a specific timestamp.
 
@@ -269,7 +269,7 @@ apt update; apt install google-chrome-beta python3 wget
 2. Set the environment variable `CHROME_BINARY` to `google-chrome` before running:
 
 ```bash
-env CHROME_BINARY=google-chrome ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=google-chrome ./archive ~/Downloads/bookmarks_export.html
 ```
 If you're having any trouble trying to set up Google Chrome or Chromium, see the Troubleshooting section below.
 
@@ -292,7 +292,7 @@ If you still need help, [the official Python docs](https://docs.python.org/3.6/u
 defaults to `chromium-browser` but can be manually specified with the environment variable `CHROME_BINARY`:
 
 ```bash
-env CHROME_BINARY=/usr/local/bin/chromium-browser ./archive.py ~/Downloads/bookmarks_export.html
+env CHROME_BINARY=/usr/local/bin/chromium-browser ./archive ~/Downloads/bookmarks_export.html
 ```
 
 1. Test to make sure you have Chrome on your `$PATH` with:
 
@@ -320,7 +320,7 @@ brew cask upgrade chromium-browser
 4. If a version is displayed and it's `>=59`, make sure `archive.py` is running the right one:
 
 ```bash
-env CHROME_BINARY=/path/from/step/1/chromium-browser ./archive.py bookmarks_export.html  # replace the path with the one you got from step 1
+env CHROME_BINARY=/path/from/step/1/chromium-browser ./archive bookmarks_export.html  # replace the path with the one you got from step 1
 ```
 
archive.py → archive (72 changes)
@@ -25,7 +25,6 @@ from config import (
 )
 from util import (
     download_url,
     check_dependencies,
     progress,
     cleanup_archive,
 )
@@ -40,26 +39,36 @@ def print_help():
     print(" ./archive.py ~/Downloads/bookmarks_export.html\n")
 
 
-def get_links(new_links_file_path, archive_path=HTML_FOLDER):
+def merge_links(archive_path=HTML_FOLDER, import_path=None):
     """get new links from file and optionally append them to links in existing archive"""
-    # parse and validate the new_links_file
-    raw_links = parse_links(new_links_file_path)
-    valid_links = validate_links(raw_links)
+    all_links = []
+    if import_path:
+        # parse and validate the import file
+        raw_links = parse_links(import_path)
+        all_links = validate_links(raw_links)
 
     # merge existing links in archive_path and new links
     existing_links = []
     if archive_path:
         existing_links = parse_json_links_index(archive_path)
-        valid_links = validate_links(existing_links + valid_links)
+        all_links = validate_links(existing_links + all_links)
 
-    num_new_links = len(valid_links) - len(existing_links)
-    print('[*] [{}] Adding {} new links from {} to index'.format(
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        num_new_links,
-        new_links_file_path,
-    ))
+    num_new_links = len(all_links) - len(existing_links)
+    if import_path:
+        print('[*] [{}] Adding {} new links from {} to index'.format(
+            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            num_new_links,
+            import_path,
+        ))
+    else:
+        print('[*] [{}] Running on existing index with {}{}{} links.'.format(
+            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            ANSI['green'],
+            len(all_links),
+            ANSI['reset'],
+        ))
 
-    return valid_links
+    return all_links
 
 def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""
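The hunk above replaces `get_links` (which required an export file) with `merge_links`, where the import file is optional: given an `import_path` it parses and validates the new links before merging them into the existing index, and given none it simply re-validates whatever `parse_json_links_index` finds under `archive_path`. A hypothetical call-site sketch of the two modes (the paths here are examples, not from the commit):

```python
# Import a new export file and merge it into the existing index:
links = merge_links(archive_path='html', import_path='bookmark_export.html')

# No import file: just reload and re-validate the links already indexed:
links = merge_links(archive_path='html', import_path=None)
```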
@@ -91,34 +100,47 @@ def update_archive(archive_path, links, source=None, resume=None, append=True):
 if __name__ == '__main__':
     argc = len(sys.argv)
 
-    if argc < 2 or set(sys.argv).intersection('-h', '--help', 'help'):
+    if set(sys.argv).intersection('-h', '--help', 'help'):
         print_help()
         raise SystemExit(0)
 
-    source = sys.argv[1]  # path to export file
+    source = sys.argv[1] if argc > 1 else None  # path of links file to import
     resume = sys.argv[2] if argc > 2 else None  # timestamp to resume dowloading from
 
+    if argc == 1:
+        source, resume = None, None
+    elif argc == 2:
+        if all(d.isdigit() for d in sys.argv[1].split('.')):
+            # argv[1] is a resume timestamp
+            source, resume = None, sys.argv[1]
+        else:
+            # argv[1] is a path to a file to import
+            source, resume = sys.argv[1].strip(), None
+    elif argc == 3:
+        source, resume = sys.argv[1].strip(), sys.argv[1]
+    else:
+        print_help()
+        raise SystemExit(1)
+
     # See if archive folder already exists
-    for out_folder in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
-        if os.path.exists(out_folder):
+    for out_dir in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
+        if os.path.exists(out_dir):
             break
     else:
-        out_folder = HTML_FOLDER
+        out_dir = HTML_FOLDER
 
-    archive_path = os.path.join(out_folder, 'archive')
-
     # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
-    if any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+    if source and any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
         source = download_url(source)
 
     # Step 1: Parse the links and dedupe them with existing archive
-    links = get_links(source, archive_path=archive_path)
+    links = merge_links(archive_path=out_dir, import_path=source)
 
     # Step 2: Write new index
-    write_links_index(archive_path, links)
+    write_links_index(out_dir=out_dir, links=links)
 
     # Step 3: Verify folder structure is 1:1 with index
-    # cleanup_archive(archive_path, links)
+    # cleanup_archive(out_dir, links)
 
     # Step 4: Run the archive methods for each link
-    update_archive(archive_path, links, source=source, resume=resume, append=True)
+    update_archive(out_dir, links, source=source, resume=resume, append=True)
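The new argument handling above distinguishes a resume timestamp from an import path with a digit test: `all(d.isdigit() for d in sys.argv[1].split('.'))` is true for `1498800000` or `1498800000.123` but false for `export.html`. A small standalone sketch of that heuristic (the helper name is illustrative, not from the commit):

```python
def looks_like_timestamp(arg: str) -> bool:
    """Same test the commit uses: every dot-separated chunk is all digits."""
    return all(chunk.isdigit() for chunk in arg.split('.'))

assert looks_like_timestamp('1498800000')        # plain unix timestamp
assert looks_like_timestamp('1498800000.123')    # fractional timestamp
assert not looks_like_timestamp('export.html')   # 'html' is not digits
```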