add user-data-dir support for chrome headless
parent 81ab050cd2
commit 5758cc2a78

3 changed files with 25 additions and 6 deletions
README.md (13 changes)
@@ -39,6 +39,9 @@ git clone https://github.com/pirate/bookmark-archiver
 cd bookmark-archiver/
 ./setup.sh                                                        # install ALL dependencies
 ./archive.py ~/Downloads/bookmark_export.html                     # replace with the path to your export file from step 1
+
+# OR
+./archive.py https://getpocket.com/users/yourusername/feed/all    # url to an RSS, html, or json links file
 ```
 
 **3. Done!**
@@ -47,6 +50,7 @@ You can open `service/index.html` to view your archive. (favicons will appear n
 
 If you want to host your archive somewhere to share it with other people, see the [Publishing Your Archive](#publishing-your-archive) section below.
 
+If you want to run this as a regular script that pulls new URLs, stick it in `cron` with the second parameter as the URL to your RSS feed.
 
 If you have any trouble, see the [Troubleshooting](#troubleshooting) section at the bottom.
 If you'd like to customize options, see the [Configuration](#configuration) section.
@@ -66,11 +70,11 @@ For each sites it saves:
 - `screenshot.png` 1440x900 screenshot of site using headless chrome
 - `output.pdf` Printed PDF of site using headless chrome
 - `archive.org.txt` A link to the saved site on archive.org
-- `link.json` A json file containing link info and archive status
 - `audio/` and `video/` for sites like youtube, soundcloud, etc. (using youtube-dl) (WIP)
+- `index.json` JSON index containing link info and archive details
+- `index.html` HTML index containing link info and archive details
 
-Wget and Chrome [don't work](https://bugs.chromium.org/p/chromium/issues/detail?id=617931) on sites you need to be logged into (yet).
-`chrome --headless` essentially runs in an incognito mode session, until they add support for `--user-data-dir=`.
+Wget doesn't work on sites you need to be logged into, but chrome headless does; see the [Configuration](#configuration) section for `CHROME_USER_DATA_DIR`.
 
 **Large Exports & Estimated Runtime:**
 
@@ -113,10 +117,13 @@ env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./arc
 - submit the page to archive.org: `SUBMIT_ARCHIVE_DOT_ORG`
 - screenshot: `RESOLUTION` values: [`1440,900`]/`1024,768`/`...`
 - user agent: `WGET_USER_AGENT` values: [`Wget/1.19.1`]/`"Mozilla/5.0 ..."`/`...`
+- chrome profile: `CHROME_USER_DATA_DIR` values: `~/Library/Application\ Support/Google/Chrome/Default`/`/tmp/chrome-profile`/`...`
+To capture sites that require a user to be logged in, you must specify a path to a chrome profile (which loads the cookies needed for the user to be logged in). If you don't have an existing chrome profile, create one with `chromium-browser --disable-gpu --user-data-dir=/tmp/chrome-profile`, and log into the sites you need. Then set `CHROME_USER_DATA_DIR=/tmp/chrome-profile` to make Bookmark Archiver use that profile.
 
 **Index Options:**
 - html index template: `INDEX_TEMPLATE` value: `templates/index.html`/`...`
 - html index row template: `INDEX_ROW_TEMPLATE` value: `templates/index_row.html`/`...`
+- html link index template: `LINK_INDEX_TEMPLATE` value: `templates/link_index_fancy.html`/`templates/link_index.html`/`...`
 
 (See defaults & more at the top of `config.py`)
 
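For example, assuming a profile was created at `/tmp/chrome-profile` as the new README text describes, a run using it would follow the same `env` pattern shown in the configuration examples above: `env CHROME_USER_DATA_DIR=/tmp/chrome-profile ./archive.py ~/Downloads/bookmark_export.html`.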
archive_methods.py
@@ -19,6 +19,7 @@ from config import (
     FETCH_VIDEO,
     FETCH_FAVICON,
     WGET_USER_AGENT,
+    CHROME_USER_DATA_DIR,
     TIMEOUT,
     ANSI,
 )
@@ -35,7 +36,6 @@ _RESULTS_TOTALS = { # globals are bad, mmkay
     'failed': 0,
 }
 
-
 
 def archive_links(out_dir, links, export_path, resume=None):
     check_dependencies()
@@ -198,7 +198,7 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
 
 
 @attach_result_to_link('pdf')
-def fetch_pdf(out_dir, link, timeout=TIMEOUT):
+def fetch_pdf(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
     """print PDF of site to file using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
@@ -210,6 +210,7 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT):
     CMD = [
         CHROME_BINARY,
         *'--headless --disable-gpu --print-to-pdf'.split(' '),
+        *chrome_data_dir_args(user_data_dir),
         link['url']
     ]
     end = progress(timeout, prefix=' ')
@@ -233,7 +234,7 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT):
 
 
 @attach_result_to_link('screenshot')
-def fetch_screenshot(out_dir, link, timeout=TIMEOUT, resolution=RESOLUTION):
+def fetch_screenshot(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
     """take screenshot of site using chrome --headless"""
 
     if link['type'] in ('PDF', 'image'):
@@ -245,6 +246,7 @@ def fetch_screenshot(out_dir, link, timeout=TIMEOUT, resolution=RESOLUTION):
     CMD = [
         CHROME_BINARY,
         *'--headless --disable-gpu --screenshot'.split(' '),
+        *chrome_data_dir_args(user_data_dir),
         '--window-size={}'.format(resolution),
         link['url']
     ]
@@ -414,3 +416,12 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
     #         raise
     #     else:
     #         print('    √ Skipping video download')
+
+
+def chrome_data_dir_args(user_data_dir=CHROME_USER_DATA_DIR):
+    default = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
+    if user_data_dir:
+        return ('--user-data-dir={}'.format(user_data_dir),)
+    elif os.path.exists(default):
+        return ('--user-data-dir={}'.format(default),)
+    return ()
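The new helper returns a tuple so it can be splatted into the Chrome argument lists unconditionally: one flag when a profile is available, nothing otherwise. A minimal standalone sketch of that composition, using a hypothetical profile path and URL rather than values from this commit:

```python
import os

CHROME_BINARY = 'chromium-browser'  # default from config.py

def chrome_data_dir_args(user_data_dir=None):
    # Same logic as the helper added above: prefer an explicit profile,
    # fall back to the default macOS Chrome profile if it exists,
    # otherwise add no flag at all (Chrome then runs with a fresh profile).
    default = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
    if user_data_dir:
        return ('--user-data-dir={}'.format(user_data_dir),)
    elif os.path.exists(default):
        return ('--user-data-dir={}'.format(default),)
    return ()

CMD = [
    CHROME_BINARY,
    *'--headless --disable-gpu --print-to-pdf'.split(' '),
    *chrome_data_dir_args('/tmp/chrome-profile'),  # -> ('--user-data-dir=/tmp/chrome-profile',)
    'https://example.com',
]
print(' '.join(CMD))
# chromium-browser --headless --disable-gpu --print-to-pdf --user-data-dir=/tmp/chrome-profile https://example.com
```

An empty tuple splats to nothing, so `CMD` stays a valid command even when no profile is configured, which is why the helper returns a tuple rather than a string or `None`.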
config.py
@@ -27,6 +27,7 @@ ARCHIVE_DIR = os.getenv('ARCHIVE_DIR', '')
 CHROME_BINARY = os.getenv('CHROME_BINARY', 'chromium-browser' )  # change to google-chrome browser if using google-chrome
 WGET_BINARY = os.getenv('WGET_BINARY', 'wget' )
 WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', None)
+CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
 TIMEOUT = int(os.getenv('TIMEOUT', '60'))
 LINK_INDEX_TEMPLATE = os.getenv('LINK_INDEX_TEMPLATE', 'templates/link_index_fancy.html')
 INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html')
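Because the default is `None`, existing setups are unaffected: with no `CHROME_USER_DATA_DIR` set, `chrome_data_dir_args()` only adds a `--user-data-dir` flag if the default macOS Chrome profile directory happens to exist, and otherwise Chrome keeps running without a profile, as before.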