feat: Add options to ease management of node related extractors
This commit is contained in:
parent
73fad928df
commit
cc0fa747ce
4 changed files with 31 additions and 11 deletions
25
README.md
25
README.md
|
@ -2,12 +2,12 @@
|
||||||
<img src="https://i.imgur.com/4nkFjdv.png" height="80px">
|
<img src="https://i.imgur.com/4nkFjdv.png" height="80px">
|
||||||
<h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
|
<h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
|
||||||
|
|
||||||
▶️ <a href="https://github.com/pirate/ArchiveBox/wiki/Quickstart">Quickstart</a> |
|
▶️ <a href="https://github.com/pirate/ArchiveBox/wiki/Quickstart">Quickstart</a> |
|
||||||
<a href="https://archivebox.zervice.io/">Demo</a> |
|
<a href="https://archivebox.zervice.io/">Demo</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox">Github</a> |
|
<a href="https://github.com/pirate/ArchiveBox">Github</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> |
|
<a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> |
|
||||||
<a href="#background--motivation">Info & Motivation</a> |
|
<a href="#background--motivation">Info & Motivation</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
|
<a href="https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki/Roadmap">Roadmap</a>
|
<a href="https://github.com/pirate/ArchiveBox/wiki/Roadmap">Roadmap</a>
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
|
@ -22,6 +22,7 @@
|
||||||
<a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
|
<a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
|
<a href="https://github.com/pirate/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
|
||||||
<a href="https://hub.docker.com/r/nikisweeting/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
|
<a href="https://hub.docker.com/r/nikisweeting/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
|
||||||
|
|
||||||
<hr/>
|
<hr/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -56,8 +57,8 @@ ArchiveBox is written in Python 3.7 and uses wget, Chrome headless, youtube-dl,
|
||||||
|
|
||||||
## Quickstart
|
## Quickstart
|
||||||
|
|
||||||
ArchiveBox is written in `python3.7` and has [3 main binary dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies): `wget`, `chromium`, and `youtube-dl`.
|
ArchiveBox is written in `python3.7` and has [4 main binary dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies): `wget`, `chromium`, `youtube-dl` and `nodejs`.
|
||||||
To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. All three dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings.
|
To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. Each of these dependencies is optional if the archive methods that rely on it are [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Docker
|
# Docker
|
||||||
|
@ -82,9 +83,16 @@ open http://127.0.0.1:8000
|
||||||
```bash
|
```bash
|
||||||
# Bare Metal
|
# Bare Metal
|
||||||
# Use apt on Ubuntu/Debian, brew on mac, or pkg on BSD
|
# Use apt on Ubuntu/Debian, brew on mac, or pkg on BSD
|
||||||
|
# You may need to add a PPA with a more recent version of nodejs
|
||||||
apt install python3 python3-pip git curl wget youtube-dl chromium-browser
|
apt install python3 python3-pip git curl wget youtube-dl chromium-browser
|
||||||
|
|
||||||
|
curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
|
||||||
|
&& echo 'deb https://deb.nodesource.com/node_14.x buster main' >> /etc/apt/sources.list \
|
||||||
|
&& apt-get update -qq \
|
||||||
|
&& apt-get install -qq -y --no-install-recommends nodejs
|
||||||
|
|
||||||
pip install archivebox # install archivebox
|
pip install archivebox # install archivebox
|
||||||
|
npm run setup
|
||||||
|
|
||||||
mkdir data && cd data # (doesn't have to be called data)
|
mkdir data && cd data # (doesn't have to be called data)
|
||||||
archivebox init
|
archivebox init
|
||||||
|
@ -97,6 +105,7 @@ archivebox add https://getpocket.com/users/USERNAME/feed/all --depth=1
|
||||||
Once you've added your first links, open `data/index.html` in a browser to view the static archive.
|
Once you've added your first links, open `data/index.html` in a browser to view the static archive.
|
||||||
|
|
||||||
You can also start it as a server with a full web UI to manage your links:
|
You can also start it as a server with a full web UI to manage your links:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
archivebox manage createsuperuser
|
archivebox manage createsuperuser
|
||||||
archivebox server
|
archivebox server
|
||||||
|
|
|
@ -112,6 +112,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
|
||||||
'USE_READABILITY': {'type': bool, 'default': True},
|
'USE_READABILITY': {'type': bool, 'default': True},
|
||||||
'USE_GIT': {'type': bool, 'default': True},
|
'USE_GIT': {'type': bool, 'default': True},
|
||||||
'USE_CHROME': {'type': bool, 'default': True},
|
'USE_CHROME': {'type': bool, 'default': True},
|
||||||
|
'USE_NODE': {'type': bool, 'default': True},
|
||||||
'USE_YOUTUBEDL': {'type': bool, 'default': True},
|
'USE_YOUTUBEDL': {'type': bool, 'default': True},
|
||||||
|
|
||||||
'CURL_BINARY': {'type': str, 'default': 'curl'},
|
'CURL_BINARY': {'type': str, 'default': 'curl'},
|
||||||
|
@ -275,11 +276,12 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
|
||||||
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
|
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
|
||||||
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
|
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
|
||||||
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
|
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
|
||||||
|
'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
|
||||||
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
|
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
|
||||||
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
|
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
|
||||||
'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
|
'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
|
||||||
'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['USE_SINGLEFILE']},
|
'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['USE_SINGLEFILE'] and c['USE_NODE']},
|
||||||
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY']},
|
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
|
||||||
|
|
||||||
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
|
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
|
||||||
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
||||||
|
|
|
@ -5,7 +5,8 @@
|
||||||
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
|
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"archivebox": "./bin/archive"
|
"setup": "node -e \"const {execSync} = require('child_process'); Object.entries(JSON.parse(fs.readFileSync('package.json')).dependencies).forEach(globaldep => execSync('npm i -g ' + globaldep[1]))\"",
|
||||||
|
"archivebox": "./bin/archive"
|
||||||
},
|
},
|
||||||
"bin": {
|
"bin": {
|
||||||
"archivebox": "./bin/archive"
|
"archivebox": "./bin/archive"
|
||||||
|
|
|
@ -53,3 +53,11 @@ def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict):
|
||||||
archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
|
archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
|
||||||
output_file = archived_item_path / "readability" / "content.html"
|
output_file = archived_item_path / "readability" / "content.html"
|
||||||
assert output_file.exists()
|
assert output_file.exists()
|
||||||
|
|
||||||
|
def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
    """Run `archivebox add` with USE_NODE set to "false" and check its stdout.

    USE_READABILITY, SAVE_DOM and USE_SINGLEFILE are switched on in the
    environment while USE_NODE is switched off; the test then asserts that
    neither "> singlefile" nor "> readability" appears in the command's
    captured stdout.
    """
    env_overrides = {
        "USE_READABILITY": "true",
        "SAVE_DOM": "true",
        "USE_SINGLEFILE": "true",
        "USE_NODE": "false",
    }
    disable_extractors_dict.update(env_overrides)

    command = ['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html']
    completed = subprocess.run(command, capture_output=True, env=disable_extractors_dict)
    stdout_text = completed.stdout.decode("utf-8")

    # Same order as the original assertions: singlefile first, then readability.
    for extractor_marker in ("> singlefile", "> readability"):
        assert extractor_marker not in stdout_text
|
Loading…
Reference in a new issue