
add pipenv, schedule cmd, logs dir, and lots more

This commit is contained in:
Nick Sweeting 2019-04-18 21:09:54 -04:00
parent 4f869f235f
commit 39a0ab3013
20 changed files with 820 additions and 188 deletions
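
Highlights: requirements.txt is replaced by a Pipfile/Pipfile.lock pair, and a new 'archivebox schedule' subcommand manages cron jobs for recurring imports. Example invocations, taken from the new command's own help text:

archivebox schedule --every=day https://example.com/some/rss/feed.xml
archivebox schedule --show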

Pipfile (new normal file, +22 lines)
View file

@@ -0,0 +1,22 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
ipdb = "*"
flake8 = "*"
mypy = "*"
django-stubs = "*"
setuptools = "*"
[packages]
dataclasses = "*"
base32-crockford = "*"
django = "*"
youtube-dl = "*"
python-crontab = "*"
croniter = "*"
[requires]
python_version = ">=3.6"
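
With this Pipfile in place, the standard pipenv workflow applies (assuming pipenv itself is already installed):

pipenv install --dev   # create a virtualenv with both [packages] and [dev-packages]
pipenv shell           # spawn a shell inside that virtualenv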

Pipfile.lock (generated, new normal file, +314 lines)
View file

@@ -0,0 +1,314 @@
{
"_meta": {
"hash": {
"sha256": "7f25fb9c97e469fdb787e755c5756e2be4b0b649e3c5ad8feb17200b32d3bb36"
},
"pipfile-spec": 6,
"requires": {
"python_version": ">=3.6"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"base32-crockford": {
"hashes": [
"sha256:115f5bd32ae32b724035cb02eb65069a8824ea08c08851eb80c8b9f63443a969",
"sha256:295ef5ffbf6ed96b6e739ffd36be98fa7e90a206dd18c39acefb15777eedfe6e"
],
"index": "pypi",
"version": "==0.3.0"
},
"croniter": {
"hashes": [
"sha256:625949cbd38a0b2325295591940dfa5fa0dfca41d03150ae0284a924e0be10f0",
"sha256:66b6a9c6b2d1a85d4af51453b2328be775a173e688b69eb3a96a7ec752ba77a3"
],
"index": "pypi",
"version": "==0.3.29"
},
"dataclasses": {
"hashes": [
"sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f",
"sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84"
],
"index": "pypi",
"version": "==0.6"
},
"django": {
"hashes": [
"sha256:7c3543e4fb070d14e10926189a7fcf42ba919263b7473dceaefce34d54e8a119",
"sha256:a2814bffd1f007805b19194eb0b9a331933b82bd5da1c3ba3d7b7ba16e06dc4b"
],
"index": "pypi",
"version": "==2.2"
},
"python-crontab": {
"hashes": [
"sha256:91ce4b245ee5e5c117aa0b21b485bc43f2d80df854a36e922b707643f50d7923"
],
"index": "pypi",
"version": "==2.3.6"
},
"python-dateutil": {
"hashes": [
"sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
"sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
],
"version": "==2.8.0"
},
"pytz": {
"hashes": [
"sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
"sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141"
],
"version": "==2019.1"
},
"six": {
"hashes": [
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
],
"version": "==1.12.0"
},
"sqlparse": {
"hashes": [
"sha256:40afe6b8d4b1117e7dff5504d7a8ce07d9a1b15aeeade8a2d10f130a834f8177",
"sha256:7c3dca29c022744e95b547e867cee89f4fce4373f3549ccd8797d8eb52cdb873"
],
"version": "==0.3.0"
},
"youtube-dl": {
"hashes": [
"sha256:0d25459093870bf560bccafe9015e59402d7de1b2c956593623ba4c2840153e5",
"sha256:ea0824ae9a166059ec754c267480198a074bd899c20b2ba497809bac099cde2e"
],
"index": "pypi",
"version": "==2019.4.17"
}
},
"develop": {
"appnope": {
"hashes": [
"sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0",
"sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71"
],
"markers": "sys_platform == 'darwin'",
"version": "==0.1.0"
},
"backcall": {
"hashes": [
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
],
"version": "==0.1.0"
},
"decorator": {
"hashes": [
"sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de",
"sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6"
],
"version": "==4.4.0"
},
"django-stubs": {
"hashes": [
"sha256:9c06a4b28fc8c18f6abee4f199f8ee29cb5cfcecf349e912ded31cb3526ea2b6",
"sha256:9ef230843a24b5d74f2ebd4c60f9bea09c21911bc119d0325e8bb47e2f495e70"
],
"index": "pypi",
"version": "==0.12.1"
},
"entrypoints": {
"hashes": [
"sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19",
"sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"
],
"version": "==0.3"
},
"flake8": {
"hashes": [
"sha256:859996073f341f2670741b51ec1e67a01da142831aa1fdc6242dbf88dffbe661",
"sha256:a796a115208f5c03b18f332f7c11729812c8c3ded6c46319c59b53efd3819da8"
],
"index": "pypi",
"version": "==3.7.7"
},
"ipdb": {
"hashes": [
"sha256:dce2112557edfe759742ca2d0fee35c59c97b0cc7a05398b791079d78f1519ce"
],
"index": "pypi",
"version": "==0.12"
},
"ipython": {
"hashes": [
"sha256:b038baa489c38f6d853a3cfc4c635b0cda66f2864d136fe8f40c1a6e334e2a6b",
"sha256:f5102c1cd67e399ec8ea66bcebe6e3968ea25a8977e53f012963e5affeb1fe38"
],
"markers": "python_version >= '3.4'",
"version": "==7.4.0"
},
"ipython-genutils": {
"hashes": [
"sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
"sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
],
"version": "==0.2.0"
},
"jedi": {
"hashes": [
"sha256:2bb0603e3506f708e792c7f4ad8fc2a7a9d9c2d292a358fbbd58da531695595b",
"sha256:2c6bcd9545c7d6440951b12b44d373479bf18123a401a52025cf98563fbd826c"
],
"version": "==0.13.3"
},
"mccabe": {
"hashes": [
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
],
"version": "==0.6.1"
},
"mypy": {
"hashes": [
"sha256:2afe51527b1f6cdc4a5f34fc90473109b22bf7f21086ba3e9451857cf11489e6",
"sha256:56a16df3e0abb145d8accd5dbb70eba6c4bd26e2f89042b491faa78c9635d1e2",
"sha256:5764f10d27b2e93c84f70af5778941b8f4aa1379b2430f85c827e0f5464e8714",
"sha256:5bbc86374f04a3aa817622f98e40375ccb28c4836f36b66706cf3c6ccce86eda",
"sha256:6a9343089f6377e71e20ca734cd8e7ac25d36478a9df580efabfe9059819bf82",
"sha256:6c9851bc4a23dc1d854d3f5dfd5f20a016f8da86bcdbb42687879bb5f86434b0",
"sha256:b8e85956af3fcf043d6f87c91cbe8705073fc67029ba6e22d3468bfee42c4823",
"sha256:b9a0af8fae490306bc112229000aa0c2ccc837b49d29a5c42e088c132a2334dd",
"sha256:bbf643528e2a55df2c1587008d6e3bda5c0445f1240dfa85129af22ae16d7a9a",
"sha256:c46ab3438bd21511db0f2c612d89d8344154c0c9494afc7fbc932de514cf8d15",
"sha256:f7a83d6bd805855ef83ec605eb01ab4fa42bcef254b13631e451cbb44914a9b0"
],
"index": "pypi",
"version": "==0.701"
},
"mypy-extensions": {
"hashes": [
"sha256:37e0e956f41369209a3d5f34580150bcacfabaa57b33a15c0b25f4b5725e0812",
"sha256:b16cabe759f55e3409a7d231ebd2841378fb0c27a5d1994719e340e4f429ac3e"
],
"version": "==0.4.1"
},
"parso": {
"hashes": [
"sha256:17cc2d7a945eb42c3569d4564cdf49bde221bc2b552af3eca9c1aad517dcdd33",
"sha256:2e9574cb12e7112a87253e14e2c380ce312060269d04bd018478a3c92ea9a376"
],
"version": "==0.4.0"
},
"pexpect": {
"hashes": [
"sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1",
"sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb"
],
"markers": "sys_platform != 'win32'",
"version": "==4.7.0"
},
"pickleshare": {
"hashes": [
"sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
"sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
],
"version": "==0.7.5"
},
"prompt-toolkit": {
"hashes": [
"sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
"sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
"sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
],
"version": "==2.0.9"
},
"ptyprocess": {
"hashes": [
"sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
"sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
],
"version": "==0.6.0"
},
"pycodestyle": {
"hashes": [
"sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56",
"sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c"
],
"version": "==2.5.0"
},
"pyflakes": {
"hashes": [
"sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0",
"sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2"
],
"version": "==2.1.1"
},
"pygments": {
"hashes": [
"sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a",
"sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"
],
"version": "==2.3.1"
},
"six": {
"hashes": [
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
"sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
],
"version": "==1.12.0"
},
"traitlets": {
"hashes": [
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
"sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
],
"version": "==4.3.2"
},
"typed-ast": {
"hashes": [
"sha256:04894d268ba6eab7e093d43107869ad49e7b5ef40d1a94243ea49b352061b200",
"sha256:16616ece19daddc586e499a3d2f560302c11f122b9c692bc216e821ae32aa0d0",
"sha256:252fdae740964b2d3cdfb3f84dcb4d6247a48a6abe2579e8029ab3be3cdc026c",
"sha256:2af80a373af123d0b9f44941a46df67ef0ff7a60f95872412a145f4500a7fc99",
"sha256:2c88d0a913229a06282b285f42a31e063c3bf9071ff65c5ea4c12acb6977c6a7",
"sha256:2ea99c029ebd4b5a308d915cc7fb95b8e1201d60b065450d5d26deb65d3f2bc1",
"sha256:3d2e3ab175fc097d2a51c7a0d3fda442f35ebcc93bb1d7bd9b95ad893e44c04d",
"sha256:4766dd695548a15ee766927bf883fb90c6ac8321be5a60c141f18628fb7f8da8",
"sha256:56b6978798502ef66625a2e0f80cf923da64e328da8bbe16c1ff928c70c873de",
"sha256:5cddb6f8bce14325b2863f9d5ac5c51e07b71b462361fd815d1d7706d3a9d682",
"sha256:644ee788222d81555af543b70a1098f2025db38eaa99226f3a75a6854924d4db",
"sha256:64cf762049fc4775efe6b27161467e76d0ba145862802a65eefc8879086fc6f8",
"sha256:68c362848d9fb71d3c3e5f43c09974a0ae319144634e7a47db62f0f2a54a7fa7",
"sha256:6c1f3c6f6635e611d58e467bf4371883568f0de9ccc4606f17048142dec14a1f",
"sha256:b213d4a02eec4ddf622f4d2fbc539f062af3788d1f332f028a2e19c42da53f15",
"sha256:bb27d4e7805a7de0e35bd0cb1411bc85f807968b2b0539597a49a23b00a622ae",
"sha256:c9d414512eaa417aadae7758bc118868cd2396b0e6138c1dd4fda96679c079d3",
"sha256:f0937165d1e25477b01081c4763d2d9cdc3b18af69cb259dd4f640c9b900fe5e",
"sha256:fb96a6e2c11059ecf84e6741a319f93f683e440e341d4489c9b161eca251cf2a",
"sha256:fc71d2d6ae56a091a8d94f33ec9d0f2001d1cb1db423d8b4355debfe9ce689b7"
],
"version": "==1.3.4"
},
"typing-extensions": {
"hashes": [
"sha256:07b2c978670896022a43c4b915df8958bec4a6b84add7f2c87b2b728bda3ba64",
"sha256:f3f0e67e1d42de47b5c67c32c9b26641642e9170fe7e292991793705cd5fef7c",
"sha256:fb2cd053238d33a8ec939190f30cfd736c00653a85a2919415cecf7dc3d9da71"
],
"version": "==3.7.2"
},
"wcwidth": {
"hashes": [
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
"sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
],
"version": "==0.1.7"
}
}
}

View file

@@ -1,30 +1,59 @@
__package__ = 'archivebox.cli'
import os
from typing import Dict
from importlib import import_module
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
required_attrs = ('__package__', '__command__', '__description__', 'main')
# these common commands will appear sorted before any others for ease-of-use
display_first = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')
# every imported command module must have these properties in order to be valid
required_attrs = ('__package__', '__command__', 'main')
order = ('help', 'version', 'init', 'list', 'update', 'add', 'remove')
# basic checks to make sure imported files are valid subcommands
is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py')
is_valid_cli_module = lambda module, subcommand: (
all(hasattr(module, attr) for attr in required_attrs)
and module.__command__.split(' ')[-1] == subcommand
)
def list_subcommands() -> Dict[str, str]:
"""find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
def list_subcommands():
COMMANDS = []
for filename in os.listdir(CLI_DIR):
if filename.startswith('archivebox_') and filename.endswith('.py'):
if is_cli_module(filename):
subcommand = filename.replace('archivebox_', '').replace('.py', '')
module = import_module('.archivebox_{}'.format(subcommand), __package__)
assert is_valid_cli_module(module, subcommand)
COMMANDS.append((subcommand, module.__description__)) # type: ignore
globals()[subcommand] = module.main
module.main.__doc__ = module.__description__
assert all(hasattr(module, attr) for attr in required_attrs)
assert module.__command__.split(' ')[-1] == subcommand
COMMANDS.append((subcommand, module.__description__))
display_order = lambda cmd: (
display_first.index(cmd[0])
if cmd[0] in display_first else
100 + len(cmd[0])
)
return dict(sorted(COMMANDS, key=lambda cmd: order.index(cmd[0]) if cmd[0] in order else 10 + len(cmd[0])))
return dict(sorted(COMMANDS, key=display_order))
def run_subcommand(subcommand: str, args=None):
def run_subcommand(subcommand: str, args=None) -> None:
"""run a given ArchiveBox subcommand with the given list of args"""
module = import_module('.archivebox_{}'.format(subcommand), __package__)
return module.main(args) # type: ignore
module.main(args) # type: ignore
SUBCOMMANDS = list_subcommands()
__all__ = (
'SUBCOMMANDS',
'list_subcommands',
'run_subcommand',
*SUBCOMMANDS.keys(),
)
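
Subcommand discovery means new commands need no central registration: any archivebox_<name>.py dropped into the cli/ package is picked up automatically, as long as it defines the attributes checked above. A minimal sketch of a hypothetical archivebox_hello.py that would pass those checks:

#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox hello'   # the last word must match the filename suffix
__description__ = 'Print a greeting (hypothetical example subcommand)'

def main(args=None):
    print('Hello from ArchiveBox!')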

View file

@@ -82,5 +82,6 @@ def main(args=None, stdin=None):
only_new=command.only_new,
)
if __name__ == '__main__':
main()

View file

@@ -4,7 +4,6 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox init'
__description__ = 'Initialize a new ArchiveBox collection in the current directory'
import os
import sys
import argparse

View file

@@ -0,0 +1,194 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox schedule'
__description__ = 'Set ArchiveBox to run regularly at a specific time'
import os
import sys
import argparse
from datetime import datetime
from crontab import CronTab, CronSlices
from ..legacy.util import reject_stdin
from ..legacy.config import (
OUTPUT_DIR,
LOGS_DIR,
ARCHIVEBOX_BINARY,
USER,
ANSI,
stderr,
)
CRON_COMMENT = 'archivebox_schedule'
def main(args=None):
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.add_argument(
'--quiet', '-q',
action='store_true',
help=("Don't warn about storage space."),
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
'--add', # '-a',
action='store_true',
help='Add a new scheduled ArchiveBox update job to cron',
)
parser.add_argument(
'--every', # '-e',
type=str,
default='daily',
help='Run ArchiveBox once every [timeperiod] (hour/day/week/month/year or cron format e.g. "0 0 * * *")',
)
group.add_argument(
'--clear', # '-c'
action='store_true',
help=("Stop all ArchiveBox scheduled runs, clear it completely from cron"),
)
group.add_argument(
'--show', # '-s'
action='store_true',
help=("Print a list of currently active ArchiveBox cron jobs"),
)
group.add_argument(
'--foreground', '-f',
action='store_true',
help=("Launch ArchiveBox as a long-running foreground task "
"instead of using cron."),
)
group.add_argument(
'--run-all', # '-a',
action='store_true',
help='Run all the scheduled jobs once immediately, independent of their configured schedules',
)
parser.add_argument(
'import_path',
nargs='?',
type=str,
default=None,
help=("Check this path and import any new links on every run "
"(can be either local file or remote URL)"),
)
command = parser.parse_args(args)
reject_stdin(__command__)
os.makedirs(LOGS_DIR, exist_ok=True)
cron = CronTab(user=True)
cron = dedupe_jobs(cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
if command.foreground or command.run_all:
if command.import_path or (not existing_jobs):
stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
raise SystemExit(1)
print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
if command.run_all:
try:
for job in existing_jobs:
sys.stdout.write(f' > {job.command}')
sys.stdout.flush()
job.run()
sys.stdout.write(f'\r{job.command}\n')
except KeyboardInterrupt:
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
raise SystemExit(1)
if command.foreground:
try:
for result in cron.run_scheduler():
print(result)
except KeyboardInterrupt:
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
raise SystemExit(1)
elif command.show:
if existing_jobs:
print('\n'.join(str(cmd) for cmd in existing_jobs))
else:
stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
stderr(' To schedule a new job, run:')
stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
raise SystemExit(0)
elif command.clear:
print(cron.remove_all(comment=CRON_COMMENT))
cron.write()
raise SystemExit(0)
elif command.every:
quoted = lambda s: f'"{s}"' if s and ' ' in s else s
cmd = [
'cd',
quoted(OUTPUT_DIR),
'&&',
quoted(ARCHIVEBOX_BINARY),
*(('add', f'"{command.import_path}"',) if command.import_path else ('update',)),
'2>&1',
'>',
quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
]
new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)
if command.every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
set_every = getattr(new_job.every(), command.every)
set_every()
elif CronSlices.is_valid(command.every):
new_job.setall(command.every)
else:
stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
stderr(' It must be one of minute/hour/day/week/month/year')
stderr(' or a quoted cron-format schedule like:')
stderr(' archivebox schedule --every=day https://example.com/some/rss/feed.xml')
stderr(' archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
raise SystemExit(1)
cron = dedupe_jobs(cron)
cron.write()
total_runs = sum(j.frequency_per_year() for j in cron)
existing_jobs = list(cron.find_comment(CRON_COMMENT))
print()
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
if total_runs > 60 and not command.quiet:
stderr()
stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
stderr(f' Congrats on being an enthusiastic internet archiver! 👌')
stderr()
stderr(' Make sure you have enough storage space available to hold all the data.')
stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
raise SystemExit(0)
def dedupe_jobs(cron: CronTab) -> CronTab:
deduped = set()
for job in list(cron):
unique_tuple = (str(job.slices), job.command)
if unique_tuple not in deduped:
deduped.add(unique_tuple)
cron.remove(job)
for schedule, command in deduped:
job = cron.new(command=command, comment=CRON_COMMENT)
job.setall(schedule)
job.enable()
return cron
if __name__ == '__main__':
main()
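
Under the hood, the --add path boils down to a few python-crontab calls. A minimal sketch of the same sequence, with a simplified command string and a hard-coded daily schedule standing in for the real ones:

from crontab import CronTab, CronSlices

cron = CronTab(user=True)                       # load the current user's crontab
job = cron.new(command='archivebox update',     # simplified; the real command also
               comment='archivebox_schedule')   # cd's into OUTPUT_DIR and logs output
if CronSlices.is_valid('0 0 * * *'):
    job.setall('0 0 * * *')                     # equivalent to job.every().day()
cron.write()                                    # persist the change to the crontab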

View file

@@ -7,7 +7,7 @@ __description__ = 'Run the ArchiveBox HTTP server'
import sys
import argparse
from ..legacy.config import setup_django
from ..legacy.config import setup_django, OUTPUT_DIR
from ..legacy.util import reject_stdin
@@ -29,7 +29,7 @@ def main(args=None):
command = parser.parse_args(args)
reject_stdin(__command__)
setup_django()
setup_django(OUTPUT_DIR)
from django.core.management import call_command
call_command("runserver", *command.runserver_args)

View file

@@ -7,7 +7,7 @@ __description__ = 'Enter an interactive ArchiveBox Django shell'
import sys
import argparse
from ..legacy.config import setup_django
from ..legacy.config import setup_django, OUTPUT_DIR
from ..legacy.util import reject_stdin
@@ -22,7 +22,7 @@ def main(args=None):
parser.parse_args(args)
reject_stdin(__command__)
setup_django()
setup_django(OUTPUT_DIR)
from django.core.management import call_command
call_command("shell_plus")

View file

@@ -5,10 +5,8 @@ import os
SECRET_KEY = '---------------- not a valid secret key ! ----------------'
DEBUG = True
OUTPUT_DIR = os.path.abspath(os.curdir)
DATABASE_DIR_NAME = 'database'
DATABASE_FILE_NAME = 'database.sqlite3'
DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR', os.curdir))
DATABASE_FILE = os.path.join(OUTPUT_DIR, 'index.sqlite3')
INSTALLED_APPS = [
@@ -38,7 +36,7 @@ ROOT_URLCONF = 'core.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': ['templates'],
'DIRS': ['themes'],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
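
Because OUTPUT_DIR now comes from the environment, Django can be pointed at any collection directory before setup. A minimal sketch of that handshake (the path is hypothetical, and archivebox/ is assumed to be on sys.path so that core.settings is importable):

import os
import django

os.environ.setdefault('OUTPUT_DIR', '/data/archivebox')          # hypothetical collection dir
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
django.setup()

from django.conf import settings
print(settings.DATABASE_FILE)                                    # /data/archivebox/index.sqlite3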

View file

@@ -1,15 +0,0 @@
import os
import sys
PYTHON_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(PYTHON_DIR)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")
import django
django.setup()
from django.conf import settings
DATABASE_FILE = settings.DATABASE_FILE

View file

@@ -60,7 +60,6 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
CHROME_BINARY = os.getenv('CHROME_BINARY', None)
# ******************************************************************************
### Terminal Configuration
@@ -84,6 +83,7 @@ def stderr(*args):
sys.stderr.write(' '.join(str(a) for a in args) + '\n')
USER = getpass.getuser() or os.getlogin()
ARCHIVEBOX_BINARY = sys.argv[0]
REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))
if OUTPUT_DIR:
@@ -91,14 +91,15 @@ if OUTPUT_DIR:
else:
OUTPUT_DIR = os.path.abspath(os.curdir)
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
DATABASE_DIR_NAME = 'database'
DATABASE_FILE_NAME = 'database.sqlite3'
LOGS_DIR_NAME = 'logs'
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
DATABASE_DIR = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME)
DATABASE_FILE = os.path.join(OUTPUT_DIR, DATABASE_DIR_NAME, DATABASE_FILE_NAME)
LOGS_DIR = os.path.join(OUTPUT_DIR, LOGS_DIR_NAME)
PYTHON_DIR = os.path.join(REPO_DIR, 'archivebox')
LEGACY_DIR = os.path.join(PYTHON_DIR, 'legacy')
@@ -126,9 +127,10 @@ if USER == 'root':
raise SystemExit(1)
### Check Python environment
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
if python_vers < 3.6:
stderr('{}[X] Python version is not new enough: {} (>3.6 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
PYTHON_BINARY = sys.executable
PYTHON_VERSION = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
if float(PYTHON_VERSION) < 3.6:
stderr('{}[X] Python version is not new enough: {} (>3.6 is required){}'.format(ANSI['red'], PYTHON_VERSION, ANSI['reset']))
stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(1)
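
One caveat with comparing the version as a float: float('3.10') evaluates to 3.1, so this check would misfire on Python 3.10+. Tuple comparison against sys.version_info is the more robust idiom (shown here for reference, not part of this commit):

import sys

if sys.version_info < (3, 6):      # compares (major, minor) element-wise
    raise SystemExit('Python >= 3.6 is required')
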
@@ -150,6 +152,7 @@ if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
def bin_version(binary: str) -> Optional[str]:
"""check the presence and return valid version line of a specified binary"""
global HAS_INVALID_DEPENDENCIES
binary = os.path.expanduser(binary)
try:
@@ -223,12 +226,17 @@ def find_chrome_data_dir() -> Optional[str]:
return None
def setup_django():
def setup_django(out_dir: str=OUTPUT_DIR, check_db=False):
import django
sys.path.append(PYTHON_DIR)
os.environ.setdefault('OUTPUT_DIR', out_dir)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
django.setup()
if check_db:
assert os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)), (
f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {out_dir}')
# ******************************************************************************
# ************************ Environment & Dependencies **************************
# ******************************************************************************
@@ -338,16 +346,16 @@ try:
'enabled': True,
'is_valid': os.path.exists(SOURCES_DIR),
},
'LOGS_DIR': {
'path': os.path.abspath(LOGS_DIR),
'enabled': True,
'is_valid': os.path.exists(LOGS_DIR),
},
'ARCHIVE_DIR': {
'path': os.path.abspath(ARCHIVE_DIR),
'enabled': True,
'is_valid': os.path.exists(ARCHIVE_DIR),
},
'DATABASE_DIR': {
'path': os.path.abspath(DATABASE_DIR),
'enabled': True,
'is_valid': os.path.exists(DATABASE_FILE),
},
'CHROME_USER_DATA_DIR': {
'path': CHROME_USER_DATA_DIR and os.path.abspath(CHROME_USER_DATA_DIR),
'enabled': USE_CHROME and CHROME_USER_DATA_DIR,
@@ -361,6 +369,12 @@ try:
}
DEPENDENCIES = {
'PYTHON_BINARY': {
'path': PYTHON_BINARY,
'version': PYTHON_VERSION,
'enabled': True,
'is_valid': bool(DJANGO_VERSION),
},
'DJANGO_BINARY': {
'path': DJANGO_BINARY,
'version': DJANGO_VERSION,

View file

@@ -1,13 +1,17 @@
__package__ = 'archivebox.legacy'
import os
import json
from typing import List, Tuple, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from .schema import Link, ArchiveResult
from .config import (
DATABASE_DIR,
DATABASE_FILE_NAME,
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
OUTPUT_DIR,
TIMEOUT,
URL_BLACKLIST_PTN,
@@ -35,14 +39,13 @@ from .util import (
from .parse import parse_links
from .logs import (
log_indexing_process_started,
log_indexing_process_finished,
log_indexing_started,
log_indexing_finished,
log_parsing_started,
log_parsing_finished,
)
### Link filtering and checking
@enforce_types
@@ -117,7 +120,7 @@ def validate_links(links: Iterable[Link]) -> Iterable[Link]:
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
if not links:
stderr('{red}[X] No links found in index.json{reset}'.format(**ANSI))
stderr('{red}[X] No links found in the index.{reset}'.format(**ANSI))
stderr(' To add a link to your archive, run:')
stderr(" archivebox add 'https://example.com'")
stderr()
@@ -204,58 +207,63 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
### Main Links Index
@contextmanager
@enforce_types
def timed_index_update(out_path: str):
log_indexing_started(out_path)
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
yield
finally:
timer.end()
assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
log_indexing_finished(out_path)
@enforce_types
def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""create index.html file for a given list of links"""
log_indexing_process_started()
log_indexing_process_started(len(links))
log_indexing_started(DATABASE_DIR, DATABASE_FILE_NAME)
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
write_sql_main_index(links)
finally:
timer.end()
log_indexing_finished(DATABASE_DIR, DATABASE_FILE_NAME)
with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
write_sql_main_index(links, out_dir=out_dir)
log_indexing_started(out_dir, 'index.json')
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
write_json_main_index(links, out_dir=out_dir)
finally:
timer.end()
log_indexing_finished(out_dir, 'index.json')
log_indexing_started(out_dir, 'index.html')
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
write_html_main_index(links, out_dir=out_dir, finished=finished)
finally:
timer.end()
log_indexing_finished(out_dir, 'index.html')
log_indexing_process_finished()
@enforce_types
def load_main_index(out_dir: str=OUTPUT_DIR, import_path: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
def load_main_index(out_dir: str=OUTPUT_DIR) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
existing_links: List[Link] = []
if out_dir:
existing_links = list(parse_json_main_index(out_dir))
existing_sql_links = list(parse_sql_main_index())
assert set(l.url for l in existing_links) == set(l['url'] for l in existing_sql_links)
all_links: List[Link] = []
all_links = list(parse_json_main_index(out_dir))
links_from_sql = list(parse_sql_main_index())
assert set(l.url for l in all_links) == set(l['url'] for l in links_from_sql)
return all_links
@enforce_types
def import_new_links(existing_links: List[Link], import_path: str) -> Tuple[List[Link], List[Link]]:
new_links: List[Link] = []
if import_path:
# parse and validate the import file
log_parsing_started(import_path)
raw_links, parser_name = parse_links(import_path)
new_links = list(validate_links(raw_links))
# parse and validate the import file
log_parsing_started(import_path)
raw_links, parser_name = parse_links(import_path)
new_links = list(validate_links(raw_links))
# merge existing links in out_dir and new links
all_links = list(validate_links(existing_links + new_links))
if import_path and parser_name:
if parser_name:
num_parsed = len(raw_links)
num_new_links = len(all_links) - len(existing_links)
log_parsing_finished(num_parsed, num_new_links, parser_name)
@@ -323,9 +331,3 @@ def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
return merge_links(existing_link, link)
return link
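
The new timed_index_update helper leans on the try/finally guarantee inside a @contextmanager: the timer is stopped and the output file is asserted to exist even if the wrapped write raises. A stripped-down sketch of the same pattern:

from contextlib import contextmanager

@contextmanager
def timed(label: str):
    print(f'  > {label}')
    try:
        yield                   # the body of the with-block runs here
    finally:
        print(f'  √ {label}')   # runs on success and on exception alike

with timed('writing index.json'):
    pass                        # e.g. write_json_main_index(links, out_dir)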

View file

@@ -6,7 +6,7 @@ from dataclasses import dataclass
from typing import Optional, List
from .schema import Link, ArchiveResult
from .config import ANSI, OUTPUT_DIR
from .config import ANSI, OUTPUT_DIR, IS_TTY
@dataclass
@@ -42,7 +42,7 @@ def pretty_path(path: str) -> str:
def log_parsing_started(source_file: str):
start_ts = datetime.now()
_LAST_RUN_STATS.parse_start_ts = start_ts
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
print('\n{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
source_file.rsplit('/', 1)[-1],
**ANSI,
@@ -56,22 +56,26 @@ def log_parsing_finished(num_parsed: int, num_new_links: int, parser_name: str):
### Indexing Stage
def log_indexing_process_started():
def log_indexing_process_started(num_links: int):
start_ts = datetime.now()
_LAST_RUN_STATS.index_start_ts = start_ts
print()
print('{green}[*] [{}] Saving main index files...{reset}'.format(
print('{green}[*] [{}] Updating {} links in main index...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
**ANSI,
))
def log_indexing_started(out_dir: str, out_file: str):
sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))
def log_indexing_finished(out_dir: str, out_file: str):
def log_indexing_process_finished():
end_ts = datetime.now()
_LAST_RUN_STATS.index_end_ts = end_ts
print('\r{}/{}'.format(out_dir, out_file))
def log_indexing_started(out_path: str):
if IS_TTY:
sys.stdout.write(f' > {out_path}')
def log_indexing_finished(out_path: str):
print(f'\r{out_path}')
### Archiving Stage
@@ -108,7 +112,7 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
print(' To view your archive, open:')
print(' {}/index.html'.format(OUTPUT_DIR))
print(' Continue archiving where you left off by running:')
print(' archivebox {}'.format(timestamp))
print(' archivebox update --resume={}'.format(timestamp))
def log_archiving_finished(num_links: int):
end_ts = datetime.now()
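
The started/finished split implements a classic TTY progress idiom: write the path with no trailing newline, then once the write completes, rewind with a carriage return and overwrite the line. A self-contained sketch:

import sys
import time

sys.stdout.write(' > output/index.json')   # no newline: the cursor stays on this line
sys.stdout.flush()
time.sleep(1)                              # stand-in for the actual index write
print('\routput/index.json    ')           # \r rewinds to column 0 and overwrites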

View file

@@ -9,6 +9,7 @@ from .util import enforce_types, TimedProgress
from .index import (
links_after_timestamp,
load_main_index,
import_new_links,
write_main_index,
)
from .archive_methods import archive_link
@@ -19,8 +20,9 @@ from .config import (
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
DATABASE_FILE,
LOGS_DIR,
JSON_INDEX_FILENAME,
SQL_INDEX_FILENAME,
check_dependencies,
check_data_folder,
setup_django,
@@ -36,60 +38,85 @@ from .logs import (
)
ALLOWED_IN_OUTPUT_DIR = {
'.DS_Store',
'.venv',
'venv',
'virtualenv',
'.virtualenv',
'sources',
'archive',
'logs',
'static',
}
@enforce_types
def init():
os.makedirs(OUTPUT_DIR, exist_ok=True)
harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv', 'sources', 'archive', 'database', 'logs', 'static'}
is_empty = not len(set(os.listdir(OUTPUT_DIR)) - harmless_files)
existing_index = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
if is_empty:
stderr('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
write_main_index([], out_dir=OUTPUT_DIR, finished=True)
print('{green}[+] Initializing new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
print('{green}----------------------------------------------------------------{reset}'.format(**ANSI))
else:
if existing_index:
stderr('{green}[√] You already have an ArchiveBox collection in the current folder.{reset}'.format(**ANSI))
stderr(f' {OUTPUT_DIR}')
stderr(f' > index.html')
stderr(f' > index.json')
print('{green}[√] You already have an ArchiveBox collection in the current folder.{reset}'.format(**ANSI))
print('{green}----------------------------------------------------------------{reset}'.format(**ANSI))
print(f' {OUTPUT_DIR}')
else:
stderr(
("{red}[X] This folder already has files in it. You must run init inside a completely empty directory.{reset}"
("{red}[X] This folder appears to have non-ArchiveBox files in it. You must run 'archivebox init' inside a completely empty directory.{reset}"
"\n\n"
" {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
" just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
" just cd into the folder and run 'archivebox update' to pick up where you left off.\n\n"
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
).format(OUTPUT_DIR, **ANSI)
)
raise SystemExit(1)
os.makedirs(SOURCES_DIR, exist_ok=True)
stderr(f' > sources/')
os.makedirs(ARCHIVE_DIR, exist_ok=True)
stderr(f' > archive/')
os.makedirs(DATABASE_DIR, exist_ok=True)
setup_django()
from django.core.management import call_command
from django.contrib.auth.models import User
stderr(f' > database/')
print(f' > {SOURCES_DIR}')
stderr('\n{green}[+] Running Django migrations...{reset}'.format(**ANSI))
os.makedirs(ARCHIVE_DIR, exist_ok=True)
print(f' > {ARCHIVE_DIR}')
os.makedirs(LOGS_DIR, exist_ok=True)
print(f' > {LOGS_DIR}')
print('\n{green}[+] Running Django migrations...{reset}'.format(**ANSI))
setup_django(OUTPUT_DIR, check_db=False)
from django.core.management import call_command
from django.conf import settings
assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
print(f' {settings.DATABASE_FILE}')
call_command("makemigrations", interactive=False)
call_command("migrate", interactive=False)
if not User.objects.filter(is_superuser=True).exists():
stderr('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
call_command("createsuperuser", interactive=True)
stderr('\n{green}------------------------------------------------------------{reset}'.format(**ANSI))
stderr('{green}[√] Done. ArchiveBox collection is set up in current folder.{reset}'.format(**ANSI))
stderr(' To add new links, you can run:')
stderr(" archivebox add 'https://example.com'")
stderr()
stderr(' For more usage and examples, run:')
stderr(' archivebox help')
assert os.path.exists(settings.DATABASE_FILE)
# from django.contrib.auth.models import User
# if IS_TTY and not User.objects.filter(is_superuser=True).exists():
# print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
# call_command("createsuperuser", interactive=True)
if existing_index:
all_links = load_main_index(out_dir=OUTPUT_DIR)
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
else:
write_main_index([], out_dir=OUTPUT_DIR)
print('\n{green}----------------------------------------------------------------{reset}'.format(**ANSI))
print('{green}[√] Done. ArchiveBox collection is set up in the current folder.{reset}'.format(**ANSI))
print(' To add new links, you can run:')
print(" archivebox add 'https://example.com'")
print()
print(' For more usage and examples, run:')
print(' archivebox help')
@@ -102,7 +129,11 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
all_links, new_links = load_main_index(out_dir=OUTPUT_DIR, import_path=import_path)
all_links: List[Link] = []
new_links: List[Link] = []
all_links = load_main_index(out_dir=OUTPUT_DIR)
if import_path:
all_links, new_links = import_new_links(all_links, import_path)
# Step 2: Write updated index with deduped old and new links back to disk
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)
@@ -127,7 +158,7 @@ def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]
log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
all_links = load_main_index(out_dir=OUTPUT_DIR)
write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
return all_links
@@ -152,7 +183,7 @@ def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str
def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
all_links = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links:
if after is not None and float(link.timestamp) < after:
@@ -198,7 +229,7 @@ def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
timer = TimedProgress(360, prefix=' ')
try:
to_keep = []
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
all_links = load_main_index(out_dir=OUTPUT_DIR)
for link in all_links:
should_remove = (
(after is not None and float(link.timestamp) < after)

View file

@@ -13,6 +13,7 @@ from ..config import (
GIT_SHA,
FOOTER_INFO,
ARCHIVE_DIR_NAME,
HTML_INDEX_FILENAME,
)
from ..util import (
enforce_types,
@@ -44,7 +45,7 @@ def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished:
copy_and_overwrite(join(TEMPLATES_DIR, 'static'), join(out_dir, 'static'))
rendered_html = main_index_template(links, finished=finished)
atomic_write(rendered_html, join(out_dir, 'index.html'))
atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
@enforce_types
@@ -100,7 +101,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
rendered_html = link_details_template(link)
atomic_write(rendered_html, join(out_dir, 'index.html'))
atomic_write(rendered_html, join(out_dir, HTML_INDEX_FILENAME))
@enforce_types

View file

@@ -1,6 +1,7 @@
__package__ = 'archivebox.legacy.storage'
import os
import sys
import json
from datetime import datetime
@@ -10,12 +11,33 @@ from ..schema import Link, ArchiveResult
from ..config import (
VERSION,
OUTPUT_DIR,
FOOTER_INFO,
GIT_SHA,
DEPENDENCIES,
JSON_INDEX_FILENAME,
)
from ..util import (
enforce_types,
atomic_write,
)
MAIN_INDEX_HEADER = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
'schema': 'archivebox.legacy.storage.json',
'copyright_info': FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
'cmd': sys.argv,
'version': VERSION,
'git_sha': GIT_SHA,
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/pirate/ArchiveBox/wiki',
'source': 'https://github.com/pirate/ArchiveBox',
'issues': 'https://github.com/pirate/ArchiveBox/issues',
'dependencies': DEPENDENCIES,
},
}
### Main Links Index
@@ -23,7 +45,7 @@ from ..util import (
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
"""parse a archive index json file and return the list of links"""
index_path = os.path.join(out_dir, 'index.json')
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
links = json.load(f)['links']
@@ -46,18 +68,13 @@ def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
if links and links[0].sources:
assert isinstance(links[0].sources[0], str)
path = os.path.join(out_dir, 'index.json')
index_json = {
'info': 'ArchiveBox Index',
'source': 'https://github.com/pirate/ArchiveBox',
'docs': 'https://github.com/pirate/ArchiveBox/wiki',
'version': VERSION,
main_index_json = {
**MAIN_INDEX_HEADER,
'num_links': len(links),
'updated': datetime.now(),
'links': links,
}
atomic_write(index_json, path)
atomic_write(main_index_json, os.path.join(out_dir, JSON_INDEX_FILENAME))
### Link Details Index
@@ -67,7 +84,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
out_dir = out_dir or link.link_dir
path = os.path.join(out_dir, 'index.json')
path = os.path.join(out_dir, JSON_INDEX_FILENAME)
atomic_write(link._asdict(extended=True), path)
@@ -75,7 +92,7 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
@enforce_types
def parse_json_link_details(out_dir: str) -> Optional[Link]:
"""load the json link index from a given directory"""
existing_index = os.path.join(out_dir, 'index.json')
existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
link_json = json.load(f)
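
atomic_write (imported from ..util above, implementation not shown in this diff) is what prevents a crash mid-write from leaving a truncated index.json behind. A plausible sketch of how such a helper typically works, written here as an assumption rather than the actual ArchiveBox implementation:

import json
import os
import tempfile

def atomic_write_sketch(contents, path: str) -> None:
    """write to a temp file in the target directory, then rename over the destination"""
    with tempfile.NamedTemporaryFile('w', dir=os.path.dirname(path) or '.',
                                     delete=False, encoding='utf-8') as f:
        if isinstance(contents, str):
            f.write(contents)
        else:
            json.dump(contents, f, indent=4, default=str)   # default=str handles datetimes
        tmp_name = f.name
    os.replace(tmp_name, path)   # atomic rename: readers see the old or new file, never a partial one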

View file

@@ -4,14 +4,14 @@ from typing import List, Iterator
from ..schema import Link
from ..util import enforce_types
from ..config import setup_django
from ..config import setup_django, OUTPUT_DIR
### Main Links Index
@enforce_types
def parse_sql_main_index() -> Iterator[Link]:
setup_django()
def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
setup_django(out_dir, check_db=True)
from core.models import Page
return (
@@ -20,8 +20,8 @@ def parse_sql_main_index() -> Iterator[Link]:
)
@enforce_types
def write_sql_main_index(links: List[Link]) -> None:
setup_django()
def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
setup_django(out_dir, check_db=True)
from core.models import Page
for link in links:

View file

@@ -27,6 +27,11 @@ os.environ.update(TEST_CONFIG)
from .legacy.main import init
from .legacy.index import load_main_index
from .legacy.config import (
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
)
from .cli import (
archivebox_init,
@@ -55,12 +60,12 @@ and example14.badb
<or>htt://example15.badc</that>
'''
stdout = sys.stdout
stderr = sys.stderr
@contextmanager
def output_hidden(show_failing=True):
stdout = sys.stdout
stderr = sys.stderr
if not HIDE_CLI_OUTPUT:
yield
return
@@ -100,6 +105,11 @@ class TestInit(unittest.TestCase):
with output_hidden():
archivebox_init.main([])
assert os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
assert os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
assert os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
def test_conflicting_init(self):
with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
f.write('test')
@@ -108,9 +118,25 @@ class TestInit(unittest.TestCase):
with output_hidden(show_failing=False):
archivebox_init.main([])
assert False, 'Init should have exited with an exception'
except SystemExit:
pass
assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
try:
load_main_index(out_dir=OUTPUT_DIR)
assert False, 'load_main_index should raise an exception when no index is present'
except:
pass
def test_no_dirty_state(self):
with output_hidden():
init()
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
with output_hidden():
init()
class TestAdd(unittest.TestCase):
def setUp(self):
@@ -125,7 +151,7 @@ class TestAdd(unittest.TestCase):
with output_hidden():
archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 30
def test_add_arg_file(self):
@@ -136,7 +162,7 @@ class TestAdd(unittest.TestCase):
with output_hidden():
archivebox_add.main([test_file])
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 12
os.remove(test_file)
@@ -144,7 +170,7 @@ class TestAdd(unittest.TestCase):
with output_hidden():
archivebox_add.main([], stdin=test_urls)
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 12
@@ -155,29 +181,29 @@ class TestRemove(unittest.TestCase):
init()
archivebox_add.main([], stdin=test_urls)
def tearDown(self):
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
# def tearDown(self):
# shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
def test_remove_exact(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 11
def test_remove_regex(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', 'http(s)?:\/\/(.+\.)?(example\d\.com)'])
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 4
def test_remove_domain(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
all_links, _ = load_main_index(out_dir=OUTPUT_DIR)
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 10
def test_remove_none(self):
@@ -190,4 +216,7 @@ class TestRemove(unittest.TestCase):
if __name__ == '__main__':
if '--verbose' in sys.argv or '-v' in sys.argv:
HIDE_CLI_OUTPUT = False
unittest.main()
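
The output_hidden helper (shown truncated above) swaps sys.stdout/sys.stderr for a buffer during each test and replays the captured output only when a test fails. A simplified, self-contained version of the same pattern:

import io
import sys
from contextlib import contextmanager

@contextmanager
def output_hidden_sketch(show_failing: bool=True):
    real_stdout, real_stderr = sys.stdout, sys.stderr
    sys.stdout = sys.stderr = buffer = io.StringIO()
    try:
        yield
    except Exception:
        if show_failing:
            real_stdout.write(buffer.getvalue())   # replay output for the failing test
        raise
    finally:
        sys.stdout, sys.stderr = real_stdout, real_stderr   # always restore the real streams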

View file

@@ -1,17 +0,0 @@
dataclasses
django
base32-crockford
setuptools
ipdb
mypy
django-stubs
flake8
#wpull
#pywb
#pyppeteer
#GitPython
#youtube-dl
#archivenow
#requests

View file

@@ -31,7 +31,7 @@ setuptools.setup(
'Bug Tracker': 'https://github.com/pirate/ArchiveBox/issues',
'Roadmap': 'https://github.com/pirate/ArchiveBox/wiki/Roadmap',
'Changelog': 'https://github.com/pirate/ArchiveBox/wiki/Changelog',
'Donations': 'https://github.com/pirate/ArchiveBox/wiki/Donations',
'Patreon': 'https://github.com/pirate/ArchiveBox/wiki/Donations',
},
packages=setuptools.find_packages(),
python_requires='>=3.6',
@@ -40,6 +40,15 @@ setuptools.setup(
"base32-crockford==0.3.0",
"django==2.2",
"django-extensions==2.1.6",
"youtube-dl",
# Some/all of these will likely be added in the future:
# wpull
# pywb
# pyppeteer
# archivenow
# requests
],
entry_points={
'console_scripts': [