colorize output and add progress bar

commit a167d2a1f4 (parent 172c4ad5b8)
5 changed files with 226 additions and 89 deletions
archive.py (55 changes)

@@ -7,13 +7,13 @@ import os
 import sys
 
 from datetime import datetime
 from subprocess import run
 
 from parse import parse_export
 from index import dump_index
 from fetch import dump_website
 from config import (
     ARCHIVE_PERMISSIONS,
+    ANSI,
     check_dependencies,
 )
 
@@ -24,23 +24,23 @@ __DOCUMENTATION__ = 'https://github.com/pirate/bookmark-archiver'
 def create_archive(export_file, service=None, resume=None):
     """update or create index.html and download archive of all links"""
 
-    with open(export_file, 'r', encoding='utf-8') as f:
-        print('[+] [{}] Starting archive from {} export file.'.format(
-            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            export_file
-        ))
+    print('[*] [{}] Starting archive from {} export file.'.format(
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        export_file,
+    ))
+
+    with open(export_file, 'r', encoding='utf-8') as f:
         links, service = parse_export(f, service=service)
 
-        if resume:
-            try:
-                links = [
-                    link
-                    for link in links
-                    if float(link['timestamp']) >= float(resume)
-                ]
-            except TypeError:
-                print('Resume value and all timestamp values must be valid numbers.')
+    if resume:
+        try:
+            links = [
+                link
+                for link in links
+                if float(link['timestamp']) >= float(resume)
+            ]
+        except TypeError:
+            print('Resume value and all timestamp values must be valid numbers.')
 
     if not links or not service:
         print('[X] No links found in {}, is it a {} export file?'.format(export_file, service))
@@ -53,20 +53,21 @@ def create_archive(export_file, service=None, resume=None):
         os.makedirs(os.path.join(service, 'archive'))
 
     dump_index(links, service)
 
     run(['chmod', '-R', ARCHIVE_PERMISSIONS, service], timeout=30)
 
-    print('[*] [{}] Created archive index with {} links.'.format(
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        len(links),
-    ))
-
     check_dependencies()
+    try:
+        for link in links:
+            dump_website(link, service)
+    except (KeyboardInterrupt, SystemExit, Exception):
+        print('{red}[X] Archive creation stopped.{reset}'.format(**ANSI))
+        print('    Continue where you left off by running:')
+        print('        ./archive.py {} {} {}'.format(
+            export_file,
+            service,
+            link['timestamp'],
+        ))
+        raise SystemExit(1)
 
-    for link in links:
-        dump_website(link, service)
-
-    print('[√] [{}] Archive update complete.'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
+    print('{}[√] [{}] Archive update complete.{}'.format(ANSI['green'], datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ANSI['reset']))
 
 
 if __name__ == '__main__':
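Note on the colorizing pattern used throughout this commit: escape codes live in one dict and are spliced into messages with str.format, so turning color off just means blanking the dict values. Below is a minimal standalone sketch of that idea; the names mirror the diff, but the simplified USE_COLOR check is illustrative, not the commit's exact logic.

    import os
    import sys

    # same idea as the ANSI dict added to config.py in this diff
    ANSI = {
        'reset': '\033[00;00m',
        'red':   '\033[01;31m',
        'green': '\033[01;32m',
    }

    # disabling color = replacing every escape code with an empty string
    if not (sys.stdout.isatty() and os.getenv('USE_COLOR', 'true').lower() == 'true'):
        ANSI = {k: '' for k in ANSI}

    # messages name their colors as format placeholders, so the same string
    # works whether color is enabled or not
    print('{red}[X] Archive creation stopped.{reset}'.format(**ANSI))
    print('{green}[√] Archive update complete.{reset}'.format(**ANSI))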
config.py (136 changes)

@@ -1,7 +1,10 @@
 import os
 import sys
+import time
+import shutil
 
-from subprocess import run, PIPE
+from subprocess import run, PIPE, DEVNULL
+from multiprocessing import Process
 
 # os.getenv('VARIABLE', 'DEFAULT') gets the value of environment
 # variable "VARIABLE" and if it is not set, sets it to 'DEFAULT'
@@ -9,10 +12,14 @@ from subprocess import run, PIPE
 # for boolean values, check to see if the string is 'true', and
 # if so, the python variable will be True
 
+IS_TTY = sys.stdout.isatty()
+
+USE_COLOR =             os.getenv('USE_COLOR',             str(IS_TTY)).lower() == 'true'
+SHOW_PROGRESS =         os.getenv('SHOW_PROGRESS',         str(IS_TTY)).lower() == 'true'
 FETCH_WGET =            os.getenv('FETCH_WGET',            'True').lower() == 'true'
 FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True').lower() == 'true'
-FETCH_AUDIO =           os.getenv('FETCH_AUDIO',           'False').lower() == 'true'
-FETCH_VIDEO =           os.getenv('FETCH_VIDEO',           'False').lower() == 'true'
+FETCH_AUDIO =           os.getenv('FETCH_AUDIO',           'False').lower() == 'true'
+FETCH_VIDEO =           os.getenv('FETCH_VIDEO',           'False').lower() == 'true'
 FETCH_PDF =             os.getenv('FETCH_PDF',             'True').lower() == 'true'
 FETCH_SCREENSHOT =      os.getenv('FETCH_SCREENSHOT',      'True').lower() == 'true'
 FETCH_FAVICON =         os.getenv('FETCH_FAVICON',         'True').lower() == 'true'
@@ -25,44 +32,143 @@ TIMEOUT = int(os.getenv('TIMEOUT', '60'))
 INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html')
 INDEX_ROW_TEMPLATE = os.getenv('INDEX_ROW_TEMPLATE', 'templates/index_row.html')
 
+TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns
+ANSI = {
+    'reset': '\033[00;00m',
+    'lightblue': '\033[01;30m',
+    'lightyellow': '\033[01;33m',
+    'lightred': '\033[01;35m',
+    'red': '\033[01;31m',
+    'green': '\033[01;32m',
+    'blue': '\033[01;34m',
+    'white': '\033[01;37m',
+    'black': '\033[01;30m',
+}
+if not USE_COLOR:
+    # dont show colors if USE_COLOR is False
+    ANSI = {k: '' for k in ANSI.keys()}
 
 
 ### Util Functions
 
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""
 
     print('[*] Checking Dependencies:')
 
     python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
     if python_vers < 3.5:
-        print('[X] Python version is not new enough: {} (>3.5 is required)'.format(python_vers))
+        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
         print('    See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.')
         raise SystemExit(1)
 
     if FETCH_PDF or FETCH_SCREENSHOT:
         if run(['which', CHROME_BINARY]).returncode:
-            print('[X] Missing dependency: {}'.format(CHROME_BINARY))
+            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
             print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
             print('    See https://github.com/pirate/bookmark-archiver for help.')
             raise SystemExit(1)
 
         # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
-        result = run([CHROME_BINARY, '--version'], stdout=PIPE)
-        version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0]  # TODO: regex might be better
-        if int(version) < 59:
-            print('[X] Chrome version must be 59 or greater for headless PDF and screenshot saving')
+        try:
+            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
+            version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0]  # TODO: regex might be better
+            if int(version) < 59:
+                print('{red}[X] Chrome version must be 59 or greater for headless PDF and screenshot saving{reset}'.format(**ANSI))
+                print('    See https://github.com/pirate/bookmark-archiver for help.')
+                raise SystemExit(1)
+        except (TypeError, OSError):
+            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
+            print('    See https://github.com/pirate/bookmark-archiver for help.')
+            raise SystemExit(1)
 
     if FETCH_WGET:
-        if run(['which', 'wget']).returncode:
-            print('[X] Missing dependency: wget')
+        if run(['which', 'wget']).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
             print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget'))
             print('    See https://github.com/pirate/bookmark-archiver for help.')
             raise SystemExit(1)
 
     if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
-        if run(['which', 'curl']).returncode:
-            print('[X] Missing dependency: curl')
+        if run(['which', 'curl']).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
             print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl'))
             print('    See https://github.com/pirate/bookmark-archiver for help.')
             raise SystemExit(1)
 
     if FETCH_AUDIO or FETCH_VIDEO:
-        if run(['which', 'youtube-dl']).returncode:
-            print('[X] Missing dependency: youtube-dl')
+        if run(['which', 'youtube-dl']).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
             print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl'))
             print('    See https://github.com/pirate/bookmark-archiver for help.')
             raise SystemExit(1)
 
 
+def chmod_file(path, cwd='.', permissions=ARCHIVE_PERMISSIONS, timeout=30):
+    """chmod -R <permissions> <cwd>/<path>"""
+
+    if not os.path.exists(os.path.join(cwd, path)):
+        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
+
+    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
+    if chmod_result.returncode == 1:
+        print('     ', chmod_result.stderr.decode())
+        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
+
+
+def progress(seconds=TIMEOUT, prefix=''):
+    """Show a (subprocess-controlled) progress bar with a <seconds> timeout,
+       returns end() function to instantly finish the progress
+    """
+
+    if not SHOW_PROGRESS:
+        return lambda: None
+
+    chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
+
+    def progress_bar(seconds=seconds, prefix=prefix):
+        """show timer in the form of progress bar, with percentage and seconds remaining"""
+        try:
+            for s in range(seconds * chunks):
+                progress = s / chunks / seconds * 100
+                bar_width = round(progress/(100/chunks))
+
+                # ████████████████████           0.9% (1/60sec)
+                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
+                    prefix,
+                    ANSI['green'],
+                    ('█' * bar_width).ljust(chunks),
+                    ANSI['reset'],
+                    round(progress, 1),
+                    round(s/chunks),
+                    seconds,
+                ))
+                sys.stdout.flush()
+                time.sleep(1 / chunks)
+
+            # ██████████████████████████████████ 100.0% (60/60sec)
+            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
+                prefix,
+                ANSI['red'],
+                '█' * chunks,
+                ANSI['reset'],
+                100.0,
+                seconds,
+                seconds,
+            ))
+            sys.stdout.flush()
+        except KeyboardInterrupt:
+            print()
+            pass
+
+    p = Process(target=progress_bar)
+    p.start()
+
+    def end():
+        """immediately finish progress and clear the progressbar line"""
+        p.terminate()
+        sys.stdout.write('\r{}\r'.format(' ' * TERM_WIDTH))  # clear whole terminal line
+        sys.stdout.flush()
+
+    return end
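How the new progress() helper is meant to be consumed: it forks a child process that animates the bar for up to <seconds>, and returns an end() callback that kills the child and clears the line. Callers give the real subprocess one extra second (timeout + 1) so the bar can run its full course before the work is forcibly cut off. The calling pattern below mirrors the fetch_* functions in this diff, assuming config.py is importable; the command and timeout are placeholder values.

    from subprocess import run, PIPE
    from config import progress  # the helper added above

    timeout = 60
    end = progress(timeout, prefix='      ')  # bar starts animating in a child process
    try:
        # the bar covers `timeout` seconds; the command gets one second more
        result = run(['wget', 'https://example.com'], stdout=PIPE, stderr=PIPE, timeout=timeout + 1)
        end()   # finished early: kill the bar and clear the line
    except Exception:
        end()   # also clear the bar before printing any error output
        raise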
fetch.py (89 changes)

@@ -16,38 +16,35 @@ from config import (
     FETCH_AUDIO,
     FETCH_VIDEO,
     FETCH_FAVICON,
     TIMEOUT,
+    ANSI,
+    progress,
+    chmod_file,
 )
 
 
-def chmod_file(path, cwd='.', permissions='755', timeout=30):
-    if not os.path.exists(os.path.join(cwd, path)):
-        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
-
-    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
-    if chmod_result.returncode == 1:
-        print('     ', chmod_result.stderr.decode())
-        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
-
-
 def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=60):
     """download full site using wget"""
 
-    domain = link['base_url'].split('/', 1)[0]
-    if not os.path.exists(os.path.join(out_dir, domain)) or overwrite:
-        print('    - Downloading Full Site')
+    if not os.path.exists(os.path.join(out_dir, link['domain'])) or overwrite:
+        print('    - Downloading full site')
         CMD = [
             *'wget --timestamping --adjust-extension --no-parent'.split(' '),  # Docs: https://www.gnu.org/software/wget/manual/wget.html
             *(('--page-requisites', '--convert-links') if requisites else ()),
             link['url'],
         ]
+        end = progress(timeout, prefix='      ')
        try:
-            result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)  # dom.html
+            result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # index.html
+            end()
            if result.returncode > 0:
-                print('       ', result.stderr.decode().split('\n')[-1])
+                print('\n'.join('        ' + line for line in result.stderr.decode().rsplit('\n', 5)[-3:] if line.strip()))
                raise Exception('Failed to wget download')
-            chmod_file(domain, cwd=out_dir)
+            chmod_file(link['domain'], cwd=out_dir)
        except Exception as e:
+            end()
            print('        Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('        Failed: {} {}'.format(e.__class__.__name__, e))
+            print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
    else:
        print('    √ Skipping site download')
@@ -63,15 +60,18 @@ def fetch_pdf(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromiu
             *'--headless --disable-gpu --print-to-pdf'.split(' '),
             link['url']
         ]
+        end = progress(timeout, prefix='      ')
         try:
-            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout)  # output.pdf
+            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # output.pdf
+            end()
             if result.returncode:
                 print('     ', result.stderr.decode())
                 raise Exception('Failed to print PDF')
             chmod_file('output.pdf', cwd=out_dir)
         except Exception as e:
+            end()
             print('        Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('        Failed: {} {}'.format(e.__class__.__name__, e))
+            print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping PDF print')
@@ -88,15 +88,18 @@ def fetch_screenshot(out_dir, link, overwrite=False, timeout=60, chrome_binary='
             '--window-size={}'.format(resolution),
             link['url']
         ]
+        end = progress(timeout, prefix='      ')
         try:
-            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # sreenshot.png
+            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # sreenshot.png
+            end()
             if result.returncode:
                 print('     ', result.stderr.decode())
                 raise Exception('Failed to take screenshot')
             chmod_file('screenshot.png', cwd=out_dir)
         except Exception as e:
+            end()
             print('        Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('        Failed: {} {}'.format(e.__class__.__name__, e))
+            print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping screenshot')
@@ -111,8 +114,10 @@ def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
 
     success = False
     CMD = ['curl', '-I', submit_url]
+    end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # archive.org
+        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # archive.org.txt
+        end()
         headers = result.stdout.splitlines()
         content_location = [h for h in headers if b'Content-Location: ' in h]
         if content_location:
@@ -122,11 +127,12 @@ def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
         else:
             raise Exception('Failed to find "Content-Location" URL header in Archive.org response.')
     except Exception as e:
+        end()
         print('        Visit url to see output:', ' '.join(CMD))
-        print('        Failed: {} {}'.format(e.__class__.__name__, e))
+        print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
 
     if success:
-        with open('{}/archive.org.txt'.format(out_dir), 'w', encoding='utf-8') as f:
+        with open(os.path.join(out_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
             f.write(saved_url)
         chmod_file('archive.org.txt', cwd=out_dir)
@@ -140,14 +146,17 @@ def fetch_favicon(out_dir, link, overwrite=False, timeout=60):
 
     if not os.path.exists(path) or overwrite:
         print('    - Fetching Favicon')
-        CMD = 'curl https://www.google.com/s2/favicons?domain={domain}'.format(**link).split(' ')
+        CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
         fout = open('{}/favicon.ico'.format(out_dir), 'w')
+        end = progress(timeout, prefix='      ')
         try:
-            run([*CMD], stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # favicon.ico
+            run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # favicon.ico
+            end()
             chmod_file('favicon.ico', cwd=out_dir)
         except Exception as e:
+            end()
             print('        Run to see full output:', ' '.join(CMD))
-            print('        Failed: {} {}'.format(e.__class__.__name__, e))
+            print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         fout.close()
     else:
         print('    √ Skipping favicon')
@@ -166,15 +175,18 @@ def fetch_audio(out_dir, link, overwrite=False, timeout=60):
             "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
             link['url'],
         ]
+        end = progress(timeout, prefix='      ')
         try:
-            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # sreenshot.png
+            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # audio/audio.mp3
+            end()
             if result.returncode:
                 print('     ', result.stderr.decode())
                 raise Exception('Failed to download audio')
             chmod_file('audio', cwd=out_dir)
         except Exception as e:
+            end()
             print('        Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('        Failed: {} {}'.format(e.__class__.__name__, e))
+            print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping audio download')
@@ -189,27 +201,30 @@ def fetch_video(out_dir, link, overwrite=False, timeout=60):
     if not os.path.exists(path) or overwrite:
         print('    - Downloading video')
         CMD = [
-            "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
+            "youtube-dl -x --video-format mp4 --audio-quality 0 -o '%(title)s.%(ext)s'",
             link['url'],
         ]
+        end = progress(timeout, prefix='      ')
         try:
-            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # sreenshot.png
+            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # video/movie.mp4
+            end()
             if result.returncode:
                 print('     ', result.stderr.decode())
                 raise Exception('Failed to download video')
             chmod_file('video', cwd=out_dir)
         except Exception as e:
+            end()
             print('        Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('        Failed: {} {}'.format(e.__class__.__name__, e))
+            print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping video download')
 
-def dump_link_info(out_dir, link, update=True):
+def dump_link_info(out_dir, link, overwrite=False):
     """write a json file with some info about the link"""
 
     info_file_path = os.path.join(out_dir, 'link.json')
 
-    if (not os.path.exists(info_file_path) or update):
+    if (not os.path.exists(info_file_path) or overwrite):
         print('    - Creating link info file')
         try:
             link_json = derived_link_info(link)
@@ -223,7 +238,7 @@ def dump_link_info(out_dir, link, update=True):
 
             chmod_file('link.json', cwd=out_dir)
         except Exception as e:
-            print('        Failed: {} {}'.format(e.__class__.__name__, e))
+            print('        {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping link info file')
@@ -231,7 +246,7 @@ def dump_link_info(out_dir, link, update=True):
 def dump_website(link, service, overwrite=False, permissions=ARCHIVE_PERMISSIONS):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
-    print('[+] [{timestamp} ({time})] "{title}": {base_url}'.format(**link))
+    print('[{green}+{reset}] [{timestamp} ({time})] "{title}": {blue}{base_url}{reset}'.format(**link, **ANSI))
 
     out_dir = os.path.join(service, 'archive', link['timestamp'])
     if not os.path.exists(out_dir):
@@ -243,7 +258,7 @@ def dump_website(link, service, overwrite=False, permissions=ARCHIVE_PERMISSIONS
         print('    i Type: {}'.format(link['type']))
 
     if not (link['url'].startswith('http') or link['url'].startswith('ftp')):
-        print('    X Skipping: invalid link.')
+        print('    {}X Skipping: invalid link.{}', ANSI['red'], ANSI['yellow'])
         return
 
     if FETCH_WGET:
@@ -267,4 +282,4 @@ def dump_website(link, service, overwrite=False, permissions=ARCHIVE_PERMISSIONS
     if FETCH_FAVICON:
         fetch_favicon(out_dir, link, overwrite=overwrite)
 
-    dump_link_info(out_dir, link)
+    dump_link_info(out_dir, link, overwrite=overwrite)
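One small pattern worth calling out from the favicon change above: building the curl argv as a list, instead of formatting a string and calling .split(' '), passes the URL to run() as a single argument that can never be broken apart on whitespace. A quick illustrative comparison (the URL is a placeholder, and running it assumes curl is installed):

    from subprocess import run, DEVNULL

    url = 'https://www.google.com/s2/favicons?domain=example.com'

    # fragile: any space in the formatted string splits the argv in the wrong place
    cmd_via_split = 'curl {}'.format(url).split(' ')

    # robust: each argument is its own list element, exactly as in the new code
    cmd_as_list = ['curl', url]

    # the two happen to agree here only because this URL contains no spaces
    assert cmd_via_split == cmd_as_list

    run(cmd_as_list, stdout=DEVNULL, stderr=DEVNULL)  # fetches the favicon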
index.py (19 changes)

@@ -2,8 +2,14 @@ import os
 from datetime import datetime
 from string import Template
 
-from config import INDEX_TEMPLATE, INDEX_ROW_TEMPLATE
 from parse import derived_link_info
+from config import (
+    INDEX_TEMPLATE,
+    INDEX_ROW_TEMPLATE,
+    ARCHIVE_PERMISSIONS,
+    ANSI,
+    chmod_file,
+)
 
 
 def dump_index(links, service):
@@ -28,4 +34,13 @@ def dump_index(links, service):
     }
 
     with open(os.path.join(service, 'index.html'), 'w', encoding='utf-8') as f:
-        f.write(Template(index_html).substitute(template_vars))
+        f.write(Template(index_html).substitute(**template_vars))
+
+    chmod_file(service, permissions=ARCHIVE_PERMISSIONS)
+
+    print('[+] [{}] Created archive index with {}{}{} links.'.format(
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        ANSI['green'],
+        len(links),
+        ANSI['reset'],
+    ))
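Aside on the substitute() change: string.Template.substitute accepts either a mapping argument or keyword arguments, so for a plain dict of string keys the two calls above produce identical output; the switch to the keyword form is stylistic. A minimal demonstration:

    from string import Template

    t = Template('archived $n links')
    template_vars = {'n': 42}

    print(t.substitute(template_vars))    # mapping form:  archived 42 links
    print(t.substitute(**template_vars))  # keyword form:  archived 42 links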
parse.py (16 changes)

@@ -64,8 +64,8 @@ def parse_pocket_export(html_file):
             without_scheme = fixed_url.replace('http://', '').replace('https://', '')
             info = {
                 'url': fixed_url,
-                'domain': without_scheme.split('/')[0],    # without pathname
-                'base_url': without_scheme.split('?')[0],  # without query args
+                'domain': without_scheme.split('/', 1)[0],    # without pathname
+                'base_url': without_scheme.split('?', 1)[0],  # without query args
                 'time': datetime.fromtimestamp(int(match.group(2))).strftime('%Y-%m-%d %H:%M'),
                 'timestamp': match.group(2),
                 'tags': match.group(3),
@@ -84,10 +84,10 @@ def parse_json_export(json_file):
         erg = line
         info = {
             'url': erg['href'],
-            'domain': erg['href'].replace('http://', '').replace('https://', '').split('/')[0],
-            'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?')[0],
-            'time': datetime.fromtimestamp(int(time.mktime(time.strptime(erg['time'].split(',')[0], '%Y-%m-%dT%H:%M:%SZ')))),
-            'timestamp': str(int(time.mktime(time.strptime(erg['time'].split(',')[0], '%Y-%m-%dT%H:%M:%SZ')))),
+            'domain': erg['href'].replace('http://', '').replace('https://', '').split('/', 1)[0],
+            'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?', 1)[0],
+            'time': datetime.fromtimestamp(int(time.mktime(time.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')))),
+            'timestamp': str(int(time.mktime(time.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')))),
             'tags': erg['tags'],
             'title': erg['description'].replace(' — Readability', ''),
         }
@@ -108,8 +108,8 @@ def parse_bookmarks_export(html_file):
 
             info = {
                 'url': url,
-                'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
-                'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
+                'domain': url.replace('http://', '').replace('https://', '').split('/', 1)[0],
+                'base_url': url.replace('https://', '').replace('http://', '').split('?', 1)[0],
                 'time': dt,
                 'timestamp': secs,
                 'tags': "",
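The recurring change in parse.py is adding maxsplit=1 to str.split. Since only element [0] is ever used, the result is unchanged; the argument just stops Python from splitting (and allocating) the remainder of the string. A quick check with a placeholder URL:

    url = 'example.com/some/long/path?a=1&b=2'

    # both expressions select the same domain...
    assert url.split('/')[0] == url.split('/', 1)[0] == 'example.com'

    # ...but maxsplit=1 only ever produces two pieces, however long the path is
    print(url.split('/', 1))  # ['example.com', 'some/long/path?a=1&b=2']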