consistent naming and args refactor

parent 2aae6e0c27
commit d32afec54b

4 changed files with 164 additions and 142 deletions
README.md

@@ -377,10 +377,11 @@ will run fast subsequent times because it only downloads new links that haven't

 ## Changelog

-- v0.0.2 released
+- v0.0.3 released
 - support for chrome `--user-data-dir` to archive sites that need logins
 - fancy individual html & json indexes for each link
 - smartly append new links to existing index instead of overwriting
+- v0.0.2 released
 - proper HTML templating instead of format strings (thanks to https://github.com/bardisty!)
 - refactored into separate files, wip audio & video archiving
 - v0.0.1 released
archive.py (109 changed lines)
@@ -3,6 +3,7 @@
 # Nick Sweeting 2017 | MIT License
 # https://github.com/pirate/bookmark-archiver

+import os
 import sys

 from datetime import datetime
@@ -19,7 +20,6 @@ from index import (
 from config import (
     ARCHIVE_PERMISSIONS,
     HTML_FOLDER,
-    ARCHIVE_FOLDER,
     ANSI,
     TIMEOUT,
 )
@@ -33,19 +33,50 @@ from util import (
 __DESCRIPTION__ = 'Bookmark Archiver: Create a browsable html archive of a list of links.'
 __DOCUMENTATION__ = 'https://github.com/pirate/bookmark-archiver'

+def print_help():
+    print(__DESCRIPTION__)
+    print("Documentation: {}\n".format(__DOCUMENTATION__))
+    print("Usage:")
+    print(" ./archive.py ~/Downloads/bookmarks_export.html\n")
+
+
-def update_archive(export_path, links, resume=None, append=True):
+def get_links(new_links_file_path, archive_path=HTML_FOLDER):
+    """get new links from file and optionally append them to links in existing archive"""
+    # parse and validate the new_links_file
+    raw_links = parse_links(new_links_file_path)
+    valid_links = validate_links(raw_links)
+
+    # merge existing links in archive_path and new links
+    existing_links = []
+    if archive_path:
+        existing_links = parse_json_links_index(archive_path)
+        valid_links = validate_links(existing_links + valid_links)
+
+    num_new_links = len(valid_links) - len(existing_links)
+    print('[*] [{}] Adding {} new links from {} to index'.format(
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        num_new_links,
+        new_links_file_path,
+    ))
+
+    return valid_links
+
+
+def update_archive(archive_path, links, source=None, resume=None, append=True):
     """update or create index.html+json given a path to an export file containing new links"""

     start_ts = datetime.now().timestamp()

     # loop over links and archive them
-    archive_links(ARCHIVE_FOLDER, links, export_path, resume=resume)
+    archive_links(archive_path, links, source=source, resume=resume)

     # print timing information & summary
     end_ts = datetime.now().timestamp()
-    seconds = round(end_ts - start_ts, 2)
-    duration = '{} min'.format(round(seconds / 60, 2)) if seconds > 60 else '{} sec'.format(seconds)
+    seconds = end_ts - start_ts
+    if seconds > 60:
+        duration = '{0:.2f} min'.format(seconds / 60, 2)
+    else:
+        duration = '{0:.2f} sec'.format(seconds, 2)

     print('{}[√] [{}] Archive update complete ({}){}'.format(
         ANSI['green'],
         datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@@ -57,53 +88,37 @@ def update_archive(export_path, links, resume=None, append=True):
     print(' - {} errors'.format(_RESULTS_TOTALS['failed']))


-def update_index(export_path, resume=None, append=True):
-    """handling parsing new links into the json index, returns a set of clean links"""
-
-    # parse an validate the export file
-    new_links = validate_links(parse_links(export_path))
-
-    # load existing links if archive folder is present
-    existing_links = []
-    if append:
-        existing_links = parse_json_links_index(HTML_FOLDER)
-    links = validate_links(existing_links + new_links)
-
-    # merge existing links and new links
-    num_new_links = len(links) - len(existing_links)
-    print('[*] [{}] Adding {} new links from {} to index'.format(
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        num_new_links,
-        export_path,
-    ))
-
-    # write link index html & json
-    write_links_index(HTML_FOLDER, links)
-
-    return links
-

 if __name__ == '__main__':
     argc = len(sys.argv)

-    if argc < 2 or sys.argv[1] in ('-h', '--help', 'help'):
-        print(__DESCRIPTION__)
-        print("Documentation: {}".format(__DOCUMENTATION__))
-        print("")
-        print("Usage:")
-        print(" ./archive.py ~/Downloads/bookmarks_export.html")
-        print("")
+    if argc < 2 or set(sys.argv).intersection('-h', '--help', 'help'):
+        print_help()
         raise SystemExit(0)

-    export_path = sys.argv[1]  # path to export file
-    resume_from = sys.argv[2] if argc > 2 else None  # timestamp to resume dowloading from
+    source = sys.argv[1]  # path to export file
+    resume = sys.argv[2] if argc > 2 else None  # timestamp to resume dowloading from
+
+    # See if archive folder already exists
+    for out_folder in (HTML_FOLDER, 'bookmarks', 'pocket', 'pinboard', 'html'):
+        if os.path.exists(out_folder):
+            break
+    else:
+        out_folder = HTML_FOLDER

-    if any(export_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        export_path = download_url(export_path)
+    archive_path = os.path.join(out_folder, 'archive')

-    links = update_index(export_path, resume=resume_from, append=True)
+    # Step 0: Download url to local file (only happens if a URL is specified instead of local path)
+    if any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+        source = download_url(source)

-    # make sure folder structure is sane
-    cleanup_archive(ARCHIVE_FOLDER, links)
-    update_archive(export_path, links, resume=resume_from, append=True)
+    # Step 1: Parse the links and dedupe them with existing archive
+    links = get_links(source, archive_path=archive_path)
+
+    # Step 2: Write new index
+    write_links_index(archive_path, links)
+
+    # Step 3: Verify folder structure is 1:1 with index
+    cleanup_archive(archive_path, links)
+
+    # Step 4: Run the archive methods for each link
+    update_archive(archive_path, links, source=source, resume=resume, append=True)
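For readers following the new control flow, here is a minimal sketch of the Step 0-4 pipeline that archive.py now runs, using the function names from the hunks above. The file path, output folder, and import layout are illustrative assumptions, not part of the commit:

    # sketch: drive the refactored pipeline by hand (requires the repo's modules on PYTHONPATH)
    import os
    from archive import get_links, update_archive        # assumed import locations
    from index import write_links_index
    from util import cleanup_archive, download_url

    source = 'bookmarks_export.html'                      # hypothetical export file
    archive_path = os.path.join('html', 'archive')        # hypothetical output folder

    if any(source.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        source = download_url(source)                     # Step 0: fetch a remote export locally

    links = get_links(source, archive_path=archive_path)  # Step 1: parse + dedupe against the index
    write_links_index(archive_path, links)                # Step 2: regenerate index.html / index.json
    cleanup_archive(archive_path, links)                  # Step 3: make folders match the index
    update_archive(archive_path, links, source=source,    # Step 4: run the per-link fetchers
                   resume=None, append=True)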
archive_methods.py

@@ -36,24 +36,24 @@ _RESULTS_TOTALS = { # globals are bad, mmkay
     'failed': 0,
 }

-def archive_links(out_dir, links, export_path, resume=None):
+def archive_links(archive_path, links, source=None, resume=None):
     check_dependencies()

     to_archive = links_after_timestamp(links, resume)
     try:
         for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(out_dir, link['timestamp'])
+            link_dir = os.path.join(archive_path, link['timestamp'])
             archive_link(link_dir, link)

     except (KeyboardInterrupt, SystemExit, Exception) as e:
-        print('{red}[X] Archive update stopped on #{idx} out of {total} links{reset}'.format(
+        print('{red}[X] Index is up-to-date, archive update paused on link {idx}/{total}{reset}'.format(
             **ANSI,
             idx=idx,
             total=len(list(to_archive)),
         ))
         print(' Continue where you left off by running:')
         print(' ./archive.py {} {}'.format(
-            export_path,
+            source,
             link['timestamp'],
         ))
         if not isinstance(e, KeyboardInterrupt):
@@ -61,42 +61,46 @@ def archive_links(out_dir, links, export_path, resume=None):
             raise SystemExit(1)


-def archive_link(out_dir, link, overwrite=False):
+def archive_link(link_dir, link, overwrite=False):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

-    link = {**parse_json_link_index(out_dir), **link}
-    log_link_archive(out_dir, link)
+    update_existing = os.path.exists(link_dir)
+    if update_existing:
+        link = {
+            **parse_json_link_index(link_dir),
+            **link,
+        }
+    else:
+        os.makedirs(link_dir)
+
+    log_link_archive(link_dir, link, update_existing)

     if FETCH_WGET:
-        link = fetch_wget(out_dir, link, overwrite=overwrite)
+        link = fetch_wget(link_dir, link, overwrite=overwrite)

     if FETCH_PDF:
-        link = fetch_pdf(out_dir, link, overwrite=overwrite)
+        link = fetch_pdf(link_dir, link, overwrite=overwrite)

     if FETCH_SCREENSHOT:
-        link = fetch_screenshot(out_dir, link, overwrite=overwrite)
+        link = fetch_screenshot(link_dir, link, overwrite=overwrite)

     if SUBMIT_ARCHIVE_DOT_ORG:
-        link = archive_dot_org(out_dir, link, overwrite=overwrite)
+        link = archive_dot_org(link_dir, link, overwrite=overwrite)

     # if FETCH_AUDIO:
-    #     link = fetch_audio(out_dir, link, overwrite=overwrite)
+    #     link = fetch_audio(link_dir, link, overwrite=overwrite)

     # if FETCH_VIDEO:
-    #     link = fetch_video(out_dir, link, overwrite=overwrite)
+    #     link = fetch_video(link_dir, link, overwrite=overwrite)

     if FETCH_FAVICON:
-        link = fetch_favicon(out_dir, link, overwrite=overwrite)
+        link = fetch_favicon(link_dir, link, overwrite=overwrite)

-    write_link_index(out_dir, link)
+    write_link_index(link_dir, link)

     return link

-def log_link_archive(out_dir, link):
-    update_existing = os.path.exists(out_dir)
-    if not update_existing:
-        os.makedirs(out_dir)
-
+def log_link_archive(link_dir, link, update_existing):
     print('[{symbol_color}{symbol}{reset}] [{timestamp}] "{title}": {blue}{base_url}{reset}'.format(
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],

@@ -106,7 +110,7 @@ def log_link_archive(out_dir, link):
     if link['type']:
         print(' i Type: {}'.format(link['type']))

-    print(' {} ({})'.format(out_dir, 'updating' if update_existing else 'creating'))
+    print(' {} ({})'.format(link_dir, 'updating' if update_existing else 'creating'))
@@ -118,7 +122,7 @@ def attach_result_to_link(method):
     """
     def decorator(fetch_func):
         @wraps(fetch_func)
-        def timed_fetch_func(out_dir, link, overwrite=False, **kwargs):
+        def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
             # initialize methods and history json field on link
             link['latest'] = link.get('latest') or {}
             link['latest'][method] = link['latest'].get(method) or None

@@ -133,7 +137,7 @@ def attach_result_to_link(method):
                 result = None
             else:
                 print(' - Fetching: {}'.format(method))
-                result = fetch_func(out_dir, link, **kwargs)
+                result = fetch_func(link_dir, link, **kwargs)

             end_ts = datetime.now().timestamp()
             duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]
@@ -164,10 +168,10 @@ def attach_result_to_link(method):


 @attach_result_to_link('wget')
-def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
+def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
     """download full site using wget"""

-    if os.path.exists(os.path.join(out_dir, link['domain'])):
+    if os.path.exists(os.path.join(link_dir, link['domain'])):
         return {'output': html_appended_url(link), 'status': 'skipped'}

     CMD = [

@@ -178,7 +182,7 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
     ]
     end = progress(timeout, prefix=' ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # index.html
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # index.html
         end()
         output = html_appended_url(link)
         if result.returncode > 0:

@@ -187,7 +191,7 @@ def fetch_wget(out_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT)
             # raise Exception('Failed to wget download')
     except Exception as e:
         end()
-        print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+        print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
         print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e
|
|||
|
||||
|
||||
@attach_result_to_link('pdf')
|
||||
def fetch_pdf(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
|
||||
def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
|
||||
"""print PDF of site to file using chrome --headless"""
|
||||
|
||||
if link['type'] in ('PDF', 'image'):
|
||||
return {'output': html_appended_url(link)}
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'output.pdf')):
|
||||
if os.path.exists(os.path.join(link_dir, 'output.pdf')):
|
||||
return {'output': 'output.pdf', 'status': 'skipped'}
|
||||
|
||||
CMD = [
|
||||
CHROME_BINARY,
|
||||
*'--headless --disable-gpu --print-to-pdf'.split(' '),
|
||||
*chrome_data_dir_args(user_data_dir),
|
||||
*chrome_headless(user_data_dir=user_data_dir),
|
||||
'--print-to-pdf',
|
||||
link['url']
|
||||
]
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1) # output.pdf
|
||||
result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.pdf
|
||||
end()
|
||||
if result.returncode:
|
||||
print(' ', (result.stderr or result.stdout).decode())
|
||||
|
@ -223,7 +226,7 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR
|
|||
output = 'output.pdf'
|
||||
except Exception as e:
|
||||
end()
|
||||
print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
|
||||
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
|
||||
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
|
||||
output = e
|
||||
|
||||
|
@@ -234,34 +237,33 @@ def fetch_pdf(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR


 @attach_result_to_link('screenshot')
-def fetch_screenshot(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
+def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
     """take screenshot of site using chrome --headless"""

     if link['type'] in ('PDF', 'image'):
         return {'output': html_appended_url(link)}

-    if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
+    if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
         return {'output': 'screenshot.png', 'status': 'skipped'}

     CMD = [
-        CHROME_BINARY,
-        *'--headless --disable-gpu --screenshot'.split(' '),
-        *chrome_data_dir_args(user_data_dir),
+        *chrome_headless(user_data_dir=user_data_dir),
+        '--screenshot',
         '--window-size={}'.format(resolution),
         link['url']
     ]
     end = progress(timeout, prefix=' ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # sreenshot.png
+        result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # sreenshot.png
         end()
         if result.returncode:
             print(' ', (result.stderr or result.stdout).decode())
             raise Exception('Failed to take screenshot')
-        chmod_file('screenshot.png', cwd=out_dir)
+        chmod_file('screenshot.png', cwd=link_dir)
         output = 'screenshot.png'
     except Exception as e:
         end()
-        print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+        print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
         print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         output = e
@@ -272,10 +274,10 @@ def fetch_screenshot(out_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_D


 @attach_result_to_link('archive_org')
-def archive_dot_org(out_dir, link, timeout=TIMEOUT):
+def archive_dot_org(link_dir, link, timeout=TIMEOUT):
     """submit site to archive.org for archiving via their service, save returned archive url"""

-    path = os.path.join(out_dir, 'archive.org.txt')
+    path = os.path.join(link_dir, 'archive.org.txt')
     if os.path.exists(path):
         archive_org_url = open(path, 'r').read().strip()
         return {'output': archive_org_url, 'status': 'skipped'}

@@ -286,7 +288,7 @@ def archive_dot_org(out_dir, link, timeout=TIMEOUT):
     CMD = ['curl', '-I', submit_url]
     end = progress(timeout, prefix=' ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # archive.org.txt
+        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # archive.org.txt
         end()

         # Parse archive.org response headers

@@ -313,9 +315,9 @@ def archive_dot_org(out_dir, link, timeout=TIMEOUT):
         output = e

     if success:
-        with open(os.path.join(out_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
+        with open(os.path.join(link_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
             f.write(saved_url)
-        chmod_file('archive.org.txt', cwd=out_dir)
+        chmod_file('archive.org.txt', cwd=link_dir)
         output = saved_url

     return {
|
|||
}
|
||||
|
||||
@attach_result_to_link('favicon')
|
||||
def fetch_favicon(out_dir, link, timeout=TIMEOUT):
|
||||
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
|
||||
"""download site favicon from google's favicon api"""
|
||||
|
||||
if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
|
||||
if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
|
||||
return {'output': 'favicon.ico', 'status': 'skipped'}
|
||||
|
||||
CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
|
||||
fout = open('{}/favicon.ico'.format(out_dir), 'w')
|
||||
fout = open('{}/favicon.ico'.format(link_dir), 'w')
|
||||
end = progress(timeout, prefix=' ')
|
||||
try:
|
||||
run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1) # favicon.ico
|
||||
run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # favicon.ico
|
||||
fout.close()
|
||||
end()
|
||||
chmod_file('favicon.ico', cwd=out_dir)
|
||||
chmod_file('favicon.ico', cwd=link_dir)
|
||||
output = 'favicon.ico'
|
||||
except Exception as e:
|
||||
fout.close()
|
||||
|
@@ -352,14 +354,14 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
     }

 # @attach_result_to_link('audio')
-# def fetch_audio(out_dir, link, timeout=TIMEOUT):
+# def fetch_audio(link_dir, link, timeout=TIMEOUT):
 # """Download audio rip using youtube-dl"""

 # if link['type'] not in ('soundcloud',)\
 # and 'audio' not in link['tags']:
 # return

-# path = os.path.join(out_dir, 'audio')
+# path = os.path.join(link_dir, 'audio')

 # if not os.path.exists(path) or overwrite:
 # print(' - Downloading audio')

@@ -369,30 +371,30 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
 # ]
 # end = progress(timeout, prefix=' ')
 # try:
-# result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # audio/audio.mp3
+# result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # audio/audio.mp3
 # end()
 # if result.returncode:
 # print(' ', result.stderr.decode())
 # raise Exception('Failed to download audio')
-# chmod_file('audio.mp3', cwd=out_dir)
+# chmod_file('audio.mp3', cwd=link_dir)
 # return 'audio.mp3'
 # except Exception as e:
 # end()
-# print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+# print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
 # print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
 # raise
 # else:
 # print(' √ Skipping audio download')

 # @attach_result_to_link('video')
-# def fetch_video(out_dir, link, timeout=TIMEOUT):
+# def fetch_video(link_dir, link, timeout=TIMEOUT):
 # """Download video rip using youtube-dl"""

 # if link['type'] not in ('youtube', 'youku', 'vimeo')\
 # and 'video' not in link['tags']:
 # return

-# path = os.path.join(out_dir, 'video')
+# path = os.path.join(link_dir, 'video')

 # if not os.path.exists(path) or overwrite:
 # print(' - Downloading video')
@@ -402,26 +404,27 @@ def fetch_favicon(out_dir, link, timeout=TIMEOUT):
 # ]
 # end = progress(timeout, prefix=' ')
 # try:
-# result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # video/movie.mp4
+# result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1)  # video/movie.mp4
 # end()
 # if result.returncode:
 # print(' ', result.stderr.decode())
 # raise Exception('Failed to download video')
-# chmod_file('video.mp4', cwd=out_dir)
+# chmod_file('video.mp4', cwd=link_dir)
 # return 'video.mp4'
 # except Exception as e:
 # end()
-# print(' Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
+# print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
 # print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
 # raise
 # else:
 # print(' √ Skipping video download')


-def chrome_data_dir_args(user_data_dir=CHROME_USER_DATA_DIR):
-    default = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
+def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
+    args = [binary, '--headless', '--disable-gpu']
+    default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
     if user_data_dir:
-        return ('--user-data-dir={}'.format(user_data_dir),)
-    elif os.path.exists(default):
-        return ('--user-data-dir={}'.format(default),)
-    return ()
+        args.append('--user-data-dir={}'.format(user_data_dir))
+    elif os.path.exists(default_profile):
+        args.append('--user-data-dir={}'.format(default_profile))
+    return args
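The payoff of the chrome_headless() change is that fetch_pdf and fetch_screenshot no longer assemble the binary and the headless flags themselves; they splat the helper's return value into their CMD lists. A self-contained sketch of that composition (the binary name and URL below are placeholders, and CHROME_BINARY / CHROME_USER_DATA_DIR normally come from the repo's config.py):

    import os

    CHROME_BINARY = 'google-chrome'        # placeholder; configured in config.py in the repo
    CHROME_USER_DATA_DIR = None

    def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
        # same body as the added lines above
        args = [binary, '--headless', '--disable-gpu']
        default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
        if user_data_dir:
            args.append('--user-data-dir={}'.format(user_data_dir))
        elif os.path.exists(default_profile):
            args.append('--user-data-dir={}'.format(default_profile))
        return args

    # how fetch_pdf builds its command after this commit:
    CMD = [
        *chrome_headless(user_data_dir=CHROME_USER_DATA_DIR),
        '--print-to-pdf',
        'https://example.com',             # placeholder for link['url']
    ]
    print(CMD)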
util.py (55 changed lines)
@@ -293,8 +293,8 @@ def manually_merge_folders(source, target):

     print(' {} and {} have conflicting files, which do you want to keep?'.format(fname(source), fname(target)))
     print(' - [enter]: do nothing (keep both)')
-    print(' - a: keep everything from {}'.format(source))
-    print(' - b: keep everything from {}'.format(target))
+    print(' - a: prefer files from {}'.format(source))
+    print(' - b: prefer files from {}'.format(target))
     print(' - q: quit and resolve the conflict manually')
     try:
         answer = input('> ').strip().lower()

@@ -311,7 +311,7 @@ def manually_merge_folders(source, target):

     files_in_source = set(os.listdir(source))
     files_in_target = set(os.listdir(target))
-    for file in files_in_source.intersection(files_in_target):
+    for file in files_in_source:
         if file in files_in_target:
             to_delete = target if answer == 'a' else source
             run(['rm', '-Rf', os.path.join(to_delete, file)])
@@ -320,27 +320,26 @@ def manually_merge_folders(source, target):
     if not set(os.listdir(source)):
         run(['rm', '-Rf', source])

-def merge_folders(path, folder, link):
+def fix_folder_path(archive_path, link_folder, link):
     """given a folder, merge it to the canonical 'correct' path for the given link object"""
-    source, target = os.path.join(path, folder), os.path.join(path, link['timestamp'])
+    source = os.path.join(archive_path, link_folder)
+    target = os.path.join(archive_path, link['timestamp'])

-    base_url = parse_url(source)
-    if not (base_url in link['base_url']
-            or link['base_url'] in base_url):
+    url_in_folder = parse_url(source)
+    if not (url_in_folder in link['base_url']
+            or link['base_url'] in url_in_folder):
         raise ValueError('The link does not match the url for this folder.')

     if not os.path.exists(target):
         # target doesn't exist so nothing needs merging, simply move A to B
-        if run(['mv', source, target]).returncode:
-            print('Failed to move {} to {}!'.format(source, target))
-            return False
+        run(['mv', source, target])
     else:
         # target folder exists, check for conflicting files and attempt manual merge
         files_in_source = set(os.listdir(source))
         files_in_target = set(os.listdir(target))
+        conflicting_files = files_in_source & files_in_target

-        if not files_in_source.intersection(files_in_target):
-            # no conflicts, move everything from A to B
+        if not conflicting_files:
             for file in files_in_source:
                 run(['mv', os.path.join(source, file), os.path.join(target, file)])
@@ -352,26 +351,25 @@ def merge_folders(path, folder, link):
         run(['rm', '-R', source])


-def cleanup_archive(path, links):
+def cleanup_archive(archive_path, links):
     """move any incorrectly named folders to their canonical locations"""

     # for each folder that exists, see if we can match it up with a known good link
-    # if we can, then merge the two folders, if not, move it to lost & found
-    # for each timestamp, find similar timestamped folders
-    # check each folder for a "domain.com" folder or
+    # if we can, then merge the two folders (TODO: if not, move it to lost & found)

     unmatched = []
     bad_folders = []

-    if not os.path.exists(path):
+    if not os.path.exists(archive_path):
         return

-    for folder in os.listdir(path):
-        if not os.listdir(os.path.join(path, folder)):
-            # delete empty folders
-            run(['rm', '-R', os.path.join(path, folder)])
-        else:
+    for folder in os.listdir(archive_path):
+        try:
+            files = os.listdir(os.path.join(archive_path, folder))
+        except NotADirectoryError:
+            continue
+
+        if files:
             link = find_link(folder, links)
             if link is None:
                 unmatched.append(folder)
@@ -379,11 +377,16 @@ def cleanup_archive(path, links):

             if folder != link['timestamp']:
                 bad_folders.append((folder, link))
+        else:
+            # delete empty folders
+            run(['rm', '-R', os.path.join(archive_path, folder)])

-    if bad_folders:
+    if bad_folders and IS_TTY and input('[!] Cleanup archive? y/[n]: ') == 'y':
         print('[!] Fixing {} improperly named folders in archive...'.format(len(bad_folders)))
         for folder, link in bad_folders:
-            merge_folders(path, folder, link)
+            fix_folder_path(archive_path, folder, link)
+    elif bad_folders:
+        print('[!] Warning! {} folders need to be merged, fix by running bookmark archiver.'.format(len(bad_folders)))

     if unmatched:
         print('[!] Warning! {} unrecognized folders in html/archive/'.format(len(unmatched)))