From a167d2a1f47879c7be489818c51c244e9b343a5e Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 5 Jul 2017 16:33:51 -0500
Subject: [PATCH] colorize output and add progress bar

---
 archive.py |  55 +++++++++++-----------
 config.py  | 136 +++++++++++++++++++++++++++++++++++++++++++++++------
 fetch.py   |  89 ++++++++++++++++++++---------------
 index.py   |  19 +++++++-
 parse.py   |  16 +++----
 5 files changed, 226 insertions(+), 89 deletions(-)

diff --git a/archive.py b/archive.py
index 138ae9d4..eb6f76fb 100755
--- a/archive.py
+++ b/archive.py
@@ -7,13 +7,13 @@ import os
 import sys
 
 from datetime import datetime
-from subprocess import run
 
 from parse import parse_export
 from index import dump_index
 from fetch import dump_website
 from config import (
     ARCHIVE_PERMISSIONS,
+    ANSI,
     check_dependencies,
 )
 
@@ -24,23 +24,23 @@ __DOCUMENTATION__ = 'https://github.com/pirate/bookmark-archiver'
 
 def create_archive(export_file, service=None, resume=None):
     """update or create index.html and download archive of all links"""
 
-    with open(export_file, 'r', encoding='utf-8') as f:
-        print('[+] [{}] Starting archive from {} export file.'.format(
-            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-            export_file
-        ))
+    print('[*] [{}] Starting archive from {} export file.'.format(
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        export_file,
+    ))
 
+    with open(export_file, 'r', encoding='utf-8') as f:
         links, service = parse_export(f, service=service)
 
-        if resume:
-            try:
-                links = [
-                    link
-                    for link in links
-                    if float(link['timestamp']) >= float(resume)
-                ]
-            except TypeError:
-                print('Resume value and all timestamp values must be valid numbers.')
+    if resume:
+        try:
+            links = [
+                link
+                for link in links
+                if float(link['timestamp']) >= float(resume)
+            ]
+        except TypeError:
+            print('Resume value and all timestamp values must be valid numbers.')
 
     if not links or not service:
         print('[X] No links found in {}, is it a {} export file?'.format(export_file, service))
@@ -53,20 +53,21 @@ def create_archive(export_file, service=None, resume=None):
         os.makedirs(os.path.join(service, 'archive'))
 
     dump_index(links, service)
-
-    run(['chmod', '-R', ARCHIVE_PERMISSIONS, service], timeout=30)
-
-    print('[*] [{}] Created archive index with {} links.'.format(
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        len(links),
-    ))
 
     check_dependencies()
 
+    try:
+        for link in links:
+            dump_website(link, service)
+    except (KeyboardInterrupt, SystemExit, Exception):
+        print('{red}[X] Archive creation stopped.{reset}'.format(**ANSI))
+        print('    Continue where you left off by running:')
+        print('        ./archive.py {} {} {}'.format(
+            export_file,
+            service,
+            link['timestamp'],
+        ))
+        raise SystemExit(1)
 
-    for link in links:
-        dump_website(link, service)
-
-    print('[√] [{}] Archive update complete.'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
+    print('{}[√] [{}] Archive update complete.{}'.format(ANSI['green'], datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ANSI['reset']))
 
 
 if __name__ == '__main__':
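A note on the resume flow above: the filter runs over the already-parsed link list, so resuming an interrupted run only needs the timestamp printed on exit. A standalone sketch of the same filter, using hypothetical link dicts shaped like parse_export() output:

    # keep only links bookmarked at or after the resume point, mirroring
    # the float() comparison in create_archive() above
    links = [
        {'url': 'https://example.com/a', 'timestamp': '1499287000'},
        {'url': 'https://example.com/b', 'timestamp': '1499288000.5'},
    ]
    resume = '1499288000'
    links = [link for link in links if float(link['timestamp']) >= float(resume)]
    assert [link['url'] for link in links] == ['https://example.com/b']
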
diff --git a/config.py b/config.py
index 5cd8d5c4..9933a86e 100644
--- a/config.py
+++ b/config.py
@@ -1,7 +1,10 @@
 import os
 import sys
+import time
+import shutil
 
-from subprocess import run, PIPE
+from subprocess import run, PIPE, DEVNULL
+from multiprocessing import Process
 
 # os.getenv('VARIABLE', 'DEFAULT') gets the value of environment
 # variable "VARIABLE" and if it is not set, sets it to 'DEFAULT'
 # for boolean values, check to see if the string is 'true', and
 # if so, the python variable will be True
 
+IS_TTY = sys.stdout.isatty()
+
+USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
+SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true'
 FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
 FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
-FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False' ).lower() == 'true'
-FETCH_VIDEO = os.getenv('FETCH_VIDEO', 'False' ).lower() == 'true'
+FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False' ).lower() == 'true'
+FETCH_VIDEO = os.getenv('FETCH_VIDEO', 'False' ).lower() == 'true'
 FETCH_PDF = os.getenv('FETCH_PDF', 'True' ).lower() == 'true'
 FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true'
 FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true'
@@ -25,44 +32,143 @@ TIMEOUT = int(os.getenv('TIMEOUT', '60'))
 INDEX_TEMPLATE = os.getenv('INDEX_TEMPLATE', 'templates/index.html')
 INDEX_ROW_TEMPLATE = os.getenv('INDEX_ROW_TEMPLATE', 'templates/index_row.html')
 
+TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns
+ANSI = {
+    'reset': '\033[00;00m',
+    'lightblue': '\033[01;30m',
+    'lightyellow': '\033[01;33m',
+    'lightred': '\033[01;35m',
+    'red': '\033[01;31m',
+    'green': '\033[01;32m',
+    'blue': '\033[01;34m',
+    'white': '\033[01;37m',
+    'black': '\033[01;30m',
+}
+if not USE_COLOR:
+    # don't show colors if USE_COLOR is False
+    ANSI = {k: '' for k in ANSI.keys()}
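The ANSI dict above is consumed two ways throughout this patch: unpacked wholesale into format() for named placeholders, or indexed per color. Both degrade gracefully to plain text when USE_COLOR is false, since every value becomes the empty string. A self-contained sketch (the dict here is a trimmed copy, just to keep it runnable):

    # trimmed copy of the ANSI table above
    ANSI = {'green': '\033[01;32m', 'red': '\033[01;31m', 'reset': '\033[00;00m'}

    # style 1: unpack the whole dict and use named placeholders
    print('{green}[√] Saved.{reset}'.format(**ANSI))
    # style 2: index individual colors positionally
    print('{}Failed: TimeoutExpired{}'.format(ANSI['red'], ANSI['reset']))
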
+
+
+### Util Functions
 
 def check_dependencies():
+    """Check that all necessary dependencies are installed, and have valid versions"""
+    print('[*] Checking Dependencies:')
     python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
     if python_vers < 3.5:
-        print('[X] Python version is not new enough: {} (>3.5 is required)'.format(python_vers))
+        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
         print('    See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.')
         raise SystemExit(1)
 
     if FETCH_PDF or FETCH_SCREENSHOT:
         if run(['which', CHROME_BINARY]).returncode:
-            print('[X] Missing dependency: {}'.format(CHROME_BINARY))
+            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
             print('    See https://github.com/pirate/bookmark-archiver for help.')
             raise SystemExit(1)
 
         # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
-        result = run([CHROME_BINARY, '--version'], stdout=PIPE)
-        version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0]  # TODO: regex might be better
-        if int(version) < 59:
-            print('[X] Chrome version must be 59 or greater for headless PDF and screenshot saving')
+        try:
+            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
+            version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0]  # TODO: regex might be better
+            if int(version) < 59:
+                print('{red}[X] Chrome version must be 59 or greater for headless PDF and screenshot saving{reset}'.format(**ANSI))
+                print('    See https://github.com/pirate/bookmark-archiver for help.')
+                raise SystemExit(1)
+        except (TypeError, OSError):
+            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
             print('    See https://github.com/pirate/bookmark-archiver for help.')
             raise SystemExit(1)
 
     if FETCH_WGET:
-        if run(['which', 'wget']).returncode:
-            print('[X] Missing dependency: wget')
+        if run(['which', 'wget']).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget'))
             print('    See https://github.com/pirate/bookmark-archiver for help.')
             raise SystemExit(1)
 
     if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
-        if run(['which', 'curl']).returncode:
-            print('[X] Missing dependency: curl')
+        if run(['which', 'curl']).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl'))
             print('    See https://github.com/pirate/bookmark-archiver for help.')
             raise SystemExit(1)
 
     if FETCH_AUDIO or FETCH_VIDEO:
-        if run(['which', 'youtube-dl']).returncode:
-            print('[X] Missing dependency: youtube-dl')
+        if run(['which', 'youtube-dl']).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode:
+            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
+            print('    Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl'))
             print('    See https://github.com/pirate/bookmark-archiver for help.')
             raise SystemExit(1)
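The version parsing above still carries a "TODO: regex might be better" note. A hedged sketch of what that regex could look like (the helper name and sample strings are illustrative, not part of the patch):

    import re

    def parse_major_version(version_output):
        """pull the major version out of arbitrary `--version` output"""
        match = re.search(r'(\d+)\.\d+\.\d+', version_output)
        return int(match.group(1)) if match else None

    assert parse_major_version('Google Chrome 61.0.3114.0 canary') == 61
    assert parse_major_version('Chromium 59.0.3029.110 built on Ubuntu') == 59
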
+
+
+def chmod_file(path, cwd='.', permissions=ARCHIVE_PERMISSIONS, timeout=30):
+    """chmod -R <permissions> <cwd>/<path>"""
+
+    if not os.path.exists(os.path.join(cwd, path)):
+        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
+
+    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
+    if chmod_result.returncode == 1:
+        print('    ', chmod_result.stderr.decode())
+        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
+
+
+def progress(seconds=TIMEOUT, prefix=''):
+    """Show a (subprocess-controlled) progress bar with a timeout,
+    returns end() function to instantly finish the progress
+    """
+
+    if not SHOW_PROGRESS:
+        return lambda: None
+
+    chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
+
+    def progress_bar(seconds=seconds, prefix=prefix):
+        """show timer in the form of progress bar, with percentage and seconds remaining"""
+        try:
+            for s in range(seconds * chunks):
+                progress = s / chunks / seconds * 100
+                bar_width = round(progress/(100/chunks))
+
+                # ████████████████████           0.9% (1/60sec)
+                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
+                    prefix,
+                    ANSI['green'],
+                    ('█' * bar_width).ljust(chunks),
+                    ANSI['reset'],
+                    round(progress, 1),
+                    round(s/chunks),
+                    seconds,
+                ))
+                sys.stdout.flush()
+                time.sleep(1 / chunks)
+
+            # ██████████████████████████████████ 100.0% (60/60sec)
+            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
+                prefix,
+                ANSI['red'],
+                '█' * chunks,
+                ANSI['reset'],
+                100.0,
+                seconds,
+                seconds,
+            ))
+            sys.stdout.flush()
+        except KeyboardInterrupt:
+            print()
+
+    p = Process(target=progress_bar)
+    p.start()
+
+    def end():
+        """immediately finish progress and clear the progressbar line"""
+        p.terminate()
+        sys.stdout.write('\r{}\r'.format(' ' * TERM_WIDTH))  # clear whole terminal line
+        sys.stdout.flush()
+
+    return end
diff --git a/fetch.py b/fetch.py
index 7978b815..57d545a5 100644
--- a/fetch.py
+++ b/fetch.py
@@ -16,38 +16,35 @@ from config import (
     FETCH_AUDIO,
     FETCH_VIDEO,
     FETCH_FAVICON,
+    TIMEOUT,
+    ANSI,
+    progress,
+    chmod_file,
 )
 
 
-def chmod_file(path, cwd='.', permissions='755', timeout=30):
-    if not os.path.exists(os.path.join(cwd, path)):
-        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
-
-    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
-    if chmod_result.returncode == 1:
-        print('    ', chmod_result.stderr.decode())
-        raise Exception('Failed to chmod {}/{}'.format(cwd, path))
-
 def fetch_wget(out_dir, link, overwrite=False, requisites=True, timeout=60):
     """download full site using wget"""
 
-    domain = link['base_url'].split('/', 1)[0]
-    if not os.path.exists(os.path.join(out_dir, domain)) or overwrite:
-        print('    - Downloading Full Site')
+    if not os.path.exists(os.path.join(out_dir, link['domain'])) or overwrite:
+        print('    - Downloading full site')
         CMD = [
             *'wget --timestamping --adjust-extension --no-parent'.split(' '),  # Docs: https://www.gnu.org/software/wget/manual/wget.html
             *(('--page-requisites', '--convert-links') if requisites else ()),
             link['url'],
         ]
+        end = progress(timeout, prefix='      ')
         try:
-            result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout)  # dom.html
+            result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # index.html
+            end()
             if result.returncode > 0:
-                print('       ', result.stderr.decode().split('\n')[-1])
+                print('\n'.join('          ' + line for line in result.stderr.decode().rsplit('\n', 5)[-3:] if line.strip()))
                 raise Exception('Failed to wget download')
-            chmod_file(domain, cwd=out_dir)
+            chmod_file(link['domain'], cwd=out_dir)
         except Exception as e:
+            end()
             print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       Failed: {} {}'.format(e.__class__.__name__, e))
+            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping site download')
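The progress() helper above returns a closure rather than an object: callers start the bar before the slow subprocess call, then call end() on both the success and failure paths so the bar line gets cleared. A minimal usage sketch (assuming config.py is importable from the working directory; the sleep stands in for a real run(CMD, ...) call):

    import time
    from config import progress

    end = progress(seconds=5, prefix='      ')
    try:
        time.sleep(2)  # stand-in for the real subprocess call
    finally:
        end()  # terminates the bar process and wipes the terminal line
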
 
 
@@ -63,15 +60,18 @@ def fetch_pdf(out_dir, link, overwrite=False, timeout=60, chrome_binary='chromiu
             *'--headless --disable-gpu --print-to-pdf'.split(' '),
             link['url']
         ]
+        end = progress(timeout, prefix='      ')
         try:
-            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout)  # output.pdf
+            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # output.pdf
+            end()
             if result.returncode:
                 print('       ', result.stderr.decode())
                 raise Exception('Failed to print PDF')
             chmod_file('output.pdf', cwd=out_dir)
         except Exception as e:
+            end()
             print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       Failed: {} {}'.format(e.__class__.__name__, e))
+            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping PDF print')
 
@@ -88,15 +88,18 @@ def fetch_screenshot(out_dir, link, overwrite=False, timeout=60, chrome_binary='
             '--window-size={}'.format(resolution),
             link['url']
         ]
+        end = progress(timeout, prefix='      ')
         try:
-            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # sreenshot.png
+            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # screenshot.png
+            end()
             if result.returncode:
                 print('       ', result.stderr.decode())
                 raise Exception('Failed to take screenshot')
             chmod_file('screenshot.png', cwd=out_dir)
         except Exception as e:
+            end()
             print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       Failed: {} {}'.format(e.__class__.__name__, e))
+            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping screenshot')
 
@@ -111,8 +114,10 @@ def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
     success = False
     CMD = ['curl', '-I', submit_url]
+    end = progress(timeout, prefix='      ')
     try:
-        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # archive.org
+        result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # archive.org.txt
+        end()
         headers = result.stdout.splitlines()
         content_location = [h for h in headers if b'Content-Location: ' in h]
         if content_location:
@@ -122,11 +127,12 @@ def archive_dot_org(out_dir, link, overwrite=False, timeout=60):
         else:
             raise Exception('Failed to find "Content-Location" URL header in Archive.org response.')
     except Exception as e:
+        end()
         print('       Visit url to see output:', ' '.join(CMD))
-        print('       Failed: {} {}'.format(e.__class__.__name__, e))
+        print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
 
     if success:
-        with open('{}/archive.org.txt'.format(out_dir), 'w', encoding='utf-8') as f:
+        with open(os.path.join(out_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
             f.write(saved_url)
         chmod_file('archive.org.txt', cwd=out_dir)
 
@@ -140,14 +146,17 @@ def fetch_favicon(out_dir, link, overwrite=False, timeout=60):
     if not os.path.exists(path) or overwrite:
         print('    - Fetching Favicon')
-        CMD = 'curl https://www.google.com/s2/favicons?domain={domain}'.format(**link).split(' ')
+        CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
         fout = open('{}/favicon.ico'.format(out_dir), 'w')
+        end = progress(timeout, prefix='      ')
         try:
-            run([*CMD], stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # favicon.ico
+            run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=timeout + 1)  # favicon.ico
+            end()
             chmod_file('favicon.ico', cwd=out_dir)
         except Exception as e:
+            end()
             print('       Run to see full output:', ' '.join(CMD))
-            print('       Failed: {} {}'.format(e.__class__.__name__, e))
+            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
         fout.close()
     else:
         print('    √ Skipping favicon')
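The archive.org submission above works by reading the Content-Location header out of a `curl -I` response. An isolated sketch of that parsing, fed canned response bytes instead of a live request (the sample response and the final URL join are assumptions of mine, since the hunk does not show how saved_url is assembled):

    # canned `curl -I https://web.archive.org/save/...` response (illustrative)
    response = b'HTTP/1.1 302 Found\r\nContent-Location: /web/20170705000000/https://example.com\r\n\r\n'
    headers = response.splitlines()

    content_location = [h for h in headers if b'Content-Location: ' in h]
    if content_location:
        archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8')
        print('https://web.archive.org' + archive_path)
    else:
        raise Exception('Failed to find "Content-Location" URL header in Archive.org response.')
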
 
 
@@ -166,15 +175,18 @@ def fetch_audio(out_dir, link, overwrite=False, timeout=60):
             "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
             link['url'],
         ]
+        end = progress(timeout, prefix='      ')
         try:
-            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # screenshot.png
+            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # audio/audio.mp3
+            end()
             if result.returncode:
                 print('       ', result.stderr.decode())
                 raise Exception('Failed to download audio')
             chmod_file('audio', cwd=out_dir)
         except Exception as e:
+            end()
             print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       Failed: {} {}'.format(e.__class__.__name__, e))
+            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping audio download')
 
@@ -189,27 +201,30 @@ def fetch_video(out_dir, link, overwrite=False, timeout=60):
     if not os.path.exists(path) or overwrite:
         print('    - Downloading video')
         CMD = [
-            "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
+            "youtube-dl -f mp4 -o '%(title)s.%(ext)s'",
             link['url'],
         ]
+        end = progress(timeout, prefix='      ')
         try:
-            result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=timeout)  # screenshot.png
+            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=timeout + 1)  # video/movie.mp4
+            end()
             if result.returncode:
                 print('       ', result.stderr.decode())
                 raise Exception('Failed to download video')
             chmod_file('video', cwd=out_dir)
         except Exception as e:
+            end()
             print('       Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
-            print('       Failed: {} {}'.format(e.__class__.__name__, e))
+            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping video download')
 
 
-def dump_link_info(out_dir, link, update=True):
+def dump_link_info(out_dir, link, overwrite=False):
     """write a json file with some info about the link"""
 
     info_file_path = os.path.join(out_dir, 'link.json')
 
-    if (not os.path.exists(info_file_path) or update):
+    if (not os.path.exists(info_file_path) or overwrite):
         print('    - Creating link info file')
         try:
             link_json = derived_link_info(link)
@@ -223,7 +238,7 @@
             chmod_file('link.json', cwd=out_dir)
         except Exception as e:
-            print('       Failed: {} {}'.format(e.__class__.__name__, e))
+            print('       {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
     else:
         print('    √ Skipping link info file')
 
@@ -231,7 +246,7 @@
 def dump_website(link, service, overwrite=False, permissions=ARCHIVE_PERMISSIONS):
     """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
 
-    print('[+] [{timestamp} ({time})] "{title}": {base_url}'.format(**link))
+    print('[{green}+{reset}] [{timestamp} ({time})] "{title}": {blue}{base_url}{reset}'.format(**link, **ANSI))
 
     out_dir = os.path.join(service, 'archive', link['timestamp'])
     if not os.path.exists(out_dir):
@@ -243,7 +258,7 @@ def dump_website(link, service, overwrite=False, permissions=ARCHIVE_PERMISSIONS):
     print('    i Type: {}'.format(link['type']))
 
     if not (link['url'].startswith('http') or link['url'].startswith('ftp')):
-        print('    X Skipping: invalid link.')
+        print('    {}X Skipping: invalid link.{}'.format(ANSI['red'], ANSI['reset']))
         return
 
     if FETCH_WGET:
@@ -267,4 +282,4 @@ def dump_website(link, service, overwrite=False, permissions=ARCHIVE_PERMISSIONS):
     if FETCH_FAVICON:
         fetch_favicon(out_dir, link, overwrite=overwrite)
 
-    dump_link_info(out_dir, link)
+    dump_link_info(out_dir, link, overwrite=overwrite)
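One pre-existing caveat this patch leaves in place: fetch_audio() still passes its entire youtube-dl command as a single list element, which subprocess.run() cannot execute without shell=True (the OS would look for an executable literally named "youtube-dl -x --audio-format ..."). A hedged sketch of the tokenized argv form (the URL is a hypothetical stand-in for link['url']):

    from subprocess import DEVNULL, PIPE, run

    # each flag and value is its own argv entry, so no shell is needed
    CMD = [
        'youtube-dl',
        '-x', '--audio-format', 'mp3', '--audio-quality', '0',
        '-o', '%(title)s.%(ext)s',
        'https://example.com/video',  # hypothetical link['url']
    ]
    result = run(CMD, stdout=DEVNULL, stderr=PIPE, timeout=60)
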
diff --git a/index.py b/index.py
index ab80a70b..442f8a98 100644
--- a/index.py
+++ b/index.py
@@ -2,8 +2,14 @@ import os
 from datetime import datetime
 from string import Template
 
-from config import INDEX_TEMPLATE, INDEX_ROW_TEMPLATE
 from parse import derived_link_info
+from config import (
+    INDEX_TEMPLATE,
+    INDEX_ROW_TEMPLATE,
+    ARCHIVE_PERMISSIONS,
+    ANSI,
+    chmod_file,
+)
 
 
 def dump_index(links, service):
@@ -28,4 +34,13 @@ def dump_index(links, service):
     }
 
     with open(os.path.join(service, 'index.html'), 'w', encoding='utf-8') as f:
-        f.write(Template(index_html).substitute(template_vars))
+        f.write(Template(index_html).substitute(**template_vars))
+
+    chmod_file(service, permissions=ARCHIVE_PERMISSIONS)
+
+    print('[+] [{}] Created archive index with {}{}{} links.'.format(
+        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        ANSI['green'],
+        len(links),
+        ANSI['reset'],
+    ))
diff --git a/parse.py b/parse.py
index d130b298..8f807346 100644
--- a/parse.py
+++ b/parse.py
@@ -64,8 +64,8 @@ def parse_pocket_export(html_file):
             without_scheme = fixed_url.replace('http://', '').replace('https://', '')
             info = {
                 'url': fixed_url,
-                'domain': without_scheme.split('/')[0],    # without pathname
-                'base_url': without_scheme.split('?')[0],  # without query args
+                'domain': without_scheme.split('/', 1)[0],    # without pathname
+                'base_url': without_scheme.split('?', 1)[0],  # without query args
                 'time': datetime.fromtimestamp(int(match.group(2))).strftime('%Y-%m-%d %H:%M'),
                 'timestamp': match.group(2),
                 'tags': match.group(3),
@@ -84,10 +84,10 @@ def parse_json_export(json_file):
         erg = line
         info = {
             'url': erg['href'],
-            'domain': erg['href'].replace('http://', '').replace('https://', '').split('/')[0],
-            'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?')[0],
-            'time': datetime.fromtimestamp(int(time.mktime(time.strptime(erg['time'].split(',')[0], '%Y-%m-%dT%H:%M:%SZ')))),
-            'timestamp': str(int(time.mktime(time.strptime(erg['time'].split(',')[0], '%Y-%m-%dT%H:%M:%SZ')))),
+            'domain': erg['href'].replace('http://', '').replace('https://', '').split('/', 1)[0],
+            'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?', 1)[0],
+            'time': datetime.fromtimestamp(int(time.mktime(time.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')))),
+            'timestamp': str(int(time.mktime(time.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')))),
             'tags': erg['tags'],
             'title': erg['description'].replace(' — Readability', ''),
         }
@@ -108,8 +108,8 @@ def parse_bookmarks_export(html_file):
 
         info = {
             'url': url,
-            'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
-            'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
+            'domain': url.replace('http://', '').replace('https://', '').split('/', 1)[0],
+            'base_url': url.replace('https://', '').replace('http://', '').split('?', 1)[0],
             'time': dt,
             'timestamp': secs,
             'tags': "",