import os
import sys
import time
from datetime import datetime
from multiprocessing import Process
from subprocess import run, PIPE, DEVNULL

import requests

from config import (
    ARCHIVE_PERMISSIONS,
    ARCHIVE_DIR,
    TIMEOUT,
    TERM_WIDTH,
    SHOW_PROGRESS,
    ANSI,
    CHROME_BINARY,
    FETCH_WGET,
    FETCH_PDF,
    FETCH_SCREENSHOT,
    FETCH_FAVICON,
    FETCH_AUDIO,
    FETCH_VIDEO,
    SUBMIT_ARCHIVE_DOT_ORG,
)


def _exit_missing_dependency(binary):
    """Print the 'missing dependency' help text for `binary` and exit with status 1."""
    print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], binary, ANSI['reset']))
    print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(binary))
    print(' See https://github.com/pirate/bookmark-archiver for help.')
    raise SystemExit(1)


def _require_binary(binary):
    """Exit unless `which <binary>` and `<binary> --version` both return status 0."""
    if run(['which', binary]).returncode or run([binary, '--version'], stdout=DEVNULL).returncode:
        _exit_missing_dependency(binary)


def check_dependencies():
    """Check that all necessary dependencies are installed, and have valid versions.

    Only the tools required by the enabled FETCH_*/SUBMIT_* config flags are
    checked. Prints a help message and raises SystemExit(1) on the first
    missing or outdated dependency.
    """
    print('[*] Checking Dependencies:')

    # Compare version tuples: building a float from "3.10" gives 3.1, which
    # would wrongly reject Python 3.10 and newer.
    if sys.version_info < (3, 5):
        python_vers = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
        print(' See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.')
        raise SystemExit(1)

    if FETCH_PDF or FETCH_SCREENSHOT:
        if run(['which', CHROME_BINARY]).returncode:
            _exit_missing_dependency(CHROME_BINARY)

        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
        try:
            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
            version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0]  # TODO: regex might be better
            if int(version) < 59:
                print('{red}[X] Chrome version must be 59 or greater for headless PDF and screenshot saving{reset}'.format(**ANSI))
                print(' See https://github.com/pirate/bookmark-archiver for help.')
                raise SystemExit(1)
        except (TypeError, OSError, ValueError):
            # ValueError: int() could not parse the major version out of the output
            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
            print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
            print(' See https://github.com/pirate/bookmark-archiver for help.')
            raise SystemExit(1)

    if FETCH_WGET:
        _require_binary('wget')

    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
        _require_binary('curl')

    if FETCH_AUDIO or FETCH_VIDEO:
        _require_binary('youtube-dl')


def chmod_file(path, cwd='.', permissions=ARCHIVE_PERMISSIONS, timeout=30):
    """Run `chmod -R {permissions} {path}` with `cwd` as the working directory.

    Raises Exception if {cwd}/{path} does not exist or if chmod exits with a
    non-zero status (its stderr is printed first). subprocess.TimeoutExpired
    propagates if chmod runs longer than `timeout` seconds.
    """
    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
    # Any non-zero exit status is a failure, not only status 1.
    if chmod_result.returncode != 0:
        print(' ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))


def progress(seconds=TIMEOUT, prefix=''):
    """Show a (subprocess-controlled) progress bar with a timeout.

    Returns an end() function that terminates the bar's process and clears
    the terminal line. When SHOW_PROGRESS is falsy nothing is drawn and the
    returned function is a no-op.
    """
    if not SHOW_PROGRESS:
        return lambda: None

    chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
    chunks = TERM_WIDTH - len(prefix) - 20  # number of progress chunks to show (aka max bar width)

    def progress_bar(seconds=seconds, prefix=prefix):
        """show timer in the form of progress bar, with percentage and seconds remaining"""
        try:
            for s in range(seconds * chunks):
                pct = s / chunks / seconds * 100
                bar_width = round(pct / (100 / chunks))

                # ████████████████████ 0.9% (1/60sec)
                sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                    prefix,
                    ANSI['green'],
                    (chunk * bar_width).ljust(chunks),
                    ANSI['reset'],
                    round(pct, 1),
                    round(s / chunks),
                    seconds,
                ))
                sys.stdout.flush()
                time.sleep(1 / chunks)

            # ██████████████████████████████████ 100.0% (60/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
                prefix,
                ANSI['red'],
                chunk * chunks,
                ANSI['reset'],
                100.0,
                seconds,
                seconds,
            ))
            sys.stdout.flush()
        except KeyboardInterrupt:
            # finish the partially drawn bar line before the process exits
            print()

    p = Process(target=progress_bar)
    p.start()

    def end():
        """immediately finish progress and clear the progressbar line"""
        p.terminate()
        sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset']))  # clear whole terminal line
        sys.stdout.flush()

    return end


def download_url(url):
    """Download `url` into ARCHIVE_DIR/downloads/{domain}.txt and return that path.

    The response body is decoded with bytes.decode()'s default codec and
    written back out as UTF-8. On any download or decode error a message is
    printed and SystemExit(1) is raised.
    """
    downloads_dir = os.path.join(ARCHIVE_DIR, 'downloads')
    os.makedirs(downloads_dir, exist_ok=True)

    # third '/'-separated piece of "scheme://host/..." is the host
    url_domain = url.split('/', 3)[2]
    output_path = os.path.join(downloads_dir, '{}.txt'.format(url_domain))

    print('[*] [{}] Downloading {} > {}'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        url,
        output_path,
    ))
    end = progress(TIMEOUT, prefix=' ')
    try:
        # Pass the same timeout the progress bar displays; without it
        # requests would wait on an unresponsive server indefinitely.
        downloaded_xml = requests.get(url, timeout=TIMEOUT).content.decode()
        end()
    except Exception as e:
        end()
        print('[!] Failed to download {}\n'.format(url))
        print(' ', e)
        raise SystemExit(1)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(downloaded_xml)
    return output_path


def get_str_between(string, start, end=None):
    """Return the part of `string` after the first `start` and before the last `end`.

    e.g. ('[abc]12345[def]', '[abc]', '[def]') -> '12345'

    If `end` is None everything after the first `start` is returned. A
    delimiter that does not occur in `string` leaves that side untrimmed.
    """
    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]

    return content


def get_link_type(link):
    """Certain types of links need to be handled specially, this figures out when that's the case.

    Reads link['base_url'] and link['domain']; returns one of 'PDF', 'image',
    'wiki', 'youtube', 'soundcloud', 'youku', 'vimeo', or None.
    """
    if link['base_url'].endswith('.pdf'):
        return 'PDF'
    # rsplit() returns a list, so compare its last element (the extension)
    elif link['base_url'].rsplit('.', 1)[-1] in ('png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
        return 'image'
    elif 'wikipedia.org' in link['domain']:
        return 'wiki'
    elif 'youtube.com' in link['domain']:
        return 'youtube'
    elif 'soundcloud.com' in link['domain']:
        return 'soundcloud'
    elif 'youku.com' in link['domain']:
        return 'youku'
    elif 'vimeo.com' in link['domain']:
        return 'vimeo'
    return None


# URL helpers

def without_scheme(url):
    """Remove every occurrence of 'http://', 'https://' and 'ftp://' from `url`."""
    return url.replace('http://', '').replace('https://', '').replace('ftp://', '')


def without_query(url):
    """Return `url` up to (not including) the first '?'."""
    return url.split('?', 1)[0]


def without_hash(url):
    """Return `url` up to (not including) the first '#'."""
    return url.split('#', 1)[0]


def without_path(url):
    """Return `url` up to (not including) the first '/'; expects the scheme to be removed already."""
    return url.split('/', 1)[0]


def domain(url):
    """Return `url` with its scheme, path, query string and fragment stripped."""
    return without_hash(without_query(without_path(without_scheme(url))))


def base_url(url):
    """Return `url` with its scheme and query string stripped (any '#' before the first '?' is kept)."""
    return without_query(without_scheme(url))