diff --git a/archive_methods.py b/archive_methods.py index c5e77660..cb4e19da 100644 --- a/archive_methods.py +++ b/archive_methods.py @@ -17,6 +17,7 @@ from config import ( FETCH_WGET_REQUISITES, FETCH_PDF, FETCH_SCREENSHOT, + FETCH_DOM, RESOLUTION, CHECK_SSL_VALIDITY, SUBMIT_ARCHIVE_DOT_ORG, @@ -93,6 +94,9 @@ def archive_link(link_dir, link, overwrite=True): if FETCH_SCREENSHOT: link = fetch_screenshot(link_dir, link, overwrite=overwrite) + if FETCH_DOM: + link = fetch_dom(link_dir, link, overwrite=overwrite) + if SUBMIT_ARCHIVE_DOT_ORG: link = archive_dot_org(link_dir, link, overwrite=overwrite) @@ -252,7 +256,6 @@ def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DI 'output': output, } - @attach_result_to_link('screenshot') def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION): """take screenshot of site using chrome --headless""" @@ -289,6 +292,43 @@ def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_ 'output': output, } +@attach_result_to_link('dom') +def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR): + """print HTML of site to file using chrome --dump-dom""" + + if link['type'] in ('PDF', 'image'): + return {'output': wget_output_path(link)} + + output_path = os.path.join(link_dir, 'output.html') + + if os.path.exists(output_path): + return {'output': 'output.html', 'status': 'skipped'} + + CMD = [ + *chrome_headless(user_data_dir=user_data_dir), + '--dump-dom', + link['url'] + ] + end = progress(timeout, prefix=' ') + try: + with open(output_path, 'w+') as f: + result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.html + end() + if result.returncode: + print(' ', (result.stderr).decode()) + raise Exception('Failed to fetch DOM') + chmod_file('output.html', cwd=link_dir) + output = 'output.html' + except Exception as e: + end() + print(' Run to see full output:', 'cd {}; 
{}'.format(link_dir, ' '.join(CMD))) + print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) + output = e + + return { + 'cmd': CMD, + 'output': output, + } @attach_result_to_link('archive_org') def archive_dot_org(link_dir, link, timeout=TIMEOUT): @@ -445,7 +485,7 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT): def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR): - args = [binary, '--headless', '--disable-gpu'] + args = [binary, '--headless'] # '--disable-gpu' default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default') if user_data_dir: args.append('--user-data-dir={}'.format(user_data_dir)) diff --git a/config.py b/config.py index 6f85ed09..ceae6c75 100644 --- a/config.py +++ b/config.py @@ -19,6 +19,7 @@ FETCH_AUDIO = os.getenv('FETCH_AUDIO', 'False' FETCH_VIDEO = os.getenv('FETCH_VIDEO', 'False' ).lower() == 'true' FETCH_PDF = os.getenv('FETCH_PDF', 'True' ).lower() == 'true' FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true' +FETCH_DOM = os.getenv('FETCH_DOM', 'True' ).lower() == 'true' FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true' SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true' RESOLUTION = os.getenv('RESOLUTION', '1440,1200' ) diff --git a/links.py b/links.py index 61d968e9..b3fca5d4 100644 --- a/links.py +++ b/links.py @@ -68,6 +68,9 @@ def validate_links(links): if not link['latest'].get('screenshot'): link['latest']['screenshot'] = None + if not link['latest'].get('dom'): + link['latest']['dom'] = None + return list(links) diff --git a/templates/index.html b/templates/index.html index e3037495..22cb888b 100644 --- a/templates/index.html +++ b/templates/index.html @@ -113,6 +113,7 @@