
fetch page title during archiving process

Nick Sweeting 2019-02-19 01:44:54 -05:00
parent bb5879a4f7
commit 5a7d00a639
5 changed files with 44 additions and 15 deletions


@@ -12,6 +12,8 @@ from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
     CHROME_BINARY,
+    FETCH_FAVICON,
+    FETCH_TITLE,
     FETCH_WGET,
     FETCH_WGET_REQUISITES,
     FETCH_PDF,
@@ -23,7 +25,6 @@ from config import (
     RESOLUTION,
     CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
-    FETCH_FAVICON,
     WGET_USER_AGENT,
     CHROME_USER_DATA_DIR,
     CHROME_SANDBOX,
@@ -36,6 +37,7 @@ from config import (
 )
 from util import (
     check_dependencies,
+    fetch_page_title,
     progress,
     chmod_file,
     pretty_path,
@@ -96,6 +98,9 @@ def archive_link(link_dir, link, overwrite=True):
     if FETCH_FAVICON:
         link = fetch_favicon(link_dir, link, overwrite=overwrite)
 
+    if FETCH_TITLE:
+        link = fetch_title(link_dir, link, overwrite=overwrite)
+
     if FETCH_WGET:
         link = fetch_wget(link_dir, link, overwrite=overwrite)
@@ -129,7 +134,7 @@ def log_link_archive(link_dir, link, update_existing):
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **link,
+        **{**link, 'title': link['title'] or link['url']},
         **ANSI,
     ))
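
A side note on the `**{**link, 'title': link['title'] or link['url']}` change above: rebuilding the dict with {**link, ...} overrides the title key only for this one format() call, so untitled links print their URL without the stored record being mutated. A minimal sketch of the pattern, with a made-up link:

    # made-up link record, for illustration only
    link = {'url': 'https://example.com/post', 'title': None}

    # {**link, ...} copies the dict and overrides 'title' just for this call
    print('[+] {title}'.format(**{**link, 'title': link['title'] or link['url']}))
    # -> [+] https://example.com/post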
@ -492,6 +497,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
'output': output, 'output': output,
} }
@attach_result_to_link('title')
def fetch_title(link_dir, link, timeout=TIMEOUT):
"""try to guess the page's title from its content"""
# if link already has valid title, skip it
if link['title'] and not link['title'].lower().startswith('http'):
return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
end = progress(timeout, prefix=' ')
try:
title = fetch_page_title(link['url'], timeout=timeout, progress=False)
end()
output = title
except Exception as e:
end()
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
output = e
return {
'cmd': 'fetch_page_title("{}")'.format(link['url']),
'output': output,
}
@attach_result_to_link('media') @attach_result_to_link('media')
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
"""Download playlists or individual video, audio, and subtitles using youtube-dl""" """Download playlists or individual video, audio, and subtitles using youtube-dl"""


@@ -27,6 +27,7 @@ FETCH_WARC = os.getenv('FETCH_WARC', 'True'
 FETCH_GIT =              os.getenv('FETCH_GIT',              'True'             ).lower() == 'true'
 FETCH_MEDIA =            os.getenv('FETCH_MEDIA',            'True'             ).lower() == 'true'
 FETCH_FAVICON =          os.getenv('FETCH_FAVICON',          'True'             ).lower() == 'true'
+FETCH_TITLE =            os.getenv('FETCH_TITLE',            'True'             ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True'             ).lower() == 'true'
 CHECK_SSL_VALIDITY =     os.getenv('CHECK_SSL_VALIDITY',     'True'             ).lower() == 'true'
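
The new flag follows this file's existing convention: every option is an environment variable parsed as a case-insensitive boolean string defaulting to 'True'. A quick check of the parsing rule outside the app (the variable name matches config.py; the rest is throwaway):

    import os

    os.environ['FETCH_TITLE'] = 'False'  # any casing of 'true' enables the step
    FETCH_TITLE = os.getenv('FETCH_TITLE', 'True').lower() == 'true'
    print(FETCH_TITLE)  # -> False, so archive_link() will skip fetch_title()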


@@ -57,7 +57,7 @@ def validate_links(links):
         raise SystemExit(1)
 
     for link in links:
-        link['title'] = unescape(link['title'])
+        link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}
         latest = link['latest']
 
@@ -76,6 +76,9 @@ def validate_links(links):
         if not latest.get('favicon'):
             latest['favicon'] = None
 
+        if not link['latest'].get('title'):
+            link['latest']['title'] = link['title']
+
     return list(links)
 
 def new_links(all_links, existing_links):
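Both validate_links changes guard against links that were imported without a title: unescape(None) raises TypeError, and older index entries should inherit the parsed title when their history lacks one. A small sketch with a made-up record (the `from html import unescape` import path is assumed to match this module):

    from html import unescape

    link = {'title': 'Q&amp;A &#8211; Page', 'latest': {}}

    # None-safe unescape, as in the first hunk above
    link['title'] = unescape(link['title']) if link['title'] else None
    print(link['title'])  # -> 'Q&A – Page'

    # backfill from the second hunk: the latest history inherits the parsed title
    if not link['latest'].get('title'):
        link['latest']['title'] = link['title']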

@@ -44,6 +44,7 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 short_ts = lambda ts: ts.split('.')[0]
 
 URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\"]+'
+HTML_TITLE_REGEX = '<title>(.[^<>]+)'
 
 
 def check_dependencies():
@@ -227,22 +228,17 @@ def download_url(url):
     return source_path
 
 
-def fetch_page_title(url, default=True):
+def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
-    if default is True:
-        default = url
-
     try:
-        if SHOW_PROGRESS:
+        if progress:
             sys.stdout.write('.')
             sys.stdout.flush()
 
         html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
         match = re.search('<title>(.*?)</title>', html_content)
         return match.group(1) if match else default or None
     except Exception:
-        if default is False:
-            raise
-        return default
+        return None
 
 
 def str_between(string, start, end=None):
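
Two quirks of the rewritten function are worth flagging: the new timeout parameter is not forwarded to urlopen (the hardcoded timeout=10 survives as a context line), and the no-match branch still references default, which the new signature removed, so a page without a <title> tag raises a NameError that the bare except converts into None anyway. A self-contained version with both points tidied up, under the assumption that returning None on any failure is the intended contract:

    import re
    import urllib.request

    def fetch_page_title(url, timeout=10):
        """Attempt to guess a page's title by downloading the html."""
        try:
            html = urllib.request.urlopen(url, timeout=timeout).read().decode('utf-8')
            match = re.search('<title>(.*?)</title>', html)
            return match.group(1) if match else None
        except Exception:
            return None  # silent failure; callers fall back to the URL

    print(fetch_page_title('https://example.com'))  # -> 'Example Domain' (requires network)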
@@ -277,19 +273,19 @@ def merge_links(a, b):
     """deterministically merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     """
-    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
     earlier = lambda key: a[key] if a[key] < b[key] else b[key]
 
     url = longer('url')
     longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
     link = {
         'timestamp': earlier('timestamp'),
         'url': url,
         'domain': domain(url),
         'base_url': base_url(url),
         'tags': longer('tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
     }
     link['type'] = get_link_type(link)
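
The reworked longer lambda is what makes merging safe for links whose titles were never fetched: it only compares lengths when both values are truthy, otherwise it returns whichever side is set. Demonstrated with throwaway dicts:

    a = {'title': 'A Longer Saved Title'}
    b = {'title': None}

    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])

    print(longer('title'))  # -> 'A Longer Saved Title' (the None side is ignored)
    # both set: the longer one wins; both None: the result is None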
@@ -532,7 +528,7 @@ def derived_link_info(link):
         'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
         'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
         'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
-        'title': '{title} ({type})'.format(**link),
+        'title': link['title'] or basename(link['url']),
     })
 
     return link_info
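
The old '{title} ({type})' template rendered the literal string 'None (...)' for untitled links; the new expression falls back to the last path segment of the URL instead. For reference (basename here is assumed to be os.path.basename, already imported by this module):

    from os.path import basename

    link = {'url': 'https://example.com/blog/some-post.html', 'title': None}
    print(link['title'] or basename(link['url']))  # -> 'some-post.html'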


@@ -10,6 +10,7 @@
 # FETCH_MEDIA=False
 # FETCH_GIT=True
 # FETCH_FAVICON=True
+# FETCH_TITLE=True
 # SUBMIT_ARCHIVE_DOT_ORG=True
 
 ### To only download new links, and never attempt to update old ones, uncomment this line: