fetch page title during archiving process
parent bb5879a4f7
commit 5a7d00a639
5 changed files with 44 additions and 15 deletions
@@ -12,6 +12,8 @@ from index import wget_output_path, parse_json_link_index, write_link_index
 from links import links_after_timestamp
 from config import (
     CHROME_BINARY,
+    FETCH_FAVICON,
+    FETCH_TITLE,
     FETCH_WGET,
     FETCH_WGET_REQUISITES,
     FETCH_PDF,

@@ -23,7 +25,6 @@ from config import (
     RESOLUTION,
     CHECK_SSL_VALIDITY,
     SUBMIT_ARCHIVE_DOT_ORG,
-    FETCH_FAVICON,
     WGET_USER_AGENT,
     CHROME_USER_DATA_DIR,
     CHROME_SANDBOX,

@@ -36,6 +37,7 @@ from config import (
 )
 from util import (
     check_dependencies,
+    fetch_page_title,
     progress,
     chmod_file,
     pretty_path,

@@ -96,6 +98,9 @@ def archive_link(link_dir, link, overwrite=True):
     if FETCH_FAVICON:
         link = fetch_favicon(link_dir, link, overwrite=overwrite)
 
+    if FETCH_TITLE:
+        link = fetch_title(link_dir, link, overwrite=overwrite)
+
     if FETCH_WGET:
         link = fetch_wget(link_dir, link, overwrite=overwrite)
 

@@ -129,7 +134,7 @@ def log_link_archive(link_dir, link, update_existing):
         symbol='*' if update_existing else '+',
         symbol_color=ANSI['black' if update_existing else 'green'],
         now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        **link,
+        **{**link, 'title': link['title'] or link['url']},
         **ANSI,
     ))
 
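The **{**link, 'title': ...} spread in log_link_archive rebuilds the format kwargs so a link with no title falls back to its URL in the log line. A quick illustration with made-up values:

link = {'url': 'https://example.com/post', 'timestamp': '1518000000.0', 'title': None}
kwargs = {**link, 'title': link['title'] or link['url']}
print(kwargs['title'])   # 'https://example.com/post' (a None title is replaced by the url)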
@@ -492,6 +497,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
         'output': output,
     }
 
+@attach_result_to_link('title')
+def fetch_title(link_dir, link, timeout=TIMEOUT):
+    """try to guess the page's title from its content"""
+
+    # if link already has valid title, skip it
+    if link['title'] and not link['title'].lower().startswith('http'):
+        return {'output': link['title'], 'cmd': 'fetch_page_title("{}")'.format(link['url'])}
+
+    end = progress(timeout, prefix=' ')
+    try:
+        title = fetch_page_title(link['url'], timeout=timeout, progress=False)
+        end()
+        output = title
+    except Exception as e:
+        end()
+        print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
+        output = e
+
+    return {
+        'cmd': 'fetch_page_title("{}")'.format(link['url']),
+        'output': output,
+    }
+
 @attach_result_to_link('media')
 def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
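fetch_title only hits the network when the link does not already carry a usable title; an empty title, or one that starts with 'http' (i.e. the URL echoed back as a title), triggers a real fetch. A minimal sketch of that guard, with a made-up helper name and sample values:

def has_usable_title(link):
    # same check fetch_title performs inline before deciding to download the page
    return bool(link['title']) and not link['title'].lower().startswith('http')

print(has_usable_title({'url': 'https://example.com', 'title': 'Example Domain'}))      # True, fetch skipped
print(has_usable_title({'url': 'https://example.com', 'title': 'https://example.com'})) # False, fetched
print(has_usable_title({'url': 'https://example.com', 'title': None}))                  # False, fetched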
@@ -27,6 +27,7 @@ FETCH_WARC = os.getenv('FETCH_WARC', 'True'
 FETCH_GIT = os.getenv('FETCH_GIT', 'True' ).lower() == 'true'
 FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'True' ).lower() == 'true'
 FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true'
+FETCH_TITLE = os.getenv('FETCH_TITLE', 'True' ).lower() == 'true'
 SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true'
 
 CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
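The new FETCH_TITLE flag follows the same os.getenv pattern as the other FETCH_* toggles: unset or the string 'true' (case-insensitive) enables the step, anything else disables it. A small sketch of the parsing, using an assumed environment value:

import os

os.environ['FETCH_TITLE'] = 'False'   # e.g. exported by the user before running the archiver
FETCH_TITLE = os.getenv('FETCH_TITLE', 'True').lower() == 'true'
print(FETCH_TITLE)   # False, so archive_link() skips the fetch_title step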
@@ -57,7 +57,7 @@ def validate_links(links):
         raise SystemExit(1)
 
     for link in links:
-        link['title'] = unescape(link['title'])
+        link['title'] = unescape(link['title']) if link['title'] else None
         link['latest'] = link.get('latest') or {}
 
         latest = link['latest']

@@ -76,6 +76,9 @@ def validate_links(links):
         if not latest.get('favicon'):
             latest['favicon'] = None
 
+        if not link['latest'].get('title'):
+            link['latest']['title'] = link['title']
+
     return list(links)
 
 def new_links(all_links, existing_links):
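validate_links now guards unescape() against links that have no title at all, and copies the top-level title into latest['title'] when no per-run title exists yet. Assuming the unescape used here is the stdlib html.unescape, the guard behaves like this:

from html import unescape   # assumption: validate_links uses html.unescape

title = 'Tom &amp; Jerry'
print(unescape(title) if title else None)   # 'Tom & Jerry'

title = None
print(unescape(title) if title else None)   # None (the guard keeps None out of unescape)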
@@ -44,6 +44,7 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 short_ts = lambda ts: ts.split('.')[0]
 
 URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))[^<\""]+'
+HTML_TITLE_REGEX = '<title>(.[^<>]+)'
 
 
 def check_dependencies():
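Both the new HTML_TITLE_REGEX constant and the inline pattern used in fetch_page_title pull the title out of the raw HTML with a regular expression rather than a parser. A rough illustration against a made-up snippet:

import re

html = '<html><head><title>Example Domain</title></head><body>hello</body></html>'

match = re.search('<title>(.*?)</title>', html)   # pattern used inside fetch_page_title
print(match.group(1) if match else None)          # 'Example Domain'

match = re.search('<title>(.[^<>]+)', html)       # the new HTML_TITLE_REGEX constant
print(match.group(1) if match else None)          # 'Example Domain'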
@@ -227,22 +228,17 @@ def download_url(url):
     return source_path
 
 
-def fetch_page_title(url, default=True):
+def fetch_page_title(url, timeout=10, progress=SHOW_PROGRESS):
     """Attempt to guess a page's title by downloading the html"""
-    if default is True:
-        default = url
-
     try:
-        if SHOW_PROGRESS:
+        if progress:
             sys.stdout.write('.')
             sys.stdout.flush()
         html_content = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
         match = re.search('<title>(.*?)</title>', html_content)
         return match.group(1) if match else default or None
     except Exception:
-        if default is False:
-            raise
-        return default
+        return None
 
 
 def str_between(string, start, end=None):
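The rewritten fetch_page_title drops the default parameter and simply returns None when the download or the regex fails (note the unchanged return line still mentions default, which the new signature no longer defines). A minimal self-contained version of the same approach, with its own None fallback and a hypothetical name:

import re
import urllib.request

def page_title(url, timeout=10):
    # download the html and regex out the <title>, returning None on any failure
    try:
        html_content = urllib.request.urlopen(url, timeout=timeout).read().decode('utf-8')
        match = re.search('<title>(.*?)</title>', html_content)
        return match.group(1) if match else None
    except Exception:
        return None

print(page_title('https://example.com'))   # 'Example Domain', assuming the request succeeds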
@@ -277,19 +273,19 @@ def merge_links(a, b):
     """deterministially merge two links, favoring longer field values over shorter,
     and "cleaner" values over worse ones.
     """
-    longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
+    longer = lambda key: (a[key] if len(a[key]) > len(b[key]) else b[key]) if (a[key] and b[key]) else (a[key] or b[key])
     earlier = lambda key: a[key] if a[key] < b[key] else b[key]
 
     url = longer('url')
     longest_title = longer('title')
-    cleanest_title = a['title'] if '://' not in a['title'] else b['title']
+    cleanest_title = a['title'] if '://' not in (a['title'] or '') else b['title']
     link = {
         'timestamp': earlier('timestamp'),
         'url': url,
         'domain': domain(url),
         'base_url': base_url(url),
         'tags': longer('tags'),
-        'title': longest_title if '://' not in longest_title else cleanest_title,
+        'title': longest_title if '://' not in (longest_title or '') else cleanest_title,
         'sources': list(set(a.get('sources', []) + b.get('sources', []))),
     }
     link['type'] = get_link_type(link)
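The None-safe longer and cleanest_title changes matter when only one of the two links being merged has a title, e.g. one imported before title fetching existed; the old lambdas would raise TypeError on len(None). A tiny worked example of the new expressions:

a_title, b_title = None, 'Example Domain'

# new longer(): if either side is falsy, just take whichever value exists
longest_title = (a_title if len(a_title) > len(b_title) else b_title) if (a_title and b_title) else (a_title or b_title)

# new cleanest_title: a None title no longer breaks the '://' membership test
cleanest_title = a_title if '://' not in (a_title or '') else b_title

# same final pick merge_links makes for the 'title' field
print(longest_title if '://' not in (longest_title or '') else cleanest_title)   # 'Example Domain'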
@@ -532,7 +528,7 @@ def derived_link_info(link):
         'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
         'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
         'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
-        'title': '{title} ({type})'.format(**link),
+        'title': link['title'] or basename(link['url']),
     })
     return link_info
 
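In derived_link_info the index entry no longer renders '{title} ({type})'; a missing title now falls back to the last path segment of the URL. Assuming the basename used here is os.path.basename, the fallback looks like this:

from os.path import basename   # assumption: util.py's basename is os.path.basename

link = {'url': 'https://example.com/blog/some-post.html', 'title': None}
print(link['title'] or basename(link['url']))   # 'some-post.html'

link = {'url': 'https://example.com/blog/some-post.html', 'title': 'Some Post'}
print(link['title'] or basename(link['url']))   # 'Some Post'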
@@ -10,6 +10,7 @@
 # FETCH_MEDIA=False
 # FETCH_GIT=True
 # FETCH_FAVICON=True
+# FETCH_TITLE=True
 # SUBMIT_ARCHIVE_DOT_ORG=True
 
 ### To only download new links, and never attempt to update old ones, uncomment this line: