
used derived info for all derivable info

commit fa6f53f2af
parent ef4c446c8b
Author: Nick Sweeting
Date:   2019-02-27 04:49:58 -05:00
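The helpers this commit switches to (extension, domain, base_url, basename, path) are defined elsewhere in the file and do not appear in the hunks below. As a rough sketch of what such URL-derivation helpers could look like, assuming plain urllib parsing (the names come from the diff, the bodies are guesses, not the project's actual code):

from urllib.parse import urlparse


def without_scheme(url):
    # 'https://example.com/a/b.pdf?x=1' -> 'example.com/a/b.pdf?x=1'
    return url.split('://', 1)[-1]


def base_url(url):
    # scheme, query and fragment stripped, so a URL can be matched against folder names
    return without_scheme(url).split('?', 1)[0].split('#', 1)[0]


def domain(url):
    return urlparse(url).netloc


def path(url):
    return urlparse(url).path


def basename(url):
    # last path component, e.g. 'b.pdf'
    return path(url).rstrip('/').rsplit('/', 1)[-1]


def extension(url):
    # lowercase file extension without the dot, '' if there is none
    name = basename(url)
    return name.rsplit('.', 1)[-1].lower() if '.' in name else ''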


@@ -340,19 +340,19 @@ def str_between(string, start, end=None):
 def get_link_type(link):
     """Certain types of links need to be handled specially, this figures out when that's the case"""
-    if link['base_url'].endswith('.pdf'):
+    if extension(link['url']) == 'pdf':
         return 'PDF'
-    elif link['base_url'].rsplit('.', 1) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
+    elif extension(link['url']) in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
         return 'image'
-    elif 'wikipedia.org' in link['domain']:
+    elif 'wikipedia.org' in domain(link['url']).lower():
         return 'wiki'
-    elif 'youtube.com' in link['domain']:
+    elif 'youtube.com' in domain(link['url']).lower():
         return 'youtube'
-    elif 'soundcloud.com' in link['domain']:
+    elif 'soundcloud.com' in domain(link['url']).lower():
         return 'soundcloud'
-    elif 'youku.com' in link['domain']:
+    elif 'youku.com' in domain(link['url']).lower():
         return 'youku'
-    elif 'vimeo.com' in link['domain']:
+    elif 'vimeo.com' in domain(link['url']).lower():
         return 'vimeo'
     return None
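With this change every branch of get_link_type reads only link['url']; hypothetical calls (using the helper sketch above, values illustrative) would classify like this:

get_link_type({'url': 'https://example.com/files/report.pdf'})   # -> 'PDF'
get_link_type({'url': 'https://en.wikipedia.org/wiki/Darwin'})    # -> 'wiki'
get_link_type({'url': 'https://soundcloud.com/artist/track'})     # -> 'soundcloud'
get_link_type({'url': 'https://example.com/article'})             # -> None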
@@ -383,15 +383,15 @@ def find_link(folder, links):
     url = parse_url(folder)
     if url:
         for link in links:
-            if (link['base_url'] in url) or (url in link['url']):
+            if (base_url(link['url']) in url) or (url in link['url']):
                 return link
     timestamp = folder.split('.')[0]
     for link in links:
         if link['timestamp'].startswith(timestamp):
-            if link['domain'] in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
+            if domain(link['url']) in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
                 return link  # careful now, this isn't safe for most ppl
-            if link['domain'] in parse_url(folder):
+            if domain(link['url']) in parse_url(folder):
                 return link
     return None
@@ -405,7 +405,7 @@ def parse_url(folder):
             link_json = f.read().strip()
             if link_json:
                 link = json.loads(link_json)
-                return link['base_url']
+                return base_url(link['url'])
         except ValueError:
             print('File contains invalid JSON: {}!'.format(link_json))
@@ -461,8 +461,8 @@ def fix_folder_path(archive_path, link_folder, link):
     target = os.path.join(archive_path, link['timestamp'])
     url_in_folder = parse_url(source)
-    if not (url_in_folder in link['base_url']
-            or link['base_url'] in url_in_folder):
+    if not (url_in_folder in base_url(link['url'])
+            or base_url(link['url']) in url_in_folder):
         raise ValueError('The link does not match the url for this folder.')
     if not os.path.exists(target):
@@ -550,12 +550,12 @@ def wget_output_path(link, look_in=None):
     urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
     if link['type'] in ('PDF', 'image'):
-        return urlencode(link['base_url'])
+        return urlencode(base_url(link['url']))
     # Since the wget algorithm to for -E (appending .html) is incredibly complex
     # instead of trying to emulate it here, we just look in the output folder
     # to see what html file wget actually created as the output
-    wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
+    wget_folder = base_url(link['url']).rsplit('/', 1)[0].split('/')
     look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
     if look_in and os.path.exists(look_in):
@@ -575,7 +575,7 @@ def wget_output_path(link, look_in=None):
     # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
     #     # already ends in .html
-    #     return urlencode(link['base_url'])
+    #     return urlencode(base_url(link['url']))
     # else:
    #     # .html needs to be appended
     #     without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
@@ -588,7 +588,7 @@ def wget_output_path(link, look_in=None):
     #         return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
     #     elif '/' in without_scheme:
     #         return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
-    # return urlencode(link['base_url'] + '/index.html')
+    # return urlencode(base_url(link['url']) + '/index.html')
 
 
 def derived_link_info(link):
@@ -596,42 +596,45 @@ def derived_link_info(link):
     url = link['url']
-    link_info = {
+    extended_info = {
         **link,
-        'title': link['title'] or url,
+        'title': link['title'] or base_url(url),
         'date': datetime.fromtimestamp(Decimal(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
         'base_url': base_url(url),
         'domain': domain(url),
         'basename': basename(url),
         'path': path(url),
-        # Archive Method Output URLs
-        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
-        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
-        'files_url': 'archive/{timestamp}/index.html'.format(**link),
+    }
+    # Archive Method Output URLs
+    extended_info = {
+        **extended_info,
+        'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**extended_info),
+        'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**extended_info),
+        'files_url': 'archive/{timestamp}/index.html'.format(**extended_info),
         'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link) or 'index.html'),
-        'warc_url': 'archive/{timestamp}/warc'.format(**link),
-        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
-        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
-        'dom_link': 'archive/{timestamp}/output.html'.format(**link),
-        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
-        'git_url': 'archive/{timestamp}/git'.format(**link),
-        'media_url': 'archive/{timestamp}/media'.format(**link),
+        'warc_url': 'archive/{timestamp}/warc'.format(**extended_info),
+        'pdf_link': 'archive/{timestamp}/output.pdf'.format(**extended_info),
+        'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**extended_info),
+        'dom_link': 'archive/{timestamp}/output.html'.format(**extended_info),
+        'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**extended_info),
+        'git_url': 'archive/{timestamp}/git'.format(**extended_info),
+        'media_url': 'archive/{timestamp}/media'.format(**extended_info),
     }
     # PDF and images are handled slightly differently
     # wget, screenshot, & pdf urls all point to the same file
     if link['type'] in ('PDF', 'image'):
-        link_info.update({
-            'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
-            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
-            'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
+        extended_info.update({
+            'archive_url': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+            'pdf_link': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+            'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**extended_info),
+            'dom_link': 'archive/{timestamp}/{base_url}'.format(**extended_info),
             'title': link['title'] or basename(link['url']),
         })
-    return link_info
+    return extended_info
 
 
 def run(*popenargs, input=None, capture_output=False, timeout=None, check=False, **kwargs):
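After this commit, derived_link_info is the one place where all display fields are computed from the stored url and timestamp. A hypothetical input and a few of the fields it would yield (values are illustrative and assume the helper sketch above):

link = {
    'url': 'https://example.com/docs/paper.pdf',
    'timestamp': '1551260998',
    'title': None,
    'type': 'PDF',
}

info = derived_link_info(link)
# info['base_url']    == 'example.com/docs/paper.pdf'
# info['domain']      == 'example.com'
# info['favicon_url'] == 'archive/1551260998/favicon.ico'
# info['title']       == 'paper.pdf'   (the PDF/image branch falls back to basename)
# info['archive_url'] == 'archive/1551260998/example.com/docs/paper.pdf'
#   (for PDF/image links, archive_url, pdf_link, screenshot_link and dom_link
#    all point at the same wget output file)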