def calculate_archive_url(link):
    """Calculate the path to the wgetted html file for a link.

    wget may adjust some paths to be different than the base_url path
    (see the wget docs on --adjust-extension), so the local file name is
    derived from the URL instead of being assumed.

    Args:
        link: dict providing at least the keys 'url' and 'base_url'.

    Returns:
        link['base_url'] unchanged when the URL (ignoring any '#fragment')
        already ends in .htm/.html. Otherwise the URL without its scheme,
        with '.html' appended -- or 'index.html' appended when the URL ends
        in a slash -- followed by the original '#fragment' if there was one.
    """
    # Only the part before the fragment decides the file name; hash_sep and
    # fragment are both '' when the URL has no '#', so they can be re-appended
    # unconditionally below.
    url, hash_sep, fragment = link['url'].partition('#')

    if re.search(r'.+\.html?$', url, re.I | re.M):
        # already ends in .htm/.html, nothing needs to be appended
        # NOTE(review): base_url is presumably the URL without its scheme
        # and fragment -- confirm against the code that builds `link`.
        return link['base_url']

    without_scheme = url.split('://', 1)[-1]
    if without_scheme.endswith('/'):
        # Directory-style URL: wget saves it under its default page name
        # inside that directory, not as a sibling "<dir>.html" file.
        archive_path = without_scheme + 'index.html'
    else:
        # .html needs to be appended
        archive_path = without_scheme + '.html'

    return archive_path + hash_sep + fragment