From d59bdbc86a971fc75024d93575ac9f8b1a0ca740 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 4 Jul 2017 04:03:09 -0500 Subject: [PATCH] fix archive urls not pointing to .html wget versions!! --- archive.py | 21 ++++++++++++++++++++- examples/firefox_export.html | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/archive.py b/archive.py index c4e5ed25..7038d772 100755 --- a/archive.py +++ b/archive.py @@ -326,6 +326,22 @@ def valid_links(links): """remove chrome://, about:// or other schemed links that cant be archived""" return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp')) +def calculate_archive_url(link): + """calculate the path to the wgetted html file, since wget may adjust some paths + to be different than the base_url path + See docs on wget --adjust-extension.""" + + split_url = link['url'].split('#', 1) + + if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M): + # already ends in .html + return link['base_url'] + else: + # .html needs to be appended + url = split_url[0] if not split_url[0].endswith('/') else split_url[0][:-1] + without_scheme = url.split('://', 1)[-1] + return '#'.join([without_scheme + '.html', *split_url[1:]]) + def dump_index(links, service): """create index.html file for a given list of links and service""" @@ -337,7 +353,7 @@ def dump_index(links, service): link_html = """\ {time} - + {title} {tags} @@ -352,6 +368,9 @@ def dump_index(links, service): # since we dont screenshot or PDF links that are images or PDFs, change those links to point to the wget'ed file link_info = {**link} + # append .html to archive links that dont have it, since wget appends .html to everything + link_info['archive_url'] = calculate_archive_url(link) + # add link type to title if link['type']: link_info.update({'title': '{title} ({type})'.format(**link)}) diff --git a/examples/firefox_export.html b/examples/firefox_export.html index b9175928..56ffd77d 100644 --- a/examples/firefox_export.html +++ b/examples/firefox_export.html @@ -21,6 +21,7 @@
firefox export bookmarks at DuckDuckGo
archive firefox bookmarks at DuckDuckGo
nodiscc (nodiscc) · GitHub +
pirate/bookmark-archiver · Github
Firefox Bookmarks Archiver - gHacks Tech News

Bookmarks Toolbar