#!/usr/bin/env python3
# Bookmark Archiver
# Nick Sweeting 2017 | MIT License
# https://github.com/pirate/bookmark-archiver
import re
import os
import sys
import json
import time
from datetime import datetime
from subprocess import run, PIPE, DEVNULL

__DESCRIPTION__ = 'Bookmark Archiver: Create a browsable html archive of a list of links.'
__DOCUMENTATION__ = 'https://github.com/pirate/bookmark-archiver'


### SETTINGS

INDEX_TEMPLATE = 'index_template.html'

# os.getenv('VARIABLE', 'DEFAULT') reads the value of the environment variable
# "VARIABLE", falling back to 'DEFAULT' if it is not set. For boolean settings,
# the string is lower-cased and compared against 'true' to get a Python bool.
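#
# For example, the defaults below can be overridden at run time with something
# like the following (a hypothetical invocation, assuming a POSIX shell):
#   $ FETCH_PDF=False RESOLUTION=1920,1080 CHROME_BINARY=google-chrome ./archive.py bookmarks_export.html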
FETCH_WGET = os.getenv('FETCH_WGET', 'True').lower() == 'true'
FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True').lower() == 'true'
FETCH_PDF = os.getenv('FETCH_PDF', 'True').lower() == 'true'
FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True').lower() == 'true'
FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True').lower() == 'true'
SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True').lower() == 'true'
RESOLUTION = os.getenv('RESOLUTION', '1440,900')
ARCHIVE_PERMISSIONS = os.getenv('ARCHIVE_PERMISSIONS', '755')
CHROME_BINARY = os.getenv('CHROME_BINARY', 'chromium-browser')  # change to google-chrome browser if using google-chrome
WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
TIMEOUT = int(os.getenv('TIMEOUT', '60'))


def check_dependencies():
    print('[*] Checking Dependencies:')

    if FETCH_PDF or FETCH_SCREENSHOT:
        if run(['which', CHROME_BINARY]).returncode:
            print('[X] Missing dependency: {}'.format(CHROME_BINARY))
            print('See https://github.com/pirate/bookmark-archiver for help.')
            raise SystemExit(1)

        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
        result = run([CHROME_BINARY, '--version'], stdout=PIPE)
        version = result.stdout.decode('utf-8').replace('Google Chrome ', '').replace('Chromium ', '').split(' ', 1)[0].split('.', 1)[0]  # TODO: regex might be better
        if int(version) < 59:
            print('[X] Chrome version must be 59 or greater for headless PDF and screenshot saving')
            print('See https://github.com/pirate/bookmark-archiver for help.')
            raise SystemExit(1)

    if FETCH_WGET:
        if run(['which', WGET_BINARY]).returncode:
            print('[X] Missing dependency: {}'.format(WGET_BINARY))
            print('See https://github.com/pirate/bookmark-archiver for help.')
            raise SystemExit(1)

    if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
        if run(['which', 'curl']).returncode:
            print('[X] Missing dependency: curl')
            print('See https://github.com/pirate/bookmark-archiver for help.')
            raise SystemExit(1)


### PARSING READER LIST EXPORTS

def get_link_type(link):
    """Certain types of links need to be handled specially, this figures out when that's the case"""
    if link['base_url'].endswith('.pdf'):
        return 'PDF'
    elif link['base_url'].rsplit('.', 1)[-1] in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
        return 'image'
    elif 'wikipedia.org' in link['domain']:
        return 'wiki'
    elif 'youtube.com' in link['domain']:
        return 'youtube'
    return None


def parse_pocket_export(html_file):
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
    html_file.seek(0)
    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)  # see sample input in ./example_ril_export.html
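    # A matching line in a Pocket export looks roughly like the following
    # (illustrative values, not taken from a real export):
    #   <li><a href="https://example.com/some-article" time_added="1493974830" tags="tech,to-read">Some Article</a></li>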
    for line in html_file:
        match = pattern.search(line)
        if match:
            fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')  # remove old readability prefixes to get original url
            without_scheme = fixed_url.replace('http://', '').replace('https://', '')
            info = {
                'url': fixed_url,
                'domain': without_scheme.split('/')[0],    # without pathname
                'base_url': without_scheme.split('?')[0],  # without query args
                'time': datetime.fromtimestamp(int(match.group(2))).strftime('%Y-%m-%d %H:%M'),
                'timestamp': match.group(2),
                'tags': match.group(3),
                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or without_scheme,
            }
            info['type'] = get_link_type(info)
            yield info


def parse_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/)"""
    json_file.seek(0)
    json_content = json.load(json_file)
    for line in json_content:
        if line:
            erg = line
            info = {
                'url': erg['href'],
                'domain': erg['href'].replace('http://', '').replace('https://', '').split('/')[0],
                'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?')[0],
                'time': datetime.fromtimestamp(int(time.mktime(time.strptime(erg['time'].split(',')[0], '%Y-%m-%dT%H:%M:%SZ')))),
                'timestamp': str(int(time.mktime(time.strptime(erg['time'].split(',')[0], '%Y-%m-%dT%H:%M:%SZ')))),
                'tags': erg['tags'],
                'title': erg['description'].replace(' — Readability', ''),
            }
            info['type'] = get_link_type(info)
            yield info


def parse_bookmarks_export(html_file):
    """Parse netscape-format bookmarks export files (produced by all browsers)"""
    html_file.seek(0)
    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
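    # A matching line from a browser bookmarks export looks roughly like the following
    # (illustrative values, not taken from a real export):
    #   <DT><A HREF="https://example.com/" ADD_DATE="1493974830">Example Site</A>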
    for line in html_file:
        match = pattern.search(line)
        if match:
            url = match.group(1)
            secs = match.group(2)
            dt = datetime.fromtimestamp(int(secs))
            info = {
                'url': url,
                'domain': url.replace('http://', '').replace('https://', '').split('/')[0],
                'base_url': url.replace('https://', '').replace('http://', '').split('?')[0],
                'time': dt,
                'timestamp': secs,
                'tags': "",
                'title': match.group(3),
            }
            info['type'] = get_link_type(info)
            yield info


### ARCHIVING FUNCTIONS

def chmod_file(path, cwd='.', permissions=ARCHIVE_PERMISSIONS):
    if not os.path.exists(os.path.join(cwd, path)):
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=5)
    if chmod_result.returncode == 1:
        print(' ', chmod_result.stderr.decode())
        raise Exception('Failed to chmod {}/{}'.format(cwd, path))


def fetch_wget(out_dir, link, overwrite=False):
    """download full site using wget"""
    domain = link['base_url'].split('/', 1)[0]
    if not os.path.exists('{}/{}'.format(out_dir, domain)) or overwrite:
        print('- Downloading Full Site')
        CMD = [
            WGET_BINARY,
            *'--timestamping --adjust-extension --no-parent'.split(' '),  # Docs: https://www.gnu.org/software/wget/manual/wget.html
            *(('--page-requisites', '--convert-links') if FETCH_WGET_REQUISITES else ()),
            link['url'],
        ]
        try:
            result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=out_dir, timeout=TIMEOUT)  # dom.html
            if result.returncode > 0:
                print(' ', result.stderr.decode().split('\n')[-1])
                raise Exception('Failed to wget download')
            chmod_file(domain, cwd=out_dir)
        except Exception as e:
            print('Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
            print('Failed: {} {}'.format(e.__class__.__name__, e))
    else:
        print('√ Skipping site download')


def fetch_pdf(out_dir, link, overwrite=False):
    """print PDF of site to file using chrome --headless"""
    if (not os.path.exists('{}/output.pdf'.format(out_dir)) or overwrite) and link['type'] not in ('PDF', 'image'):
        print('- Printing PDF')
        CMD = [
            CHROME_BINARY,
            *'--headless --disable-gpu --print-to-pdf'.split(' '),
            link['url'],
        ]
        try:
            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=TIMEOUT)  # output.pdf
            if result.returncode:
                print(' ', result.stderr.decode())
                raise Exception('Failed to print PDF')
            chmod_file('output.pdf', cwd=out_dir)
        except Exception as e:
            print('Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
            print('Failed: {} {}'.format(e.__class__.__name__, e))
    else:
        print('√ Skipping PDF print')


def fetch_screenshot(out_dir, link, overwrite=False):
    """take screenshot of site using chrome --headless"""
    if (not os.path.exists('{}/screenshot.png'.format(out_dir)) or overwrite) and link['type'] not in ('PDF', 'image'):
        print('- Snapping Screenshot')
        CMD = [
            CHROME_BINARY,
            *'--headless --disable-gpu --screenshot'.split(' '),
            '--window-size={}'.format(RESOLUTION),
            link['url'],
        ]
        try:
            result = run(CMD, stdout=DEVNULL, stderr=PIPE, cwd=out_dir, timeout=TIMEOUT)  # screenshot.png
            if result.returncode:
                print(' ', result.stderr.decode())
                raise Exception('Failed to take screenshot')
            chmod_file('screenshot.png', cwd=out_dir)
        except Exception as e:
            print('Run to see full output:', 'cd {}; {}'.format(out_dir, ' '.join(CMD)))
            print('Failed: {} {}'.format(e.__class__.__name__, e))
    else:
        print('√ Skipping screenshot')


def archive_dot_org(out_dir, link, overwrite=False):
    """submit site to archive.org for archiving via their service, save returned archive url"""
    if (not os.path.exists('{}/archive.org.txt'.format(out_dir)) or overwrite):
        print('- Submitting to archive.org')
        submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])

        success = False
        CMD = ['curl', '-I', submit_url]
        try:
            result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=out_dir, timeout=TIMEOUT)  # archive.org
            headers = result.stdout.splitlines()
            content_location = [h for h in headers if b'Content-Location: ' in h]
            if content_location:
                archive_path = content_location[0].split(b'Content-Location: ', 1)[-1].decode('utf-8')
                saved_url = 'https://web.archive.org{}'.format(archive_path)
                success = True
            else:
                raise Exception('Failed to find "Content-Location" URL header in Archive.org response.')
        except Exception as e:
            print('Visit url to see output:', ' '.join(CMD))
            print('Failed: {} {}'.format(e.__class__.__name__, e))

        if success:
            with open('{}/archive.org.txt'.format(out_dir), 'w') as f:
                f.write(saved_url)
            chmod_file('archive.org.txt', cwd=out_dir)
    else:
        print('√ Skipping archive.org')


def fetch_favicon(out_dir, link, overwrite=False):
    """download site favicon from google's favicon api"""
    if not os.path.exists('{}/favicon.ico'.format(out_dir)) or overwrite:
        print('- Fetching Favicon')
        CMD = 'curl https://www.google.com/s2/favicons?domain={domain}'.format(**link).split(' ')
        fout = open('{}/favicon.ico'.format(out_dir), 'w')
        try:
            run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=TIMEOUT)  # favicon.ico
            chmod_file('favicon.ico', cwd=out_dir)
        except Exception as e:
            print('Run to see full output:', ' '.join(CMD))
            print('Failed: {} {}'.format(e.__class__.__name__, e))
        fout.close()
    else:
        print('√ Skipping favicon')


### ORCHESTRATION

def next_uniq_timestamp(used_timestamps, timestamp):
    """resolve duplicate timestamps by appending a decimal: 1234, 1234 -> 1234.1, 1234.2"""
    if timestamp not in used_timestamps:
        return timestamp

    if '.' in timestamp:
        timestamp, nonce = timestamp.split('.')
        nonce = int(nonce)
    else:
        nonce = 1

    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)

    return new_timestamp


def uniquefied_links(links):
    """uniqueify link timestamps by de-duping using url, returns links sorted most recent -> oldest

    needed because firefox will produce exports where many links share the same timestamp, this func
    ensures that all non-duplicate links have monotonically increasing timestamps
    """
    links = list(reversed(sorted(links, key=lambda l: (l['timestamp'], l['url']))))
    seen_timestamps = {}
    for link in links:
        t = link['timestamp']
        if t in seen_timestamps:
            if link['url'] == seen_timestamps[t]['url']:
                # don't create a new unique timestamp if the link is the same
                continue
            else:
                # resolve the duplicate timestamp by appending a decimal
                link['timestamp'] = next_uniq_timestamp(seen_timestamps, link['timestamp'])
        seen_timestamps[link['timestamp']] = link
    return links


def valid_links(links):
    """remove chrome://, about:// or other schemed links that can't be archived"""
    return (link for link in links if link['url'].startswith('http') or link['url'].startswith('ftp'))


def calculate_archive_url(link):
    """calculate the path to the wgetted html file, since wget may
    adjust some paths to be different than the base_url path.

    See docs on wget --adjust-extension."""
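    # Illustrative examples (hypothetical URLs), assuming wget's usual output layout:
    #   'https://example.com/page.html' -> 'example.com/page.html'   (already ends in .html)
    #   'https://example.com/page'      -> 'example.com/page.html'   (.html is appended)
    #   'https://example.com/'          -> 'example.com/index.html'  (trailing slash becomes index.html)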
    split_url = link['url'].split('#', 1)

    if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
        # already ends in .html
        return link['base_url']
    else:
        # .html needs to be appended
        without_scheme = split_url[0].split('://', 1)[-1]
        if without_scheme.endswith('/'):
            return '#'.join([without_scheme + 'index.html', *split_url[1:]])
        return '#'.join([without_scheme + '.html', *split_url[1:]])


def dump_index(links, service):
    """create index.html file for a given list of links and service"""
    with open(INDEX_TEMPLATE, 'r') as f:
        index_html = f.read()

    # TODO: refactor this out into index_template.html
    link_html = """\
    <tr>
        <td>{time}</td>
        <td><a href="archive/{timestamp}/{archive_url}" style="font-size:1.4em;text-decoration:none;color:black;" title="{title}">
            <img src="archive/{timestamp}/favicon.ico">
            {title} <small style="background-color: #eee;border-radius:4px; float:right">{tags}</small>
        </td>
        <td style="text-align:center"><a href="archive/{timestamp}/" title="Files">📂</a></td>
        <td style="text-align:center"><a href="{pdf_link}" title="PDF">📄</a></td>
        <td style="text-align:center"><a href="{screenshot_link}" title="Screenshot">🖼</a></td>
        <td style="text-align:center"><a href="https://web.archive.org/web/{base_url}" title="Archive.org">🏛</a></td>
        <td>🔗 <img src="https://www.google.com/s2/favicons?domain={domain}" height="16px"> <a href="{url}">{url}</a></td>
    </tr>"""

    def get_template_vars(link):
        # since we don't screenshot or PDF links that are images or PDFs, change those links to point to the wget'ed file
        link_info = {**link}

        # PDF and images are handled slightly differently:
        # the wget, screenshot, & pdf urls all point to the same file
        if link['type'] in ('PDF', 'image'):
            link_info.update({
                'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
                'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
                'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
                'title': '{title} ({type})'.format(**link),
            })
        else:
            link_info.update({
                'archive_url': calculate_archive_url(link),
                'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
                'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
            })
        return link_info
    article_rows = '\n'.join(
        link_html.format(**get_template_vars(link)) for link in links
    )
    template_vars = (datetime.now().strftime('%Y-%m-%d %H:%M'), article_rows)

    with open(''.join((service, '/index.html')), 'w') as f:
        f.write(index_html.format(*template_vars))


def dump_website(link, service, overwrite=False):
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
    print('[+] [{timestamp} ({time})] "{title}": {base_url}'.format(**link))

    out_dir = ''.join((service, '/archive/{timestamp}')).format(**link)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    run(['chmod', ARCHIVE_PERMISSIONS, out_dir], timeout=5)

    if link['type']:
        print('i Type: {}'.format(link['type']))

    if not (link['url'].startswith('http') or link['url'].startswith('ftp')):
        print('X Skipping: invalid link.')
        return

    if FETCH_WGET:
        fetch_wget(out_dir, link, overwrite=overwrite)
    if FETCH_PDF:
        fetch_pdf(out_dir, link, overwrite=overwrite)
    if FETCH_SCREENSHOT:
        fetch_screenshot(out_dir, link, overwrite=overwrite)
    if SUBMIT_ARCHIVE_DOT_ORG:
        archive_dot_org(out_dir, link, overwrite=overwrite)
    if FETCH_FAVICON:
        fetch_favicon(out_dir, link, overwrite=overwrite)


def create_archive(export_file, service=None, resume=None):
    """update or create index.html and download archive of all links"""
    with open(export_file, 'r', encoding='utf-8') as f:
        print('[+] [{}] Starting archive from {} export file.'.format(
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            export_file
        ))

        # if specific service was passed via command line
        if service == "pocket":
            links = parse_pocket_export(f)
        elif service == "pinboard":
            links = parse_json_export(f)
        elif service == "bookmarks":
            links = parse_bookmarks_export(f)
        else:
            # otherwise try all parsers until one works
            try:
                links = list(parse_json_export(f))
                service = 'pinboard'
            except Exception:
                links = list(parse_pocket_export(f))
                if links:
                    service = 'pocket'
                else:
                    links = list(parse_bookmarks_export(f))
                    service = 'bookmarks'

        links = valid_links(links)       # remove chrome://, about:, mailto: etc.
        links = uniquefied_links(links)  # fix duplicate timestamps, returns sorted list

    if resume:
        try:
            links = [
                link
                for link in links
                if float(link['timestamp']) >= float(resume)
            ]
        except TypeError:
            print('Resume value and all timestamp values must be valid numbers.')

    if not links:
        print('[X] No links found in {}, is it a {} export file?'.format(export_file, service))
        raise SystemExit(1)

    if not os.path.exists(service):
        os.makedirs(service)
    if not os.path.exists(''.join((service, '/archive'))):
        os.makedirs(''.join((service, '/archive')))

    dump_index(links, service)

    run(['chmod', '-R', ARCHIVE_PERMISSIONS, service], timeout=30)

    print('[*] [{}] Created archive index with {} links.'.format(
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        len(links),
    ))

    check_dependencies()

    for link in links:
        dump_website(link, service)

    print('[√] [{}] Archive update complete.'.format(datetime.now()))


if __name__ == '__main__':
    argc = len(sys.argv)

    if argc < 2 or sys.argv[1] in ('-h', '--help', 'help'):
        print(__DESCRIPTION__)
        print("Documentation: {}".format(__DOCUMENTATION__))
        print("")
        print("Usage:")
        print("./archive.py ~/Downloads/bookmarks_export.html [pocket|pinboard|bookmarks] [resume_timestamp]")
        print("")
        raise SystemExit(0)

    export_file = sys.argv[1]                        # path to export file
    export_type = sys.argv[2] if argc > 2 else None  # which parser to use for the export file format
    resume_from = sys.argv[3] if argc > 3 else None  # timestamp to resume downloading from

    create_archive(export_file, service=export_type, resume=resume_from)