#!/usr/bin/env python3
# wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
# sudo sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
# apt update; apt install google-chrome-beta
import re
import os
import sys
import json
import time
from datetime import datetime
from subprocess import run, DEVNULL

INDEX_TEMPLATE = 'index_template.html'

FETCH_WGET = True
FETCH_PDF = True
FETCH_SCREENSHOT = True
FETCH_FAVICON = True

RESOLUTION = '1440,900'

def check_dependencies():
    """verify that the required external programs are installed"""
    for dependency in ('google-chrome', 'wget'):
        if run(['which', dependency]).returncode:
            print('[X] Missing dependency: {}'.format(dependency))
            print('    See https://github.com/pirate/pocket-archive-stream for help.')
            raise SystemExit(1)

def parse_pocket_export(html):
    """parse a getpocket.com/export HTML file into link dicts"""
    # see sample input in ./example_ril_export.html
    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
    for line in html:
        match = pattern.search(line)
        if match:
            yield {
                'url': match.group(1).replace('http://www.readability.com/read?url=', ''),
                'domain': match.group(1).replace('http://', '').replace('https://', '').split('/')[0],
                'base_url': match.group(1).replace('https://', '').replace('http://', '').split('?')[0],
                'time': datetime.fromtimestamp(int(match.group(2))),
                'timestamp': match.group(2),
                'tags': match.group(3),
                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', ''),
            }
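# A matching line in the Pocket export looks roughly like this (hypothetical values):
# <li><a href="https://example.com/article" time_added="1493945000" tags="python,archiving">Example Article</a></li>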

def parse_pinboard_export(html):
    """parse a pinboard.in JSON export file into link dicts"""
    json_content = json.load(html)
    for line in json_content:
        if line:
            erg = line
            yield {
                'url': erg['href'].replace('http://www.readability.com/read?url=', ''),
                'domain': erg['href'].replace('http://', '').replace('https://', '').split('/')[0],
                'base_url': erg['href'].replace('https://', '').replace('http://', '').split('?')[0],
                'time': datetime.fromtimestamp(time.mktime(time.strptime(erg['time'].split(',')[0], '%Y-%m-%dT%H:%M:%SZ'))),
                'timestamp': str(time.mktime(time.strptime(erg['time'].split(',')[0], '%Y-%m-%dT%H:%M:%SZ'))),
                'tags': erg['tags'],
                'title': erg['description'].replace(' — Readability', '').replace('http://www.readability.com/read?url=', ''),
            }
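# Each entry in a pinboard.in/export/format:json/ dump is an object along
# these lines (hypothetical values):
#   {"href": "https://example.com/", "description": "Example", "time": "2017-05-05T12:00:00Z", "tags": "python archiving"}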

def dump_index(links, service):
    """write the service's index.html from the link list and the page template"""
    with open(INDEX_TEMPLATE, 'r') as f:
        index_html = f.read()

    link_html = """\
    <tr>
        <td>{time}</td>
        <td><a href="archive/{timestamp}/{base_url}" style="font-size:1.4em;text-decoration:none;color:black;" title="{title}">
            <img src="archive/{timestamp}/favicon.ico">
            {title}
        </td>
        <td style="text-align:center"><a href="archive/{timestamp}/" title="Files">📂</a></td>
        <td style="text-align:center"><a href="archive/{timestamp}/output.pdf" title="PDF">📄</a></td>
        <td style="text-align:center"><a href="archive/{timestamp}/screenshot.png" title="Screenshot">🖼</a></td>
        <td>🔗 <img src="https://www.google.com/s2/favicons?domain={domain}" height="16px"> <a href="{url}">{url}</a></td>
    </tr>"""

    with open('{}/index.html'.format(service), 'w') as f:
        article_rows = '\n'.join(
            link_html.format(**link) for link in links
        )
        f.write(index_html.format(datetime.now().strftime('%Y-%m-%d %H:%M'), article_rows))
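# Note: index_template.html is assumed to contain two positional '{}' slots,
# filled above with the generation date and the concatenated article rows.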

def fetch_wget(out_dir, link, overwrite=False):
    # download the full site (wget saves it into a folder named after the domain)
    if not os.path.exists('{}/{}'.format(out_dir, link['domain'])) or overwrite:
        print('    - Downloading Full Site')
        CMD = [
            *'wget --no-clobber --page-requisites --adjust-extension --convert-links --no-parent'.split(' '),
            link['url'],
        ]
        try:
            run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=20)
        except Exception as e:
            print('      Exception: {}'.format(e.__class__.__name__))
    else:
        print('    √ Skipping site download')
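# wget flags used above: --page-requisites pulls the CSS/JS/images a page needs,
# --adjust-extension saves pages with an .html extension, --convert-links rewrites
# links for offline browsing, and --no-parent keeps the crawl below the start URL.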

def fetch_pdf(out_dir, link, overwrite=False):
    # download PDF
    if (not os.path.exists('{}/output.pdf'.format(out_dir)) or overwrite) and not link['base_url'].endswith('.pdf'):
        print('    - Printing PDF')
        CMD = 'google-chrome --headless --disable-gpu --print-to-pdf'.split(' ')
        try:
            run([*CMD, link['url']], stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=20)  # output.pdf
        except Exception as e:
            print('      Exception: {}'.format(e.__class__.__name__))
    else:
        print('    √ Skipping PDF print')

def fetch_screenshot(out_dir, link, overwrite=False):
    # take screenshot
    if (not os.path.exists('{}/screenshot.png'.format(out_dir)) or overwrite) and not link['base_url'].endswith('.pdf'):
        print('    - Snapping Screenshot')
        CMD = 'google-chrome --headless --disable-gpu --screenshot'.split(' ')
        try:
            run([*CMD, '--window-size={}'.format(RESOLUTION), link['url']], stdout=DEVNULL, stderr=DEVNULL, cwd=out_dir, timeout=20)  # screenshot.png
        except Exception as e:
            print('      Exception: {}'.format(e.__class__.__name__))
    else:
        print('    √ Skipping screenshot')
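# Headless Chrome writes output.pdf and screenshot.png into its current working
# directory, which is why run() is given cwd=out_dir in the two functions above.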

def fetch_favicon(out_dir, link, overwrite=False):
    # download favicon
    if not os.path.exists('{}/favicon.ico'.format(out_dir)) or overwrite:
        print('    - Fetching Favicon')
        CMD = 'curl https://www.google.com/s2/favicons?domain={domain}'.format(**link).split(' ')
        with open('{}/favicon.ico'.format(out_dir), 'w') as fout:
            try:
                run(CMD, stdout=fout, stderr=DEVNULL, cwd=out_dir, timeout=20)  # favicon.ico
            except Exception as e:
                print('      Exception: {}'.format(e.__class__.__name__))
    else:
        print('    √ Skipping favicon')

def dump_website(link, service, overwrite=False):
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
    print('[+] [{time}] Archiving "{title}": {url}'.format(**link))

    out_dir = '{}/archive/{}'.format(service, link['timestamp'])
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if link['base_url'].endswith('.pdf'):
        print('    i PDF File')
    elif 'youtube.com' in link['domain']:
        print('    i Youtube Video')
    elif 'wikipedia.org' in link['domain']:
        print('    i Wikipedia Article')

    if FETCH_WGET:
        fetch_wget(out_dir, link, overwrite=overwrite)
    if FETCH_PDF:
        fetch_pdf(out_dir, link, overwrite=overwrite)
    if FETCH_SCREENSHOT:
        fetch_screenshot(out_dir, link, overwrite=overwrite)
    if FETCH_FAVICON:
        fetch_favicon(out_dir, link, overwrite=overwrite)

    run(['chmod', '-R', '755', out_dir], timeout=1)

def create_archive(service_file, service, resume=None):
    """create the full HTML archive folder for the given export file"""
    print('[+] [{}] Starting {} archive from {}'.format(datetime.now(), service, service_file))

    if not os.path.exists(service):
        os.makedirs(service)

    if not os.path.exists('{}/archive'.format(service)):
        os.makedirs('{}/archive'.format(service))

    with open(service_file, 'r', encoding='utf-8') as f:
        if service == "pocket":
            links = parse_pocket_export(f)
        elif service == "pinboard":
            links = parse_pinboard_export(f)
        else:
            print('[X] Unknown service: {}'.format(service))
            raise SystemExit(1)

        links = list(reversed(sorted(links, key=lambda l: l['timestamp'])))  # most recent first
        if resume:
            links = [link for link in links if link['timestamp'] >= resume]

    if not links:
        if service == "pocket":
            print('[X] No links found in {}, is it a getpocket.com/export export?'.format(service_file))
        elif service == "pinboard":
            print('[X] No links found in {}, is it a pinboard.in/export/format:json/ export?'.format(service_file))
        raise SystemExit(1)

    dump_index(links, service)

    run(['chmod', '-R', '755', service], timeout=1)
    print('[*] [{}] Created archive index.'.format(datetime.now()))

    check_dependencies()

    for link in links:
        dump_website(link, service)

    print('[√] [{}] Archive complete.'.format(datetime.now()))

if __name__ == '__main__':
    service_file = 'ril_export.html'
    service = 'pocket'
    resume = None
    try:
        service_file = sys.argv[1]         # path to export file
        service = sys.argv[2] or 'pocket'  # which service the export came from (pocket or pinboard)
        resume = sys.argv[3]               # timestamp to resume downloading from
    except IndexError:
        pass

    create_archive(service_file, service, resume=resume)
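
# Example invocations (script name and resume timestamp are illustrative):
#   ./archive.py ril_export.html pocket
#   ./archive.py pinboard_export.json pinboard 1493945000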