import re
import json
from datetime import datetime

from util import (
    domain,
    base_url,
    get_str_between,
    get_link_type,
)

def parse_export(path):
    """parse a list of link dictionaries from a bookmark export file"""

    links = []
    with open(path, 'r', encoding='utf-8') as file:
        # try each parser in turn until one of them produces links
        for service, parser_func in get_parsers().items():
            try:
                links += list(parser_func(file))
                if links:
                    break
            except Exception:
                pass

    return links
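
# A minimal usage sketch (the export filename here is hypothetical):
#
#   links = parse_export('bookmarks_export.html')
#   for link in links:
#       print(link['timestamp'], link['url'])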

def get_parsers():
    return {
        'pocket': parse_pocket_export,
        'pinboard': parse_json_export,
        'bookmarks': parse_bookmarks_export,
        'rss': parse_rss_export,
    }

def parse_pocket_export(html_file):
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""

    html_file.seek(0)
    # see sample input in ./example_ril_export.html
    pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
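    # Illustrative line in the format the pattern above expects (constructed for
    # this comment, not taken from a real export):
    #   <li><a href="https://example.com/article" time_added="1478739709" tags="tag1,tag2">Example Title</a></li>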
    for line in html_file:
        match = pattern.search(line)
        if match:
            # remove old readability prefixes to get the original url
            fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '')
            time = datetime.fromtimestamp(float(match.group(2)))
            info = {
                'url': fixed_url,
                'domain': domain(fixed_url),
                'base_url': base_url(fixed_url),
                'timestamp': str(time.timestamp()),
                'tags': match.group(3),
                'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or base_url(fixed_url),
                'sources': [html_file.name],
            }
            info['type'] = get_link_type(info)
            yield info

def parse_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/)"""

    json_file.seek(0)
    json_content = json.load(json_file)
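    # Each entry is expected to carry at least the fields read below, e.g.
    # (values invented for illustration):
    #   {"href": "https://example.com", "description": "Example Title",
    #    "time": "2017-06-14T01:02:03Z", "tags": "tag1 tag2"}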
    for line in json_content:
        if line:
            erg = line
            time = datetime.strptime(erg['time'].split(',', 1)[0], '%Y-%m-%dT%H:%M:%SZ')
            info = {
                'url': erg['href'],
                'domain': domain(erg['href']),
                'base_url': base_url(erg['href']),
                'timestamp': str(time.timestamp()),
                'tags': erg['tags'],
                'title': erg['description'].replace(' — Readability', ''),
                'sources': [json_file.name],
            }
            info['type'] = get_link_type(info)
            yield info

def parse_rss_export(rss_file):
    """Parse RSS XML-format files into links"""

    rss_file.seek(0)
    items = rss_file.read().split('</item>\n<item>')
    for item in items:
        # example item:
        # <item>
        # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
        # <category>Unread</category>
        # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
        # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
        # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
        # </item>

        trailing_removed = item.split('</item>', 1)[0]
        leading_removed = trailing_removed.split('<item>', 1)[-1]
        rows = leading_removed.split('\n')

        row = lambda key: [r for r in rows if r.startswith('<{}>'.format(key))][0]

        title = get_str_between(row('title'), '<![CDATA[', ']]')
        url = get_str_between(row('link'), '<link>', '</link>')
        ts_str = get_str_between(row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")

        info = {
            'url': url,
            'domain': domain(url),
            'base_url': base_url(url),
            'timestamp': str(time.timestamp()),
            'tags': '',
            'title': title,
            'sources': [rss_file.name],
        }
        info['type'] = get_link_type(info)
        yield info

def parse_bookmarks_export(html_file):
    """Parse netscape-format bookmarks export files (produced by all browsers)"""

    html_file.seek(0)
    pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
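    # Illustrative line in the netscape bookmark format that the (case-insensitive)
    # pattern above matches, constructed for this comment:
    #   <DT><A HREF="https://example.com" ADD_DATE="1478739709" PRIVATE="0">Example Title</A>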
    for line in html_file:
        match = pattern.search(line)
        if match:
            url = match.group(1)
            time = datetime.fromtimestamp(float(match.group(2)))
            info = {
                'url': url,
                'domain': domain(url),
                'base_url': base_url(url),
                'timestamp': str(time.timestamp()),
                'tags': "",
                'title': match.group(3),
                'sources': [html_file.name],
            }
            info['type'] = get_link_type(info)
            yield info