From cf9d1875c74cff7639c9e6d00036518956a69165 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Fri, 11 Jan 2019 04:09:39 -0500
Subject: [PATCH] add plain text link parsing

---
 archivebox/parse.py | 50 ++++++++++++++++++++++++++++++++++++---------
 archivebox/util.py  | 15 ++++++++++++++
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/archivebox/parse.py b/archivebox/parse.py
index 08f6333e..3310f554 100644
--- a/archivebox/parse.py
+++ b/archivebox/parse.py
@@ -19,6 +19,7 @@ Parsed link schema: {
 
 import re
 import json
+import urllib
 import xml.etree.ElementTree as etree
 
 from datetime import datetime
@@ -28,6 +29,8 @@ from util import (
     base_url,
     str_between,
     get_link_type,
+    fetch_page_title,
+    URL_REGEX,
 )
 
 
@@ -41,6 +44,7 @@ def get_parsers(file):
         'rss': parse_rss_export,
         'pinboard_rss': parse_pinboard_rss_feed,
         'medium_rss': parse_medium_rss_feed,
+        'plain_text': parse_plain_text,
     }
 
 def parse_links(path):
@@ -109,14 +113,14 @@ def parse_json_export(json_file):
         if erg.get('description'):
             title = (erg.get('description') or '').replace(' — Readability', '')
         else:
-            title = erg['title']
+            title = erg['title'].strip()
         info = {
             'url': url,
             'domain': domain(url),
             'base_url': base_url(url),
             'timestamp': timestamp,
             'tags': erg.get('tags') or '',
-            'title': title,
+            'title': title or fetch_page_title(url),
             'sources': [json_file.name],
         }
         info['type'] = get_link_type(info)
@@ -144,7 +148,7 @@ def parse_rss_export(rss_file):
         def get_row(key):
             return [r for r in rows if r.startswith('<{}>'.format(key))][0]
 
-        title = str_between(get_row('title'), '<title>', '</title>')
+        title = str_between(get_row('title'), '<title>', '</title>').strip()
         url = str_between(get_row('link'), '<link>', '</link>')
         ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
@@ -155,7 +159,7 @@ def parse_rss_export(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': '',
-            'title': title,
+            'title': title or fetch_page_title(url),
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -182,7 +186,7 @@ def parse_bookmarks_export(html_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': "",
-            'title': match.group(3),
+            'title': match.group(3).strip() or fetch_page_title(url),
             'sources': [html_file.name],
         }
         info['type'] = get_link_type(info)
@@ -198,7 +202,7 @@ def parse_pinboard_rss_feed(rss_file):
     for item in items:
         url = item.find("{http://purl.org/rss/1.0/}link").text
         tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text
-        title = item.find("{http://purl.org/rss/1.0/}title").text
+        title = item.find("{http://purl.org/rss/1.0/}title").text.strip()
         ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text
         # = 🌈🌈🌈🌈
         # = 🌈🌈🌈🌈
@@ -215,7 +219,7 @@ def parse_pinboard_rss_feed(rss_file):
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
             'tags': tags,
-            'title': title,
+            'title': title or fetch_page_title(url),
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
@@ -231,7 +235,7 @@ def parse_medium_rss_feed(rss_file):
         # for child in item:
         #     print(child.tag, child.text)
         url = item.find("link").text
-        title = item.find("title").text
+        title = item.find("title").text.strip()
         ts_str = item.find("pubDate").text
         time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")
@@ -239,9 +243,35 @@ def parse_medium_rss_feed(rss_file):
         info = {
             'url': url,
             'domain': domain(url),
             'base_url': base_url(url),
             'timestamp': str(time.timestamp()),
-            'tags': "",
-            'title': title,
+            'tags': '',
+            'title': title or fetch_page_title(url),
             'sources': [rss_file.name],
         }
         info['type'] = get_link_type(info)
         yield info
+
+
+def parse_plain_text(text_file):
+    """Parse raw links from each line in a text file"""
+
+    text_file.seek(0)
+    text_content = text_file.readlines()
+    for line in text_content:
+        if line:
+            urls = re.findall(URL_REGEX, line)
+
+            for url in urls:
+                timestamp = str(datetime.now().timestamp())
+
+                info = {
+                    'url': url,
+                    'domain': domain(url),
+                    'base_url': base_url(url),
+                    'timestamp': timestamp,
+                    'tags': '',
+                    'title': fetch_page_title(url),
+                    'sources': [text_file.name],
+                }
+                info['type'] = get_link_type(info)
+                yield info
+
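
The new parser treats each line of the input as free-form text and pulls out
anything matching URL_REGEX, so files mixing notes and links still work; each
match is stamped with datetime.now() at parse time, since plain text carries
no bookmark dates. A minimal standalone sketch of just the extraction step
(assumptions: stdlib only, URL_REGEX copied from the util.py hunk below, and
the domain()/get_link_type()/fetch_page_title() helpers omitted):

    import re

    # Same pattern the patch adds to util.py. Note that '$-_' inside the
    # class is a character *range* (0x24-0x5F), so punctuation like '.'
    # or ',' trailing directly after a URL gets captured too.
    URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    notes = (
        'some saved notes\n'
        'see https://example.com/page?id=3 and http://foo.test/bar\n'
        'a line with no links\n'
    )

    for line in notes.splitlines():
        for url in re.findall(URL_REGEX, line):
            print(url)

    # prints:
    #   https://example.com/page?id=3
    #   http://foo.test/bar
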
diff --git a/archivebox/util.py b/archivebox/util.py
index 432b5d3e..ac2f7c11 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -42,6 +42,8 @@ base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links
 
 short_ts = lambda ts: ts.split('.')[0]
 
+URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+
 
 def check_dependencies():
     """Check that all necessary dependencies are installed, and have valid versions"""
@@ -208,6 +210,19 @@ def download_url(url):
 
     return source_path
 
+
+def fetch_page_title(url, default=None):
+    """Attempt to guess a page's title by downloading the html"""
+
+    try:
+        html_content = urllib.request.urlopen(url).read().decode('utf-8')
+
+        match = re.search('<title>(.*?)</title>', html_content)
+        return match.group(1) if match else default
+    except Exception:
+        return default
+
+
 def str_between(string, start, end=None):
     """(<abc>12345</def>, <abc>, </def>) -> 12345"""
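
Every parser above now falls back to fetch_page_title(url) when an export
carries an empty or missing title, and the plain-text parser relies on it for
every link. One caveat: the helper lives in util.py and calls
urllib.request.urlopen, but this patch only adds a bare "import urllib" to
parse.py, and on Python 3 importing the urllib package does not pull in the
urllib.request submodule, so a standalone version needs the explicit import.
A minimal sketch under that assumption (the timeout is an addition here, not
part of the patch, which uses the default blocking call):

    import re
    import urllib.request

    def fetch_page_title(url, default=None):
        """Attempt to guess a page's title by downloading the html"""
        try:
            # timeout= is not in the patch; added so one dead host
            # can't hang the whole parse
            html = urllib.request.urlopen(url, timeout=10).read().decode('utf-8')
            match = re.search('<title>(.*?)</title>', html)
            return match.group(1) if match else default
        except Exception:
            return default

    print(fetch_page_title('https://example.com', default='(untitled)'))
    # -> 'Example Domain' (when the host is reachable)
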