
new generic_html parser for extracting hrefs

This commit is contained in:
Nick Sweeting 2020-08-18 08:29:05 -04:00
parent a682a9c478
commit 15efb2d5ed
5 changed files with 106 additions and 39 deletions

View file

@@ -70,6 +70,7 @@ archivebox/index/json.py
 archivebox/index/schema.py
 archivebox/index/sql.py
 archivebox/parsers/__init__.py
+archivebox/parsers/generic_html.py
 archivebox/parsers/generic_json.py
 archivebox/parsers/generic_rss.py
 archivebox/parsers/generic_txt.py

View file

@@ -301,14 +301,14 @@ def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
 
 @enforce_types
-def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]:
+def parse_links_from_source(source_path: str, root_url: Optional[str]=None) -> Tuple[List[Link], List[Link]]:
 
     from ..parsers import parse_links
 
     new_links: List[Link] = []
 
     # parse and validate the import file
-    raw_links, parser_name = parse_links(source_path)
+    raw_links, parser_name = parse_links(source_path, root_url=root_url)
     new_links = validate_links(raw_links)
 
     if parser_name:
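
The new root_url keyword is passed straight through to parse_links() so downstream parsers can resolve relative hrefs against the page they were found on. A minimal usage sketch, mirroring the call pattern in main.py below; the import path and the source file path are assumptions for illustration, not part of this commit:

from archivebox.index import parse_links_from_source   # import path assumed for this sketch

new_links = []
new_links += parse_links_from_source(
    'sources/1597753745-crawl-example.com.txt',   # made-up source file path
    root_url='https://example.com/blog/',         # base URL parsers use to resolve relative hrefs
)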

View file

@@ -548,7 +548,7 @@ def add(urls: Union[str, List[str]],
 
     # save verbatim args to sources
     write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
-    new_links += parse_links_from_source(write_ahead_log)
+    new_links += parse_links_from_source(write_ahead_log, root_url=None)
 
     # If we're going one level deeper, download each link and look for more links
     new_links_depth = []
@@ -556,9 +556,9 @@ def add(urls: Union[str, List[str]],
         log_crawl_started(new_links)
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file)
+            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
 
-    imported_links = new_links + new_links_depth
+    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
     all_links, new_links = dedupe_links(all_links, imported_links)
     write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
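
The imported_links change above now de-duplicates crawled results by URL (keeping the most recently parsed Link per URL) before handing them to dedupe_links(). A self-contained sketch of that dict-comprehension idiom, using a made-up SimpleLink stand-in rather than the real Link schema:

from dataclasses import dataclass

@dataclass
class SimpleLink:      # made-up stand-in for archivebox.index.schema.Link
    url: str
    title: str

new_links = [SimpleLink('https://example.com/a', 'from import file')]
new_links_depth = [
    SimpleLink('https://example.com/b', 'found while crawling'),
    SimpleLink('https://example.com/a', 'crawled duplicate'),
]

# same idiom as imported_links above: later entries overwrite earlier ones per URL,
# so each URL survives exactly once and insertion order is preserved
imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
print([link.title for link in imported_links])
# ['crawled duplicate', 'found while crawling']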

View file

@@ -11,7 +11,7 @@ import re
 import os
 
 from io import StringIO
-from typing import IO, Tuple, List
+from typing import IO, Tuple, List, Optional
 from datetime import datetime
 
 from ..system import atomic_write
@@ -38,26 +38,29 @@ from .medium_rss import parse_medium_rss_export
 from .netscape_html import parse_netscape_html_export
 from .generic_rss import parse_generic_rss_export
 from .generic_json import parse_generic_json_export
+from .generic_html import parse_generic_html_export
 from .generic_txt import parse_generic_txt_export
 
 PARSERS = (
     # Specialized parsers
     ('Pocket HTML', parse_pocket_html_export),
     ('Pinboard RSS', parse_pinboard_rss_export),
     ('Shaarli RSS', parse_shaarli_rss_export),
     ('Medium RSS', parse_medium_rss_export),
 
     # General parsers
     ('Netscape HTML', parse_netscape_html_export),
     ('Generic RSS', parse_generic_rss_export),
     ('Generic JSON', parse_generic_json_export),
+    ('Generic HTML', parse_generic_html_export),
 
     # Fallback parser
     ('Plain Text', parse_generic_txt_export),
 )
 
 
 @enforce_types
-def parse_links_memory(urls: List[str]):
+def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
     """
     parse a list of URLS without touching the filesystem
     """
@@ -68,17 +71,16 @@ def parse_links_memory(urls: List[str]):
     file = StringIO()
     file.writelines(urls)
     file.name = "io_string"
-    output = _parse(file, timer)
-    if output is not None:
-        return output
+    links, parser = run_parser_functions(file, timer, root_url=root_url)
     timer.end()
-    return [], 'Failed to parse'
+
+    if parser is None:
+        return [], 'Failed to parse'
+    return links, parser
 
 
 @enforce_types
-def parse_links(source_file: str) -> Tuple[List[Link], str]:
+def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
     """parse a list of URLs with their metadata from an
        RSS feed, bookmarks export, or text file
     """
@@ -87,28 +89,39 @@ def parse_links(source_file: str) -> Tuple[List[Link], str]:
 
     timer = TimedProgress(TIMEOUT * 4)
     with open(source_file, 'r', encoding='utf-8') as file:
-        output = _parse(file, timer)
-        if output is not None:
-            return output
+        links, parser = run_parser_functions(file, timer, root_url=root_url)
     timer.end()
-    return [], 'Failed to parse'
+
+    if parser is None:
+        return [], 'Failed to parse'
+    return links, parser
+
+
+def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None) -> Tuple[List[Link], Optional[str]]:
+    most_links: List[Link] = []
+    best_parser_name = None
 
-
-def _parse(to_parse: IO[str], timer) -> Tuple[List[Link], str]:
     for parser_name, parser_func in PARSERS:
         try:
-            links = list(parser_func(to_parse))
-            if links:
-                timer.end()
-                return links, parser_name
+            parsed_links = list(parser_func(to_parse, root_url=root_url))
+            if not parsed_links:
+                raise Exception('no links found')
+
+            # print(f'[√] Parser {parser_name} succeeded: {len(parsed_links)} links parsed')
+            if len(parsed_links) > len(most_links):
+                most_links = parsed_links
+                best_parser_name = parser_name
+
         except Exception as err:   # noqa
             # Parsers are tried one by one down the list, and the first one
             # that succeeds is used. To see why a certain parser was not used
             # due to error or format incompatibility, uncomment this line:
             # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
             # raise
             pass
 
+    timer.end()
+    return most_links, best_parser_name
+
 
 @enforce_types
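
The refactor above replaces the old first-match strategy of _parse() with run_parser_functions(), which runs every registered parser and keeps whichever produced the most links. A simplified, stdlib-only sketch of that selection loop; the toy parsers and the pick_best_parser name are invented for illustration and are not ArchiveBox APIs:

import json
import re
from typing import Callable, List, Optional, Tuple

def find_urls_by_regex(text: str) -> List[str]:
    # toy stand-in for a permissive parser like the plain-text fallback
    return re.findall(r'https?://\S+', text)

def find_urls_in_json(text: str) -> List[str]:
    # toy stand-in for a strict parser that raises on non-JSON input
    return [item['url'] for item in json.loads(text)]

TOY_PARSERS: List[Tuple[str, Callable[[str], List[str]]]] = [
    ('JSON', find_urls_in_json),
    ('Regex', find_urls_by_regex),
]

def pick_best_parser(text: str) -> Tuple[List[str], Optional[str]]:
    most_links: List[str] = []
    best_parser_name: Optional[str] = None
    for name, parser in TOY_PARSERS:
        try:
            links = parser(text)
            if not links:
                raise Exception('no links found')
            if len(links) > len(most_links):
                most_links, best_parser_name = links, name
        except Exception:
            pass    # a failing parser simply drops out of the race
    return most_links, best_parser_name

print(pick_best_parser('see https://example.com and https://example.org'))
# (['https://example.com', 'https://example.org'], 'Regex')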

View file

@@ -0,0 +1,53 @@
+__package__ = 'archivebox.parsers'
+
+import re
+
+from typing import IO, Iterable, Optional
+from datetime import datetime
+
+from ..index.schema import Link
+from ..util import (
+    htmldecode,
+    enforce_types,
+    URL_REGEX,
+)
+from html.parser import HTMLParser
+from urllib.parse import urljoin
+
+
+class HrefParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+        self.urls = []
+
+    def handle_starttag(self, tag, attrs):
+        if tag == "a":
+            for attr, value in attrs:
+                if attr == "href":
+                    self.urls.append(value)
+
+
+@enforce_types
+def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
+    """Parse generic HTML files and extract the URLs found in href attributes"""
+
+    html_file.seek(0)
+    for line in html_file:
+        parser = HrefParser()
+        # example line
+        # <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
+        parser.feed(line)
+        for url in parser.urls:
+            if root_url:
+                # resolve relative urls /home.html -> https://example.com/home.html
+                url = urljoin(root_url, url)
+
+            for archivable_url in re.findall(URL_REGEX, url):
+                yield Link(
+                    url=htmldecode(archivable_url),
+                    timestamp=str(datetime.now().timestamp()),
+                    title=None,
+                    tags=None,
+                    sources=[html_file.name],
+                )
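
To show what HrefParser plus the urljoin() call do with root_url, here is a small stdlib-only sketch, independent of ArchiveBox and using an invented two-line sample document:

from html.parser import HTMLParser
from urllib.parse import urljoin
from io import StringIO

class HrefParser(HTMLParser):            # same idea as the class added above
    def __init__(self):
        super().__init__()
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for attr, value in attrs:
                if attr == "href":
                    self.urls.append(value)

sample = StringIO(
    '<li><a href="https://example.com/post">example title</a></li>\n'
    '<li><a href="/home.html">relative link</a></li>\n'
)
root_url = 'https://example.com/blog/'   # pretend this is the crawled page's URL

for line in sample:
    parser = HrefParser()
    parser.feed(line)
    for url in parser.urls:
        print(urljoin(root_url, url))

# https://example.com/post
# https://example.com/home.html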