diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py index 38696a4b..88b705ae 100644 --- a/archivebox/parsers/__init__.py +++ b/archivebox/parsers/__init__.py @@ -31,39 +31,41 @@ from ..util import ( from ..index.schema import Link from ..logging_util import TimedProgress, log_source_saved -from .pocket_html import parse_pocket_html_export -from .pocket_api import parse_pocket_api_export -from .pinboard_rss import parse_pinboard_rss_export -from .wallabag_atom import parse_wallabag_atom_export -from .shaarli_rss import parse_shaarli_rss_export -from .medium_rss import parse_medium_rss_export -from .netscape_html import parse_netscape_html_export -from .generic_rss import parse_generic_rss_export -from .generic_json import parse_generic_json_export -from .generic_html import parse_generic_html_export -from .generic_txt import parse_generic_txt_export -from .url_list import parse_url_list +from . import pocket_api +from . import wallabag_atom +from . import pocket_html +from . import pinboard_rss +from . import shaarli_rss +from . import medium_rss + +from . import netscape_html +from . import generic_rss +from . import generic_json +from . import generic_html +from . import generic_txt +from . import url_list + PARSERS = { # Specialized parsers - 'pocket-api': ('Pocket API', parse_pocket_api_export), - 'wallabag': ('Wallabag ATOM', parse_wallabag_atom_export), - 'pocket-html': ('Pocket HTML', parse_pocket_html_export), - 'pinboard-rss': ('Pinboard RSS', parse_pinboard_rss_export), - 'shaarli-rss': ('Shaarli RSS', parse_shaarli_rss_export), - 'medium-rss': ('Medium RSS', parse_medium_rss_export), - - # General parsers - 'netscape-html': ('Netscape HTML', parse_netscape_html_export), - 'rss': ('Generic RSS', parse_generic_rss_export), - 'json': ('Generic JSON', parse_generic_json_export), - 'html': ('Generic HTML', parse_generic_html_export), + pocket_api.KEY: (pocket_api.NAME, pocket_api.PARSER), + wallabag_atom.KEY: (wallabag_atom.NAME, wallabag_atom.PARSER), + pocket_html.KEY: (pocket_html.NAME, pocket_html.PARSER), + pinboard_rss.KEY: (pinboard_rss.NAME, pinboard_rss.PARSER), + shaarli_rss.KEY: (shaarli_rss.NAME, shaarli_rss.PARSER), + medium_rss.KEY: (medium_rss.NAME, medium_rss.PARSER), - # Fallback parser - 'plain-text': ('Plain Text', parse_generic_txt_export), + # General parsers + netscape_html.KEY: (netscape_html.NAME, netscape_html.PARSER), + generic_rss.KEY: (generic_rss.NAME, generic_rss.PARSER), + generic_json.KEY: (generic_json.NAME, generic_json.PARSER), + generic_html.KEY: (generic_html.NAME, generic_html.PARSER), + + # Catchall fallback parser + generic_txt.KEY: (generic_txt.NAME, generic_txt.PARSER), # Explicitly specified parsers - 'url-list': ('URL list', parse_url_list), + url_list.KEY: (url_list.NAME, url_list.PARSER), } diff --git a/archivebox/parsers/generic_html.py b/archivebox/parsers/generic_html.py index 74b3d1fc..6950dc1d 100644 --- a/archivebox/parsers/generic_html.py +++ b/archivebox/parsers/generic_html.py @@ -51,3 +51,8 @@ def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, tags=None, sources=[html_file.name], ) + + +KEY = 'html' +NAME = 'Generic HTML' +PARSER = parse_generic_html_export diff --git a/archivebox/parsers/generic_json.py b/archivebox/parsers/generic_json.py index e6ed6772..fff4d712 100644 --- a/archivebox/parsers/generic_json.py +++ b/archivebox/parsers/generic_json.py @@ -63,3 +63,8 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]: tags=htmldecode(link.get('tags')) or '', sources=[json_file.name], ) + + +KEY = 'json' +NAME = 'Generic JSON' +PARSER = parse_generic_json_export diff --git a/archivebox/parsers/generic_rss.py b/archivebox/parsers/generic_rss.py index 28318444..4bd04967 100644 --- a/archivebox/parsers/generic_rss.py +++ b/archivebox/parsers/generic_rss.py @@ -47,3 +47,8 @@ def parse_generic_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: tags=None, sources=[rss_file.name], ) + + +KEY = 'rss' +NAME = 'Generic RSS' +PARSER = parse_generic_rss_export diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py index ee6ec7c8..a7ed8d54 100644 --- a/archivebox/parsers/generic_txt.py +++ b/archivebox/parsers/generic_txt.py @@ -59,3 +59,7 @@ def parse_generic_txt_export(text_file: IO[str], **_kwargs) -> Iterable[Link]: tags=None, sources=[text_file.name], ) + +KEY = 'txt' +NAME = 'Generic TXT' +PARSER = parse_generic_txt_export diff --git a/archivebox/parsers/medium_rss.py b/archivebox/parsers/medium_rss.py index 8f14f773..a4159f28 100644 --- a/archivebox/parsers/medium_rss.py +++ b/archivebox/parsers/medium_rss.py @@ -33,3 +33,8 @@ def parse_medium_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: tags=None, sources=[rss_file.name], ) + + +KEY = 'medium_rss' +NAME = 'Medium RSS' +PARSER = parse_medium_rss_export diff --git a/archivebox/parsers/netscape_html.py b/archivebox/parsers/netscape_html.py index a063023c..7523f100 100644 --- a/archivebox/parsers/netscape_html.py +++ b/archivebox/parsers/netscape_html.py @@ -37,3 +37,7 @@ def parse_netscape_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]: sources=[html_file.name], ) + +KEY = 'netscape_html' +NAME = 'Netscape HTML' +PARSER = parse_netscape_html_export diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py index 98ff14a3..17d1025e 100644 --- a/archivebox/parsers/pinboard_rss.py +++ b/archivebox/parsers/pinboard_rss.py @@ -45,3 +45,8 @@ def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: tags=htmldecode(tags) or None, sources=[rss_file.name], ) + + +KEY = 'pinboard_rss' +NAME = 'Pinboard RSS' +PARSER = parse_pinboard_rss_export diff --git a/archivebox/parsers/pocket_api.py b/archivebox/parsers/pocket_api.py index bf3a292b..afad70ed 100644 --- a/archivebox/parsers/pocket_api.py +++ b/archivebox/parsers/pocket_api.py @@ -111,3 +111,8 @@ def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]: yield link_from_article(article, sources=[line]) write_since(username, api.last_since) + + +KEY = 'pocket_api' +NAME = 'Pocket API' +PARSER = parse_pocket_api_export diff --git a/archivebox/parsers/pocket_html.py b/archivebox/parsers/pocket_html.py index 653f21b8..d34c8bad 100644 --- a/archivebox/parsers/pocket_html.py +++ b/archivebox/parsers/pocket_html.py @@ -36,3 +36,8 @@ def parse_pocket_html_export(html_file: IO[str], **_kwargs) -> Iterable[Link]: tags=tags or '', sources=[html_file.name], ) + + +KEY = 'pocket_html' +NAME = 'Pocket HTML' +PARSER = parse_pocket_html_export diff --git a/archivebox/parsers/shaarli_rss.py b/archivebox/parsers/shaarli_rss.py index 4a925f46..67934899 100644 --- a/archivebox/parsers/shaarli_rss.py +++ b/archivebox/parsers/shaarli_rss.py @@ -48,3 +48,8 @@ def parse_shaarli_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: tags=None, sources=[rss_file.name], ) + + +KEY = 'shaarli_rss' +NAME = 'Shaarli RSS' +PARSER = parse_shaarli_rss_export diff --git a/archivebox/parsers/url_list.py b/archivebox/parsers/url_list.py index fa91acde..a45e5225 100644 --- a/archivebox/parsers/url_list.py +++ b/archivebox/parsers/url_list.py @@ -17,7 +17,7 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]: text_file.seek(0) for line in text_file.readlines(): url = line.strip() - if len(url) == 0: + if not url: continue yield Link( @@ -27,3 +27,8 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]: tags=None, sources=[text_file.name], ) + + +KEY = 'url_list' +NAME = 'URL List' +PARSER = parse_url_list diff --git a/archivebox/parsers/wallabag_atom.py b/archivebox/parsers/wallabag_atom.py index 7acfc2fc..32740097 100644 --- a/archivebox/parsers/wallabag_atom.py +++ b/archivebox/parsers/wallabag_atom.py @@ -55,3 +55,8 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: tags=tags or '', sources=[rss_file.name], ) + + +KEY = 'wallabag_atom' +NAME = 'Wallabag Atom' +PARSER = parse_wallabag_atom_export