From 7e2b249388e4a530dcbcd374de469033eeb36c18 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 7 Aug 2020 08:05:17 -0500 Subject: [PATCH 01/14] feat: Initial version of readability extractor --- archivebox/config/__init__.py | 14 +++++ archivebox/extractors/__init__.py | 2 + archivebox/extractors/readability.py | 83 ++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100644 archivebox/extractors/readability.py diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index c53c5eec..b51c7034 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -76,6 +76,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)}, 'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)}, 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)}, + 'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)}, 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)}, 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, @@ -107,6 +108,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'USE_CURL': {'type': bool, 'default': True}, 'USE_WGET': {'type': bool, 'default': True}, 'USE_SINGLEFILE': {'type': bool, 'default': True}, + 'USE_READABILITY': {'type': bool, 'default': True}, 'USE_GIT': {'type': bool, 'default': True}, 'USE_CHROME': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, @@ -115,6 +117,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, 'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'}, + 'READABILITY_BINARY': {'type': str, 'default': 'readability-extractor'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'CHROME_BINARY': {'type': str, 'default': None}, }, @@ -256,6 +259,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, + 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']}, + 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, + 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, @@ -272,6 +278,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']}, 'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']}, 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['USE_SINGLEFILE']}, + 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY']}, 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, @@ -689,6 +696,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_SINGLEFILE'], 'is_valid': bool(config['SINGLEFILE_VERSION']), }, + 'READABILITY_BINARY': { + 'path': bin_path(config['READABILITY_BINARY']), + 'version': config['READABILITY_VERSION'], + 'hash': bin_hash(config['READABILITY_BINARY']), + 'enabled': config['USE_READABILITY'], + 'is_valid': bool(config['READABILITY_VERSION']), + }, 'GIT_BINARY': { 'path': bin_path(config['GIT_BINARY']), 'version': config['GIT_VERSION'], diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index bdeae3d7..6cd3c551 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -26,6 +26,7 @@ from .title import should_save_title, save_title from .favicon import should_save_favicon, save_favicon from .wget import should_save_wget, save_wget from .singlefile import should_save_singlefile, save_singlefile +from .readability import should_save_readability, save_readability from .pdf import should_save_pdf, save_pdf from .screenshot import should_save_screenshot, save_screenshot from .dom import should_save_dom, save_dom @@ -39,6 +40,7 @@ def get_default_archive_methods(): ('favicon', should_save_favicon, save_favicon), ('wget', should_save_wget, save_wget), ('singlefile', should_save_singlefile, save_singlefile), + ('readability', should_save_readability, save_readability), ('pdf', should_save_pdf, save_pdf), ('screenshot', should_save_screenshot, save_screenshot), ('dom', should_save_dom, save_dom), diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py new file mode 100644 index 00000000..2659c18e --- /dev/null +++ b/archivebox/extractors/readability.py @@ -0,0 +1,83 @@ +__package__ = 'archivebox.extractors' + +from pathlib import Path + +from typing import Optional +import json + +from ..index.schema import Link, ArchiveResult, ArchiveError +from ..system import run, atomic_write +from ..util import ( + enforce_types, + download_url, + +) +from ..config import ( + TIMEOUT, + SAVE_READABILITY, + READABILITY_BINARY, + READABILITY_VERSION, + CHROME_BINARY, +) +from ..logging_util import TimedProgress + + +@enforce_types +def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool: + out_dir = out_dir or link.link_dir + + output = Path(out_dir or link.link_dir) / 'readability.json' + return SAVE_READABILITY and (not output.exists()) + + +@enforce_types +def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: + """download reader friendly version using @mozilla/readability""" + + out_dir = out_dir or link.link_dir + output = str(Path(out_dir).absolute() / "readability.json") + + document = download_url(link.url) + + # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli + cmd = [ + READABILITY_BINARY, + document + ] + + status = 'succeeded' + timer = TimedProgress(timeout, prefix=' ') + try: + result = run(cmd, cwd=out_dir, timeout=timeout) + result_json = json.loads(result.stdout) + atomic_write(output, result_json) + + # parse out number of files downloaded from last line of stderr: + # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" + output_tail = [ + line.strip() + for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] + if line.strip() + ] + hints = ( + 'Got readability response code: {}.'.format(result.returncode), + *output_tail, + ) + + # Check for common failure cases + if (result.returncode > 0): + raise ArchiveError('Readability was not able to archive the page', hints) + except Exception as err: + status = 'failed' + output = err + finally: + timer.end() + + return ArchiveResult( + cmd=cmd, + pwd=out_dir, + cmd_version=READABILITY_VERSION, + output=output, + status=status, + **timer.stats, + ) From b33c66a9f77e973b0fa338038fcf18dc2eddb584 Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 10 Aug 2020 13:15:28 -0500 Subject: [PATCH 02/14] feat: Split output of readability into multiple files --- archivebox/extractors/readability.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 2659c18e..b0daf5e6 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.extractors' from pathlib import Path +from tempfile import NamedTemporaryFile from typing import Optional import json @@ -35,14 +36,15 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO """download reader friendly version using @mozilla/readability""" out_dir = out_dir or link.link_dir - output = str(Path(out_dir).absolute() / "readability.json") + output_folder = Path(out_dir).absolute() / "readability" document = download_url(link.url) - + temp_doc = NamedTemporaryFile() + temp_doc.write(document.encode("utf-8")) # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli cmd = [ READABILITY_BINARY, - document + temp_doc.name ] status = 'succeeded' @@ -50,7 +52,10 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO try: result = run(cmd, cwd=out_dir, timeout=timeout) result_json = json.loads(result.stdout) - atomic_write(output, result_json) + output_folder.mkdir(exist_ok=True) + atomic_write(str(output_folder / "content.html"), result_json.pop("content")) + atomic_write(str(output_folder / "content.txt"), result_json.pop("textContent")) + atomic_write(str(output_folder / "article.json"), result_json) # parse out number of files downloaded from last line of stderr: # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" @@ -72,12 +77,13 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO output = err finally: timer.end() + temp_doc.close() return ArchiveResult( cmd=cmd, pwd=out_dir, cmd_version=READABILITY_VERSION, - output=output, + output=str(output_folder), status=status, **timer.stats, ) From 61e08a7c430b634022088273ddb0a74a1f6e8b89 Mon Sep 17 00:00:00 2001 From: Cristian Date: Mon, 10 Aug 2020 13:17:55 -0500 Subject: [PATCH 03/14] docs: Update docs link --- archivebox/extractors/readability.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index b0daf5e6..0a3c13b3 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -41,7 +41,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO document = download_url(link.url) temp_doc = NamedTemporaryFile() temp_doc.write(document.encode("utf-8")) - # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli + # Readability Docs: https://github.com/mozilla/readability cmd = [ READABILITY_BINARY, temp_doc.name From a14762640e68d32dcb8aa29639fce7474c50a1d6 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 11 Aug 2020 07:40:55 -0500 Subject: [PATCH 04/14] feat: Avoid running readability when the target is a file --- archivebox/extractors/readability.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 0a3c13b3..8cac5e29 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -11,6 +11,7 @@ from ..system import run, atomic_write from ..util import ( enforce_types, download_url, + is_static_file, ) from ..config import ( @@ -26,6 +27,8 @@ from ..logging_util import TimedProgress @enforce_types def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool: out_dir = out_dir or link.link_dir + if is_static_file(link.url): + return False output = Path(out_dir or link.link_dir) / 'readability.json' return SAVE_READABILITY and (not output.exists()) From 0ec747f64e9b47fd08555d2c17b555874ace0a90 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 11 Aug 2020 08:36:03 -0500 Subject: [PATCH 05/14] feat: Look in wget, singlefile or dom outputs before attempting to download the information again --- archivebox/extractors/__init__.py | 2 +- archivebox/extractors/readability.py | 30 ++++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 6cd3c551..0882c50e 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -40,10 +40,10 @@ def get_default_archive_methods(): ('favicon', should_save_favicon, save_favicon), ('wget', should_save_wget, save_wget), ('singlefile', should_save_singlefile, save_singlefile), - ('readability', should_save_readability, save_readability), ('pdf', should_save_pdf, save_pdf), ('screenshot', should_save_screenshot, save_screenshot), ('dom', should_save_dom, save_dom), + ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them ('git', should_save_git, save_git), ('media', should_save_media, save_media), ('archive_org', should_save_archive_dot_org, save_archive_dot_org), diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 8cac5e29..c9b5b6b9 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -23,6 +23,28 @@ from ..config import ( ) from ..logging_util import TimedProgress +@enforce_types +def get_html(link: Link, path: Path) -> str: + """ + Try to find wget, singlefile and then dom files. + If none is found, download the url again. + """ + canonical = link.canonical_outputs() + abs_path = path.absolute() + sources = [canonical["wget_path"], canonical["singlefile_path"], canonical["dom_path"]] + document = None + breakpoint() + for source in sources: + try: + with open(abs_path / source, "r") as f: + document = f.read() + break + except FileNotFoundError: + continue + if document is None: + return download_url(link.url) + else: + return document @enforce_types def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool: @@ -38,10 +60,10 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool: def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: """download reader friendly version using @mozilla/readability""" - out_dir = out_dir or link.link_dir - output_folder = Path(out_dir).absolute() / "readability" + out_dir = Path(out_dir or link.link_dir) + output_folder = out_dir.absolute() / "readability" - document = download_url(link.url) + document = get_html(link, out_dir) temp_doc = NamedTemporaryFile() temp_doc.write(document.encode("utf-8")) # Readability Docs: https://github.com/mozilla/readability @@ -84,7 +106,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO return ArchiveResult( cmd=cmd, - pwd=out_dir, + pwd=str(out_dir), cmd_version=READABILITY_VERSION, output=str(output_folder), status=status, From dc87d8b68c717438e95a713040fbcdd1849f8508 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 11 Aug 2020 08:48:13 -0500 Subject: [PATCH 06/14] tests: Update failing tests --- archivebox/extractors/readability.py | 1 - tests/test_init.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index c9b5b6b9..03fa0a88 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -33,7 +33,6 @@ def get_html(link: Link, path: Path) -> str: abs_path = path.absolute() sources = [canonical["wget_path"], canonical["singlefile_path"], canonical["dom_path"]] document = None - breakpoint() for source in sources: try: with open(abs_path / source, "r") as f: diff --git a/tests/test_init.py b/tests/test_init.py index bd1ad516..f5a34538 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -19,6 +19,7 @@ def test_update(tmp_path, process): assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8") def test_add_link(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_WGET": "true"}) os.chdir(tmp_path) add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict) @@ -35,6 +36,7 @@ def test_add_link(tmp_path, process, disable_extractors_dict): assert "Example Domain" in output_html def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_WGET": "true"}) os.chdir(tmp_path) stdin_process = subprocess.Popen(["archivebox", "add"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=disable_extractors_dict) From 8aa7b34de731e48e1aba84582a5df43e2add410b Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 11 Aug 2020 08:58:49 -0500 Subject: [PATCH 07/14] tests: Add readability to ignored methods in tests --- archivebox/extractors/readability.py | 4 ++-- tests/fixtures.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 03fa0a88..9c8babe3 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -19,7 +19,6 @@ from ..config import ( SAVE_READABILITY, READABILITY_BINARY, READABILITY_VERSION, - CHROME_BINARY, ) from ..logging_util import TimedProgress @@ -61,6 +60,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO out_dir = Path(out_dir or link.link_dir) output_folder = out_dir.absolute() / "readability" + output = str(output_folder) document = get_html(link, out_dir) temp_doc = NamedTemporaryFile() @@ -107,7 +107,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO cmd=cmd, pwd=str(out_dir), cmd_version=READABILITY_VERSION, - output=str(output_folder), + output=output, status=status, **timer.stats, ) diff --git a/tests/fixtures.py b/tests/fixtures.py index 3d8dabfe..458929d3 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -15,6 +15,7 @@ def disable_extractors_dict(): env.update({ "USE_WGET": "false", "USE_SINGLEFILE": "false", + "USE_READABILITY": "false", "SAVE_PDF": "false", "SAVE_SCREENSHOT": "false", "SAVE_DOM": "false", From 2a68af1b946c59a042d49c3bda414c39fee136b6 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 11 Aug 2020 11:15:15 -0500 Subject: [PATCH 08/14] tests: Add readability tests --- archivebox/extractors/readability.py | 7 +++--- tests/test_extractors.py | 32 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 9c8babe3..91e85468 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -37,7 +37,7 @@ def get_html(link: Link, path: Path) -> str: with open(abs_path / source, "r") as f: document = f.read() break - except FileNotFoundError: + except (FileNotFoundError, TypeError): continue if document is None: return download_url(link.url) @@ -51,6 +51,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool: return False output = Path(out_dir or link.link_dir) / 'readability.json' + print(output, SAVE_READABILITY) return SAVE_READABILITY and (not output.exists()) @@ -63,8 +64,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO output = str(output_folder) document = get_html(link, out_dir) - temp_doc = NamedTemporaryFile() + temp_doc = NamedTemporaryFile(delete=False) temp_doc.write(document.encode("utf-8")) + temp_doc.close() # Readability Docs: https://github.com/mozilla/readability cmd = [ READABILITY_BINARY, @@ -101,7 +103,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO output = err finally: timer.end() - temp_doc.close() return ArchiveResult( cmd=cmd, diff --git a/tests/test_extractors.py b/tests/test_extractors.py index ffb933c1..e085d10e 100644 --- a/tests/test_extractors.py +++ b/tests/test_extractors.py @@ -21,3 +21,35 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict): archived_item_path = list(tmp_path.glob('archive/**/*'))[0] output_file = archived_item_path / "singlefile.html" assert output_file.exists() + +def test_readability_works(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_READABILITY": "true"}) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) + archived_item_path = list(tmp_path.glob("archive/**/*"))[0] + output_file = archived_item_path / "readability" / "content.html" + assert output_file.exists() + +def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"}) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) + archived_item_path = list(tmp_path.glob("archive/**/*"))[0] + output_file = archived_item_path / "readability" / "content.html" + assert output_file.exists() + +def test_readability_works_with_singlefile(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_READABILITY": "true", "USE_SINGLEFILE": "true"}) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) + archived_item_path = list(tmp_path.glob("archive/**/*"))[0] + output_file = archived_item_path / "readability" / "content.html" + assert output_file.exists() + +def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict): + disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true"}) + add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], + capture_output=True, env=disable_extractors_dict) + archived_item_path = list(tmp_path.glob("archive/**/*"))[0] + output_file = archived_item_path / "readability" / "content.html" + assert output_file.exists() From 5dc7e63792286c31988a964e5d5ef3a89a70ced8 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 11 Aug 2020 11:52:43 -0500 Subject: [PATCH 09/14] feat: Update dockerfile to support readability --- Dockerfile | 8 +++++++- archivebox/extractors/readability.py | 3 +-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index c6b898e7..f7b64d75 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,6 +57,11 @@ RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \ && chmod +x SingleFile-master/cli/single-file +RUN wget -qO - https://github.com/pirate/readability-extractor/archive/master.zip > readability.zip \ + && unzip -q readability.zip \ + && npm install --prefix readability-extractor-master --production > /dev/null 2>&1 \ + && chmod +x readability-extractor-master/readability-extractor + # Run everything from here on out as non-privileged user RUN groupadd --system archivebox \ && useradd --system --create-home --gid archivebox --groups audio,video archivebox @@ -74,7 +79,8 @@ EXPOSE 8000 ENV IN_DOCKER=True \ CHROME_BINARY=google-chrome \ CHROME_SANDBOX=False \ - SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file" + SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file" \ + READABILITY_BINARY="$EXTRA_PATH/readability-extractor-master/readability-extractor" RUN env ALLOW_ROOT=True archivebox version diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 91e85468..8b573720 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -50,8 +50,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool: if is_static_file(link.url): return False - output = Path(out_dir or link.link_dir) / 'readability.json' - print(output, SAVE_READABILITY) + output = Path(out_dir or link.link_dir) / 'readability' return SAVE_READABILITY and (not output.exists()) From 4d44b172e67e23fd6a2fb835b9d6547900701ff0 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 11 Aug 2020 11:58:17 -0500 Subject: [PATCH 10/14] tests: Add readability steps to CI --- .github/workflows/test.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 65ba19a6..d0f70e93 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -56,6 +56,22 @@ jobs: - name: Set SINGLEFILE_BINARY run: echo "::set-env name=SINGLEFILE_BINARY::$GITHUB_WORKSPACE/singlefile/cli/single-file" + - uses: actions/checkout@v2 + with: + fetch-depth: 1 + repository: "pirate/readability-extractor" + ref: "master" + path: "readability-extractor" + + - name: Install npm requirements for readability + run: npm install --prefix readability-extractor + + - name: Give readability-extractor execution permissions + run: chmod +x readability-extractor/readability-extractor + + - name: Set READABILITY_BINARY + run: echo "::set-env name=READABILITY_BINARY::$GITHUB_WORKSPACE/readability-extractor/readability-extractor" + - name: Set up Python ${{ matrix.python }} uses: actions/setup-python@v1 with: From eb3528fa9fb339a1f96e2db325c0a82f8c392d71 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 11 Aug 2020 12:14:13 -0500 Subject: [PATCH 11/14] feat: Add readability output to legacy index.html --- archivebox/index/schema.py | 2 ++ archivebox/themes/legacy/link_details.html | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 2129f5d3..c3b6ce8c 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -407,6 +407,7 @@ class Link: 'wget_path': wget_output_path(self), 'warc_path': 'warc', 'singlefile_path': 'singlefile.html', + 'readability_path': 'readability/content.html', 'pdf_path': 'output.pdf', 'screenshot_path': 'screenshot.png', 'dom_path': 'output.html', @@ -427,6 +428,7 @@ class Link: 'screenshot_path': static_path, 'dom_path': static_path, 'singlefile_path': static_path, + 'readability_path': static_path, }) return canonical diff --git a/archivebox/themes/legacy/link_details.html b/archivebox/themes/legacy/link_details.html index 447552ad..1dabae2d 100644 --- a/archivebox/themes/legacy/link_details.html +++ b/archivebox/themes/legacy/link_details.html @@ -348,6 +348,18 @@ +
+
+ +
+ + + +

Readability

+

archive/readability/...

+
+
+
From b7aa3df8d21526f02bf90746b8263a7bee6709ad Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 12 Aug 2020 14:42:21 -0500 Subject: [PATCH 12/14] feat: Disable singlefile and readability by default --- archivebox/config/__init__.py | 4 ++-- archivebox/extractors/__init__.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index b51c7034..e57fcf5e 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -107,8 +107,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'DEPENDENCY_CONFIG': { 'USE_CURL': {'type': bool, 'default': True}, 'USE_WGET': {'type': bool, 'default': True}, - 'USE_SINGLEFILE': {'type': bool, 'default': True}, - 'USE_READABILITY': {'type': bool, 'default': True}, + 'USE_SINGLEFILE': {'type': bool, 'default': False}, + 'USE_READABILITY': {'type': bool, 'default': False}, 'USE_GIT': {'type': bool, 'default': True}, 'USE_CHROME': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 0882c50e..dd388446 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -95,6 +95,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s stats[result.status] += 1 log_archive_method_finished(result) else: + print(' > Skipping extractor: {}'.format(method_name)) stats['skipped'] += 1 except Exception as e: raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format( From 89775ae41694b2bc5edff6a7813f911609745d6d Mon Sep 17 00:00:00 2001 From: Cristian Date: Wed, 12 Aug 2020 15:35:21 -0500 Subject: [PATCH 13/14] feat: Enable singlefile and readability in docker by default --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index f7b64d75..beba4ea5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,7 +79,9 @@ EXPOSE 8000 ENV IN_DOCKER=True \ CHROME_BINARY=google-chrome \ CHROME_SANDBOX=False \ + USE_SINGLEFILE="true" \ SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file" \ + USE_READABILITY="true" \ READABILITY_BINARY="$EXTRA_PATH/readability-extractor-master/readability-extractor" RUN env ALLOW_ROOT=True archivebox version From 03b73bfe77956d3938d0c820d7cd5343980c676a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 14 Aug 2020 12:55:22 -0400 Subject: [PATCH 14/14] Update archivebox/extractors/readability.py --- archivebox/extractors/readability.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 8b573720..c6335a5a 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -30,7 +30,7 @@ def get_html(link: Link, path: Path) -> str: """ canonical = link.canonical_outputs() abs_path = path.absolute() - sources = [canonical["wget_path"], canonical["singlefile_path"], canonical["dom_path"]] + sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]] document = None for source in sources: try: