From 82d8662c74c598a403fdabc9a6fd0bd4ceda0508 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 20 Oct 2023 04:14:28 -0700 Subject: [PATCH] add more readability error output --- archivebox/extractors/readability.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index a1689f95..e6e5e061 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO result = run(cmd, cwd=out_dir, timeout=timeout) try: result_json = json.loads(result.stdout) - assert result_json and 'content' in result_json + assert result_json and 'content' in result_json, 'Readability output is not valid JSON' except json.JSONDecodeError: raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr) @@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" output_tail = [ line.strip() - for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] + for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:] if line.strip() ] hints = (