From b864c38d9e9eb661bde0c0c267784b71cb8bd95e Mon Sep 17 00:00:00 2001
From: Joseph Turian
Date: Mon, 12 Sep 2022 20:40:45 +0000
Subject: [PATCH] Don't be strict on unicode errors

---
 archivebox/extractors/media.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py
index c6388a1f..7d73024f 100644
--- a/archivebox/extractors/media.py
+++ b/archivebox/extractors/media.py
@@ -74,7 +74,16 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     # add video description and subtitles to full-text index
     # Let's try a few different
     index_texts = [
-        text_file.read_text(encoding='utf-8').strip()
+        # errors: how undecodable bytes are handled while reading
+        # * 'strict' raises a ValueError (UnicodeDecodeError) on any
+        #   decoding error. The default value of None has the same effect.
+        # * 'ignore' silently drops undecodable bytes, which can lead
+        #   to data loss.
+        # * 'replace' substitutes U+FFFD for each undecodable byte.
+        # * 'xmlcharrefreplace' is only supported when writing, so it must
+        #   not be used here (decoding with it raises TypeError).
+        # More options: https://docs.python.org/3/library/functions.html#open
+        text_file.read_text(encoding='utf-8', errors='replace').strip()
         for text_file in (
             *output_path.glob('*.description'),
             *output_path.glob('*.srt'),