From d8aa84ac9864c8a31eed2abcc1dff7901b7e047c Mon Sep 17 00:00:00 2001
From: Ross Williams
Date: Thu, 12 Oct 2023 13:14:39 -0400
Subject: [PATCH] Make extracting text for indexing optional

Add a configuration option to enable/disable HTML text extraction for indexing
---
 archivebox/config.py       | 1 +
 archivebox/search/utils.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index 795b98e9..4286ce58 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -209,6 +209,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
         'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
         'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
+        'SEARCH_PROCESS_HTML': {'type': bool, 'default': True},
         # SONIC
         'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
         'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index 4573ca69..f734908c 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -4,7 +4,7 @@ import io
 from django.db.models import QuerySet
 
 from archivebox.util import enforce_types
-from archivebox.config import ANSI
+from archivebox.config import ANSI, SEARCH_PROCESS_HTML
 
 BLOCK_SIZE = 32768
 
@@ -128,7 +128,8 @@ def get_indexable_content(results: QuerySet):
     if method == 'readability':
         return get_file_result_content(res, 'content.txt', use_pwd=True)
     elif method == 'singlefile':
-        return get_file_result_content(res, '', use_pwd=True, filter=_extract_html_text)
+        filter = _extract_html_text if SEARCH_PROCESS_HTML else _read_all
+        return get_file_result_content(res, '', use_pwd=True, filter=filter)
     elif method == 'dom':
         return get_file_result_content(res, '', use_pwd=True)
     elif method == 'wget':