1
0
Fork 0

Make extracting text for indexing optional

Add a configuration option, SEARCH_PROCESS_HTML (default: True), to enable
or disable HTML text extraction for indexing.
This commit is contained in:
Ross Williams 2023-10-12 13:14:39 -04:00
parent b6a20c962a
commit d8aa84ac98
2 changed files with 4 additions and 2 deletions

View file

@ -209,6 +209,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'}, 'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491}, 'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'}, 'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
'SEARCH_PROCESS_HTML': {'type': bool, 'default': True},
# SONIC # SONIC
'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'}, 'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
'SONIC_BUCKET': {'type': str, 'default': 'snapshots'}, 'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},

View file

@ -4,7 +4,7 @@ import io
from django.db.models import QuerySet from django.db.models import QuerySet
from archivebox.util import enforce_types from archivebox.util import enforce_types
from archivebox.config import ANSI from archivebox.config import ANSI, SEARCH_PROCESS_HTML
BLOCK_SIZE = 32768 BLOCK_SIZE = 32768
@ -128,7 +128,8 @@ def get_indexable_content(results: QuerySet):
if method == 'readability': if method == 'readability':
return get_file_result_content(res, 'content.txt', use_pwd=True) return get_file_result_content(res, 'content.txt', use_pwd=True)
elif method == 'singlefile': elif method == 'singlefile':
return get_file_result_content(res, '', use_pwd=True, filter=_extract_html_text) filter = _extract_html_text if SEARCH_PROCESS_HTML else _read_all
return get_file_result_content(res, '', use_pwd=True, filter=filter)
elif method == 'dom': elif method == 'dom':
return get_file_result_content(res, '', use_pwd=True) return get_file_result_content(res, '', use_pwd=True)
elif method == 'wget': elif method == 'wget':