From 95382b381203e92dda76286a30a934ca2cca1ba5 Mon Sep 17 00:00:00 2001
From: JDC
Date: Sun, 22 Nov 2020 20:56:24 -0500
Subject: [PATCH] Add ripgrep rg search backend and set as default

---
 Dockerfile                            |  2 +-
 archivebox/config.py                  |  2 +-
 archivebox/search/backends/ripgrep.py | 61 +++++++++++++++++++++++++++
 docker-compose.yml                    | 24 +++++------
 4 files changed, 74 insertions(+), 15 deletions(-)
 create mode 100644 archivebox/search/backends/ripgrep.py

diff --git a/Dockerfile b/Dockerfile
index 33d4a488..20a410e2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -46,7 +46,7 @@ RUN apt-get update -qq \
 # Install apt dependencies
 RUN apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
-        wget curl chromium git ffmpeg youtube-dl \
+        wget curl chromium git ffmpeg youtube-dl ripgrep \
         fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
     && rm -rf /var/lib/apt/lists/*
 
diff --git a/archivebox/config.py b/archivebox/config.py
index ee2f0b4a..846df0c9 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -142,7 +142,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
     'SEARCH_BACKEND_CONFIG' : {
         'USE_INDEXING_BACKEND':     {'type': bool,  'default': True},
         'USE_SEARCHING_BACKEND':    {'type': bool,  'default': True},
-        'SEARCH_BACKEND_ENGINE':    {'type': str,   'default': 'sonic'},
+        'SEARCH_BACKEND_ENGINE':    {'type': str,   'default': 'ripgrep'},
         'SEARCH_BACKEND_HOST_NAME': {'type': str,   'default': 'localhost'},
         'SEARCH_BACKEND_PORT':      {'type': int,   'default': 1491},
         'SEARCH_BACKEND_PASSWORD':  {'type': str,   'default': 'SecretPassword'},
diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py
new file mode 100644
index 00000000..cd9ecfee
--- /dev/null
+++ b/archivebox/search/backends/ripgrep.py
@@ -0,0 +1,61 @@
+import re
+import shutil
+from subprocess import run, PIPE, DEVNULL
+from typing import List, Generator
+
+from archivebox.config import setup_django, ARCHIVE_DIR, ARCHIVE_DIR_NAME
+from archivebox.util import enforce_types
+
+DEFAULT_ARGUMENTS = '-ilt' # -i: case insensitive, -l: only list matching files, -t: restrict by file type
+DEFAULT_EXTENSIONS = 'html'
+REGEX_ARGUMENT = '-e'
+
+TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
+
+ts_regex = re.compile(TIMESTAMP_REGEX)
+
+@enforce_types
+def index(snapshot_id: str, texts: List[str]):
+    """No-op: this backend keeps no index, search() greps the archive directly."""
+    return
+
+@enforce_types
+def flush(snapshot_ids: Generator[str, None, None]):
+    """No-op: this backend keeps no index, so there is nothing to flush."""
+    return
+
+@enforce_types
+def search(text: str) -> List[str]:
+    """Return the pks (as strings) of the snapshots whose archived html matches `text`.
+
+    `text` is passed to ripgrep as a pattern (-e) and matched case-insensitively
+    against the html files found under ARCHIVE_DIR.
+
+    Raises an Exception if the rg binary cannot be found on PATH.
+    """
+    # Look rg up on PATH directly instead of spawning a `which` subprocess
+    # and depending on a `which` binary being present as well.
+    if shutil.which('rg') is None:
+        raise Exception("rg binary not found, install ripgrep to use this backend")
+
+    setup_django(check_db=True)
+    from core.models import Snapshot
+
+    rg = run(
+        ['rg', DEFAULT_ARGUMENTS, DEFAULT_EXTENSIONS, REGEX_ARGUMENT, text, str(ARCHIVE_DIR)],
+        stdout=PIPE,
+        stderr=PIPE,
+        timeout=60,
+    )
+    # Undecodable bytes in a single file name must not abort the whole search
+    file_paths = [p.decode(errors='replace').replace(str(ARCHIVE_DIR_NAME), '') for p in rg.stdout.splitlines()]
+
+    # Pull the snapshot timestamp (the /<digits>.<digits>/ component) out of each matching path
+    timestamps = set()
+    for path in file_paths:
+        match = ts_regex.search(path)
+        if match:
+            timestamps.add(match.group(1))
+
+    snapshot_pks = Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)
+    return [str(pk) for pk in snapshot_pks]
diff --git a/docker-compose.yml b/docker-compose.yml
index 29fc6f7a..c76f734a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -21,21 +21,8 @@ services:
         environment:
             - USE_COLOR=True
             - SHOW_PROGRESS=False
-            - SEARCH_BACKEND_HOST_NAME=sonic
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
         volumes:
             - ./data:/data
-        depends_on:
-            - sonic
-    sonic:
-        image: valeriansaliou/sonic:v1.3.0
-        ports:
-            - 1491:1491
-        environment:
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
-        volumes:
-            - ./etc/sonic/config.cfg:/etc/sonic.cfg
-            - ./data:/var/lib/sonic/store/
 
 
 
@@ -87,3 +74,14 @@ services:
     #     volumes:
     #         ./data:/archivebox
     #         ./data/wayback:/webarchive
+
+    # Example: Run sonic search backend
+    # sonic:
+    #     image: valeriansaliou/sonic:v1.3.0
+    #     ports:
+    #         - 1491:1491
+    #     environment:
+    #         - SEARCH_BACKEND_PASSWORD=SecretPassword
+    #     volumes:
+    #         - ./etc/sonic/config.cfg:/etc/sonic.cfg
+    #         - ./data:/var/lib/sonic/store/
\ No newline at end of file