Add ripgrep rg search backend and set as default
This commit is contained in:
parent
8484bdb973
commit
95382b3812
4 changed files with 56 additions and 15 deletions
|
@ -46,7 +46,7 @@ RUN apt-get update -qq \
|
||||||
# Install apt dependencies
|
# Install apt dependencies
|
||||||
RUN apt-get update -qq \
|
RUN apt-get update -qq \
|
||||||
&& apt-get install -qq -y --no-install-recommends \
|
&& apt-get install -qq -y --no-install-recommends \
|
||||||
wget curl chromium git ffmpeg youtube-dl \
|
wget curl chromium git ffmpeg youtube-dl ripgrep \
|
||||||
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
|
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|
|
@ -142,7 +142,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
|
||||||
'SEARCH_BACKEND_CONFIG' : {
|
'SEARCH_BACKEND_CONFIG' : {
|
||||||
'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
|
'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
|
||||||
'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
|
'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
|
||||||
'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'sonic'},
|
'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
|
||||||
'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
|
'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
|
||||||
'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
|
'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
|
||||||
'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
|
'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
|
||||||
|
|
43
archivebox/search/backends/ripgrep.py
Normal file
43
archivebox/search/backends/ripgrep.py
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
import re
|
||||||
|
from subprocess import run, PIPE, DEVNULL
|
||||||
|
from typing import List, Generator
|
||||||
|
|
||||||
|
from archivebox.config import setup_django, ARCHIVE_DIR, ARCHIVE_DIR_NAME
|
||||||
|
from archivebox.util import enforce_types
|
||||||
|
|
||||||
|
# Short rg flags bundled into one argument: -i (case-insensitive), -l (print
# only the paths of matching files), -t (restrict to a file type). Because -t
# is last in the bundle, the argument that follows it on the command line
# (DEFAULT_EXTENSIONS) is taken as the type name.
DEFAULT_ARGUMENTS = '-ilt' # Case insensitive, matching files, types
|
||||||
|
# File type passed right after DEFAULT_ARGUMENTS, i.e. the value for -t.
DEFAULT_EXTENSIONS = 'html'
|
||||||
|
# Flag placed immediately before the user's search text in the rg command.
REGEX_ARGUMENT = '-e'
|
||||||
|
|
||||||
|
# Captures a "<digits>.<digits>" path component enclosed in slashes; search()
# uses the captured value as the snapshot timestamp of a matching file.
TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
|
||||||
|
|
||||||
|
# Compiled once at import time and reused for every path search() inspects.
ts_regex = re.compile(TIMESTAMP_REGEX)
|
||||||
|
|
||||||
|
@enforce_types
def index(snapshot_id: str, texts: List[str]):
    """Accept an indexing request and do nothing.

    This backend keeps no index of its own: search() runs rg over the
    archive directory on every query, so there is nothing to store here.

    Args:
        snapshot_id: Identifier of the snapshot being indexed (unused).
        texts: Text fragments extracted for the snapshot (unused).
    """
    return None
|
||||||
|
|
||||||
|
@enforce_types
def flush(snapshot_ids: Generator[str, None, None]):
    """Accept a flush request and do nothing.

    No index data is kept by this backend, so there is nothing to remove.

    Args:
        snapshot_ids: Identifiers of the snapshots to drop (unused; the
            generator is never consumed).
    """
    return None
|
||||||
|
|
||||||
|
@enforce_types
def search(text: str) -> List[str]:
    """Return the primary keys of snapshots whose archived HTML matches *text*.

    Runs ``rg`` over ARCHIVE_DIR, pulls the timestamp component out of every
    matching file path, and looks up the Snapshot rows with those timestamps.

    Args:
        text: Pattern handed to ripgrep after ``-e``.

    Returns:
        Snapshot primary keys as strings; an empty list when nothing matches.

    Raises:
        Exception: If no ``rg`` executable can be found on PATH.
        subprocess.TimeoutExpired: If ripgrep runs for more than 60 seconds.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import list.
    from shutil import which

    # Resolve rg on PATH in-process. Spawning `which` would itself fail with
    # FileNotFoundError on systems that lack a `which` executable, hiding the
    # intended error message below.
    if which('rg') is None:
        raise Exception("rg binary not found, install ripgrep to use this backend")

    # setup_django() runs before the model import, mirroring the original order.
    setup_django(check_db=True)
    from core.models import Snapshot

    rg = run(
        ['rg', DEFAULT_ARGUMENTS, DEFAULT_EXTENSIONS, REGEX_ARGUMENT, text, str(ARCHIVE_DIR)],
        stdout=PIPE,
        stderr=PIPE,
        timeout=60,
    )

    archive_dir_name = str(ARCHIVE_DIR_NAME)
    timestamps = set()
    for raw_path in rg.stdout.splitlines():
        # errors='replace' keeps one oddly-encoded file name from aborting the
        # whole search; the timestamp itself is plain ASCII digits.
        path = raw_path.decode(errors='replace').replace(archive_dir_name, '')
        match = ts_regex.search(path)
        if match:
            timestamps.add(match.group(1))

    matching_pks = Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)
    return [str(pk) for pk in matching_pks]
|
||||||
|
|
|
@ -21,21 +21,8 @@ services:
|
||||||
environment:
|
environment:
|
||||||
- USE_COLOR=True
|
- USE_COLOR=True
|
||||||
- SHOW_PROGRESS=False
|
- SHOW_PROGRESS=False
|
||||||
- SEARCH_BACKEND_HOST_NAME=sonic
|
|
||||||
- SEARCH_BACKEND_PASSWORD=SecretPassword
|
|
||||||
volumes:
|
volumes:
|
||||||
- ./data:/data
|
- ./data:/data
|
||||||
depends_on:
|
|
||||||
- sonic
|
|
||||||
sonic:
|
|
||||||
image: valeriansaliou/sonic:v1.3.0
|
|
||||||
ports:
|
|
||||||
- 1491:1491
|
|
||||||
environment:
|
|
||||||
- SEARCH_BACKEND_PASSWORD=SecretPassword
|
|
||||||
volumes:
|
|
||||||
- ./etc/sonic/config.cfg:/etc/sonic.cfg
|
|
||||||
- ./data:/var/lib/sonic/store/
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -87,3 +74,14 @@ services:
|
||||||
# volumes:
|
# volumes:
|
||||||
# ./data:/archivebox
|
# ./data:/archivebox
|
||||||
# ./data/wayback:/webarchive
|
# ./data/wayback:/webarchive
|
||||||
|
|
||||||
|
# Example: Run sonic search backend
|
||||||
|
# sonic:
|
||||||
|
# image: valeriansaliou/sonic:v1.3.0
|
||||||
|
# ports:
|
||||||
|
# - 1491:1491
|
||||||
|
# environment:
|
||||||
|
# - SEARCH_BACKEND_PASSWORD=SecretPassword
|
||||||
|
# volumes:
|
||||||
|
# - ./etc/sonic/config.cfg:/etc/sonic.cfg
|
||||||
|
# - ./data:/var/lib/sonic/store/
|
Loading…
Reference in a new issue