From 049f88def984188b9ff49b169d3a27595f099936 Mon Sep 17 00:00:00 2001 From: hannah98 Date: Thu, 30 Dec 2021 20:19:48 +0000 Subject: [PATCH] Added TAG_SEPARATORS option to supply a regex of characters to use when splitting tags --- archivebox/config.py | 1 + archivebox/config_stubs.py | 1 + archivebox/index/sql.py | 11 ++++++++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index 0551de2e..6eba098f 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -79,6 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = { 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages 'URL_WHITELIST': {'type': str, 'default': None}, 'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True}, + 'TAG_SEPARATORS': {'type': str, 'default': '[,]'}, }, 'SERVER_CONFIG': { diff --git a/archivebox/config_stubs.py b/archivebox/config_stubs.py index f9c22a0c..432b86d8 100644 --- a/archivebox/config_stubs.py +++ b/archivebox/config_stubs.py @@ -98,6 +98,7 @@ class ConfigDict(BaseConfig, total=False): WGET_ARGS: List[str] CURL_ARGS: List[str] GIT_ARGS: List[str] + TAG_SEPARATORS: str ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue] diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 661436cf..66402af7 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -1,5 +1,7 @@ __package__ = 'archivebox.index' +import re + from io import StringIO from pathlib import Path from typing import List, Tuple, Iterator @@ -8,7 +10,10 @@ from django.db import transaction from .schema import Link from ..util import enforce_types, parse_date -from ..config import OUTPUT_DIR +from ..config import ( + OUTPUT_DIR, + TAG_SEPARATORS, +) ### Main Links Index @@ -35,7 +40,7 @@ def write_link_to_sql_index(link: Link): info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys} tag_list = list(dict.fromkeys( - tag.strip() for tag in (link.tags or '').split(',') + tag.strip() for tag in re.split(TAG_SEPARATORS, link.tags or '') )) info.pop('tags') @@ -107,7 +112,7 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: snap.title = link.title tag_list = list(dict.fromkeys( - tag.strip() for tag in (link.tags or '').split(',') + tag.strip() for tag in re.split(TAG_SEPARATORS, link.tags or '') )) snap.save()