From eda3836dee4f4b0ebf805ca708bc3f31533788a9 Mon Sep 17 00:00:00 2001 From: ttimasdf Date: Sun, 27 Sep 2020 12:54:52 +0800 Subject: [PATCH] feat: add og:title metadata as alternative title --- archivebox/extractors/title.py | 37 +++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index 7a5cd471..ca42c579 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.extractors' import re +from html.parser import HTMLParser from pathlib import Path from typing import Optional @@ -23,11 +24,32 @@ from ..config import ( from ..logging_util import TimedProgress -HTML_TITLE_REGEX = re.compile( - r'' # start matching text after tag - r'(.[^<>]+)', # get everything up to these symbols - re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE, -) +class TitleParser(HTMLParser): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.title_tag = "" + self.title_og = "" + self.inside_title_tag = False + + @property + def title(self): + return self.title_tag or self.title_og or None + + def handle_starttag(self, tag, attrs): + if tag.lower() == "title" and not self.title_tag: + self.inside_title_tag = True + elif tag.lower() == "meta" and not self.title_og: + attrs = dict(attrs) + if attrs.get("property") == "og:title" and attrs.get("content"): + self.title_og = attrs.get("content") + + def handle_data(self, data): + if self.inside_title_tag and data: + self.title_tag += data.strip() + + def handle_endtag(self, tag): + if tag.lower() == "title": + self.inside_title_tag = False @enforce_types @@ -63,8 +85,9 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) - timer = TimedProgress(timeout, prefix=' ') try: html = download_url(link.url, timeout=timeout) - match = re.search(HTML_TITLE_REGEX, html) - output = htmldecode(match.group(1).strip()) if match else None + parser = TitleParser() + parser.feed(html) + output = parser.title if output: if not link.title or len(output) >= len(link.title): Snapshot.objects.filter(url=link.url, timestamp=link.timestamp).update(title=output)