diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index bb176bfb..7863fb9c 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -25,6 +25,14 @@ from ..config import (
from ..logging_util import TimedProgress
+
+HTML_TITLE_REGEX = re.compile(
+ r'<title.*?>' # start matching text after <title> tag
+ r'(.[^<>]+)', # get everything up to these symbols
+ re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
+)
+
+
class TitleParser(HTMLParser):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -84,12 +92,22 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
timer = TimedProgress(timeout, prefix=' ')
try:
html = download_url(link.url, timeout=timeout)
- parser = TitleParser()
- parser.feed(html)
- output = parser.title
- if output:
+ try:
+ # try using relatively strict html parser first
+ parser = TitleParser()
+ parser.feed(html)
+ output = parser.title
+ except Exception:
+ # fallback to regex that can handle broken/malformed html
+ match = re.search(HTML_TITLE_REGEX, html)
+ output = htmldecode(match.group(1).strip()) if match else None
+
+ # if title is better than the one in the db, update db with new title
+ if isinstance(output, str) and output:
if not link.title or len(output) >= len(link.title):
- Snapshot.objects.filter(url=link.url, timestamp=link.timestamp).update(title=output)
+ Snapshot.objects.filter(url=link.url,
+ timestamp=link.timestamp)\
+ .update(title=output)
else:
raise ArchiveError('Unable to detect page title')
except Exception as err: