add regex fallback back to title parser
This commit is contained in:
parent
79bef1384e
commit
f727ece7b3
1 changed file with 23 additions and 5 deletions
|
@ -25,6 +25,14 @@ from ..config import (
|
||||||
from ..logging_util import TimedProgress
|
from ..logging_util import TimedProgress
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
HTML_TITLE_REGEX = re.compile(
|
||||||
|
r'<title.*?>' # match the opening <title ...> tag, lazily consuming any attributes up to the first '>'
|
||||||
|
r'(.[^<>]+)', # group 1: any one character, then one or more characters that are neither '<' nor '>' (NOTE(review): the leading '.' can match '<', so an empty <title></title> captures '</title')
|
||||||
|
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TitleParser(HTMLParser):
|
class TitleParser(HTMLParser):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
@ -84,12 +92,22 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
try:
|
try:
|
||||||
html = download_url(link.url, timeout=timeout)
|
html = download_url(link.url, timeout=timeout)
|
||||||
parser = TitleParser()
|
try:
|
||||||
parser.feed(html)
|
# try using relatively strict html parser first
|
||||||
output = parser.title
|
parser = TitleParser()
|
||||||
if output:
|
parser.feed(html)
|
||||||
|
output = parser.title
|
||||||
|
except Exception:
|
||||||
|
# fallback to regex that can handle broken/malformed html
|
||||||
|
match = re.search(HTML_TITLE_REGEX, html)
|
||||||
|
output = htmldecode(match.group(1).strip()) if match else None
|
||||||
|
|
||||||
|
# if title is better than the one in the db, update db with new title
|
||||||
|
if isinstance(output, str) and output:
|
||||||
if not link.title or len(output) >= len(link.title):
|
if not link.title or len(output) >= len(link.title):
|
||||||
Snapshot.objects.filter(url=link.url, timestamp=link.timestamp).update(title=output)
|
Snapshot.objects.filter(url=link.url,
|
||||||
|
timestamp=link.timestamp)\
|
||||||
|
.update(title=output)
|
||||||
else:
|
else:
|
||||||
raise ArchiveError('Unable to detect page title')
|
raise ArchiveError('Unable to detect page title')
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
|
|
Loading…
Reference in a new issue