diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py index f734908c..348b5603 100644 --- a/archivebox/search/utils.py +++ b/archivebox/search/utils.py @@ -65,6 +65,11 @@ class HTMLTextExtractor(HTMLParser): # ancestor matching this end tag while tag != self._tag_stack.pop(): pass + # Write a space after every closing tag outside NOTEXT_TAGS, to ensure that + # tokens in adjacent tag text aren't concatenated. This may result in + # excess spaces, which should be ignored by search tokenizers. + if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS: + self.output.write(" ") except IndexError: # Got to the top of the stack, but somehow missed # this end tag -- maybe malformed markup -- restore the @@ -75,9 +80,8 @@ class HTMLTextExtractor(HTMLParser): # Don't output text data if any ancestor is in NOTEXT_TAGS if self._in_notext_tag(): return - if stripped := data.strip(): - self.output.write(stripped) - self.output.write(" ") + + self.output.write(data) def __str__(self): return self.output.getvalue()