Refactor and improve performance of RDoc::Markup::Parser
This change introduces a wrapper around StringScanner that is aware of the current position (column and lineno). It has two advantages: it is faster and more modular.

The old code frequently ran `@input.byteslice(0, byte_offset).length` to get the current position, which was painfully slow. This change tracks the position at each scan instead, which roughly halves the time spent in "Generating RI format into ..." during Ruby's `make rdoc` (5.5 sec -> 3.0 sec).

The old code also spread position tracking across four instance variables (`@input`, `@line`, `@line_pos`, and `@s`). This change factors them out into MyStringScanner, so only one variable (`@s`) is needed.
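For illustration, a hedged sketch of the hot path being removed (the input string and variable names here are invented for the example, not taken from the commit):

```ruby
require 'strscan'

input = ("ä" * 500_000) + "\n"   # ~1 MB of multibyte text, made up for this example
s = StringScanner.new(input)
ret = s.scan(/\S+/)

# Old approach: each position lookup converts the byte offset back to a character
# offset by slicing and re-measuring the whole prefix -- O(n) per token.
old_column = input.byteslice(0, s.pos).length   # => 500_000, but walks ~1 MB to get there

# New approach (what the MyStringScanner class below does): bump a counter by the
# character length of each match, so reading the column is O(1).
tracked_column = 0
tracked_column += ret.length if ret             # => 500_000, no re-scan of the prefix
```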
parent 9d2fed2ccd
commit 0a0760aa63

3 changed files with 65 additions and 67 deletions
@@ -80,10 +80,6 @@ class RDoc::Markup::Parser
     @binary_input = nil
     @current_token = nil
     @debug = false
-    @input = nil
-    @input_encoding = nil
-    @line = 0
-    @line_pos = 0
     @s = nil
     @tokens = []
   end
@@ -319,13 +315,6 @@ class RDoc::Markup::Parser
     verbatim
   end
 
-  ##
-  # The character offset for the input string at the given +byte_offset+
-
-  def char_pos byte_offset
-    @input.byteslice(0, byte_offset).length
-  end
-
   ##
   # Pulls the next token from the stream.
 
@@ -424,15 +413,54 @@ class RDoc::Markup::Parser
     token
   end
 
+  ##
+  # A simple wrapper of StringScanner that is aware of the current column and lineno
+
+  class MyStringScanner
+    def initialize(input)
+      @line = @column = 0
+      @s = StringScanner.new input
+    end
+
+    def scan(re)
+      prev_pos = @s.pos
+      ret = @s.scan(re)
+      @column += ret.length if ret
+      ret
+    end
+
+    def unscan(s)
+      @s.pos -= s.bytesize
+      @column -= s.length
+    end
+
+    def pos
+      [@column, @line]
+    end
+
+    def newline!
+      @column = 0
+      @line += 1
+    end
+
+    def eos?
+      @s.eos?
+    end
+
+    def matched
+      @s.matched
+    end
+
+    def [](i)
+      @s[i]
+    end
+  end
+
   ##
   # Creates the StringScanner
 
   def setup_scanner input
-    @line = 0
-    @line_pos = 0
-    @input = input.dup
-
-    @s = StringScanner.new input
+    @s = MyStringScanner.new input
   end
 
   ##
@@ -467,31 +495,30 @@ class RDoc::Markup::Parser
       @tokens << case
         # [CR]LF => :NEWLINE
         when @s.scan(/\r?\n/) then
-          token = [:NEWLINE, @s.matched, *token_pos(pos)]
-          @line_pos = char_pos @s.pos
-          @line += 1
+          token = [:NEWLINE, @s.matched, *pos]
+          @s.newline!
           token
         # === text => :HEADER then :TEXT
         when @s.scan(/(=+)(\s*)/) then
           level = @s[1].length
-          header = [:HEADER, level, *token_pos(pos)]
+          header = [:HEADER, level, *pos]
 
           if @s[2] =~ /^\r?\n/ then
-            @s.pos -= @s[2].length
+            @s.unscan(@s[2])
             header
           else
             pos = @s.pos
             @s.scan(/.*/)
             @tokens << header
-            [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
+            [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
           end
         # --- (at least 3) and nothing else on the line => :RULE
         when @s.scan(/(-{3,}) *\r?$/) then
-          [:RULE, @s[1].length - 2, *token_pos(pos)]
+          [:RULE, @s[1].length - 2, *pos]
         # * or - followed by white space and text => :BULLET
         when @s.scan(/([*-]) +(\S)/) then
-          @s.pos -= @s[2].bytesize # unget \S
-          [:BULLET, @s[1], *token_pos(pos)]
+          @s.unscan(@s[2])
+          [:BULLET, @s[1], *pos]
        # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
        when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
          # FIXME if tab(s), the column will be wrong
@@ -500,7 +527,7 @@
          # before (and provide a check for that at least in debug
          # mode)
          list_label = @s[1]
-         @s.pos -= @s[2].bytesize # unget \S
+         @s.unscan(@s[2])
          list_type =
            case list_label
            when /[a-z]/ then :LALPHA
@@ -509,24 +536,24 @@
            else
              raise ParseError, "BUG token #{list_label}"
            end
-         [list_type, list_label, *token_pos(pos)]
+         [list_type, list_label, *pos]
        # [text] followed by spaces or end of line => :LABEL
        when @s.scan(/\[(.*?)\]( +|\r?$)/) then
-         [:LABEL, @s[1], *token_pos(pos)]
+         [:LABEL, @s[1], *pos]
        # text:: followed by spaces or end of line => :NOTE
        when @s.scan(/(.*?)::( +|\r?$)/) then
-         [:NOTE, @s[1], *token_pos(pos)]
+         [:NOTE, @s[1], *pos]
        # >>> followed by end of line => :BLOCKQUOTE
        when @s.scan(/>>> *(\w+)?$/) then
-         [:BLOCKQUOTE, @s[1], *token_pos(pos)]
+         [:BLOCKQUOTE, @s[1], *pos]
        # anything else: :TEXT
        else
          @s.scan(/(.*?)( )?\r?$/)
-         token = [:TEXT, @s[1], *token_pos(pos)]
+         token = [:TEXT, @s[1], *pos]
 
          if @s[2] then
            @tokens << token
-           [:BREAK, @s[2], *token_pos(pos + @s[1].length)]
+           [:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
          else
            token
          end
@@ -536,16 +563,6 @@
     self
   end
 
-  ##
-  # Calculates the column (by character) and line of the current token based
-  # on +byte_offset+.
-
-  def token_pos byte_offset
-    offset = char_pos byte_offset
-
-    [offset - @line_pos, @line]
-  end
-
   ##
   # Returns the current token to the token stream
 
@@ -242,19 +242,18 @@ class RDoc::TomDoc < RDoc::Markup::Parser
 
      @tokens << case
        when @s.scan(/\r?\n/) then
-         token = [:NEWLINE, @s.matched, *token_pos(pos)]
-         @line_pos = char_pos @s.pos
-         @line += 1
+         token = [:NEWLINE, @s.matched, *pos]
+         @s.newline!
          token
        when @s.scan(/(Examples|Signature)$/) then
-         @tokens << [:HEADER, 3, *token_pos(pos)]
+         @tokens << [:HEADER, 3, *pos]
 
-         [:TEXT, @s[1], *token_pos(pos)]
+         [:TEXT, @s[1], *pos]
        when @s.scan(/([:\w][\w\[\]]*)[ ]+- /) then
-         [:NOTE, @s[1], *token_pos(pos)]
+         [:NOTE, @s[1], *pos]
        else
          @s.scan(/.*/)
-         [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
+         [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
      end
    end
 
@@ -22,15 +22,6 @@ class TestRDocMarkupParser < RDoc::TestCase
     assert_equal @RM::Heading.new(3, 'heading three'), parser.build_heading(3)
   end
 
-  def test_char_pos
-    parser = @RMP.new
-    s = parser.setup_scanner 'cät'
-
-    s.scan(/\S+/)
-
-    assert_equal 3, parser.char_pos(s.pos)
-  end
-
   def test_get
     parser = util_parser
 
@@ -1647,15 +1638,6 @@ Example heading:
     assert_equal expected, @RMP.tokenize(str)
   end
 
-  def test_token_pos
-    parser = @RMP.new
-    s = parser.setup_scanner 'cät'
-
-    s.scan(/\S+/)
-
-    assert_equal [3, 0], parser.token_pos(s.pos)
-  end
-
   # HACK move to Verbatim test case
   def test_verbatim_normalize
     v = @RM::Verbatim.new "foo\n", "\n", "\n", "bar\n"
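The deleted test_char_pos and test_token_pos covered the byte-offset conversion that no longer exists: with this change, `setup_scanner` returns the wrapper and `pos` already yields the character-based `[column, line]` pair. A hypothetical replacement check (illustrative only, assuming an RDoc that includes this commit) would reduce to:

```ruby
require 'rdoc'

parser = RDoc::Markup::Parser.new
s = parser.setup_scanner 'cät'   # now returns the MyStringScanner wrapper
s.scan(/\S+/)
p s.pos                          # => [3, 0] -- no char_pos/token_pos conversion step
```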