1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

Refactor and improve performance of RDoc::Markup::Parser

This change introduces a wrapper around StringScanner that is aware of
the current position (column and lineno).
It has two advantages: it is faster and more modular.

The old code frequently ran `@input.byteslice(0, byte_offset).length`
to get the current position, which was painfully slow.  This change
keeps track of the position at each scan, which roughly halves the
time of "Generating RI format into ..." in Ruby's `make rdoc`
(5.5 sec -> 3.0 sec).

And the old code used four instance variables (`@input`, `@line`,
`@line_pos`, and `@s`) to track the position.  This change factors them
out into MyStringScanner, so now only one variable (`@s`) is needed.
This commit is contained in:
Yusuke Endoh 2019-08-07 01:53:56 +09:00 committed by aycabta
parent 9d2fed2ccd
commit 0a0760aa63
3 changed files with 65 additions and 67 deletions

View file

@ -80,10 +80,6 @@ class RDoc::Markup::Parser
@binary_input = nil @binary_input = nil
@current_token = nil @current_token = nil
@debug = false @debug = false
@input = nil
@input_encoding = nil
@line = 0
@line_pos = 0
@s = nil @s = nil
@tokens = [] @tokens = []
end end
@ -319,13 +315,6 @@ class RDoc::Markup::Parser
verbatim verbatim
end end
##
# Converts +byte_offset+ into a character count: the number of characters
# contained in the first +byte_offset+ bytes of the input string.

def char_pos byte_offset
  leading = @input.byteslice(0, byte_offset)
  leading.length
end
## ##
# Pulls the next token from the stream. # Pulls the next token from the stream.
@ -424,15 +413,54 @@ class RDoc::Markup::Parser
token token
end end
##
# A simple wrapper of StringScanner that is aware of the current column and
# lineno.
#
# The column is counted in characters, while the wrapped StringScanner is
# moved in bytes.  The line number only changes when #newline! is called;
# scanning a line terminator does not advance it by itself.

class MyStringScanner

  ##
  # Wraps +input+ in a StringScanner, starting at column 0 of line 0.

  def initialize(input)
    @line = @column = 0
    @s = StringScanner.new input
  end

  ##
  # Scans +re+ with the wrapped StringScanner.  On a match the column is
  # advanced by the character length of the matched text, which is returned.
  # Returns nil and leaves the column untouched when nothing matched.

  def scan(re)
    ret = @s.scan(re)
    @column += ret.length if ret
    ret
  end

  ##
  # Pushes the string +s+ back: the scanner position is rewound by the byte
  # size of +s+ and the column by its character length.
  #
  # NOTE: +s+ is not verified against the input; callers are expected to
  # pass text that was just scanned on the current line.

  def unscan(s)
    @s.pos -= s.bytesize
    @column -= s.length
  end

  ##
  # The current position as a <tt>[column, line]</tt> pair.

  def pos
    [@column, @line]
  end

  ##
  # Records that a new line has started: resets the column to 0 and
  # increments the line number.

  def newline!
    @column = 0
    @line += 1
  end

  ##
  # Delegates to StringScanner#eos?

  def eos?
    @s.eos?
  end

  ##
  # Delegates to StringScanner#matched

  def matched
    @s.matched
  end

  ##
  # Delegates to StringScanner#[] for group +i+ of the last match.

  def [](i)
    @s[i]
  end

end
## ##
# Creates the StringScanner # Creates the StringScanner
def setup_scanner input def setup_scanner input
@line = 0 @s = MyStringScanner.new input
@line_pos = 0
@input = input.dup
@s = StringScanner.new input
end end
## ##
@ -467,31 +495,30 @@ class RDoc::Markup::Parser
@tokens << case @tokens << case
# [CR]LF => :NEWLINE # [CR]LF => :NEWLINE
when @s.scan(/\r?\n/) then when @s.scan(/\r?\n/) then
token = [:NEWLINE, @s.matched, *token_pos(pos)] token = [:NEWLINE, @s.matched, *pos]
@line_pos = char_pos @s.pos @s.newline!
@line += 1
token token
# === text => :HEADER then :TEXT # === text => :HEADER then :TEXT
when @s.scan(/(=+)(\s*)/) then when @s.scan(/(=+)(\s*)/) then
level = @s[1].length level = @s[1].length
header = [:HEADER, level, *token_pos(pos)] header = [:HEADER, level, *pos]
if @s[2] =~ /^\r?\n/ then if @s[2] =~ /^\r?\n/ then
@s.pos -= @s[2].length @s.unscan(@s[2])
header header
else else
pos = @s.pos pos = @s.pos
@s.scan(/.*/) @s.scan(/.*/)
@tokens << header @tokens << header
[:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)] [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
end end
# --- (at least 3) and nothing else on the line => :RULE # --- (at least 3) and nothing else on the line => :RULE
when @s.scan(/(-{3,}) *\r?$/) then when @s.scan(/(-{3,}) *\r?$/) then
[:RULE, @s[1].length - 2, *token_pos(pos)] [:RULE, @s[1].length - 2, *pos]
# * or - followed by white space and text => :BULLET # * or - followed by white space and text => :BULLET
when @s.scan(/([*-]) +(\S)/) then when @s.scan(/([*-]) +(\S)/) then
@s.pos -= @s[2].bytesize # unget \S @s.unscan(@s[2])
[:BULLET, @s[1], *token_pos(pos)] [:BULLET, @s[1], *pos]
# A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
when @s.scan(/([a-z]|\d+)\. +(\S)/i) then when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
# FIXME if tab(s), the column will be wrong # FIXME if tab(s), the column will be wrong
@ -500,7 +527,7 @@ class RDoc::Markup::Parser
# before (and provide a check for that at least in debug # before (and provide a check for that at least in debug
# mode) # mode)
list_label = @s[1] list_label = @s[1]
@s.pos -= @s[2].bytesize # unget \S @s.unscan(@s[2])
list_type = list_type =
case list_label case list_label
when /[a-z]/ then :LALPHA when /[a-z]/ then :LALPHA
@ -509,24 +536,24 @@ class RDoc::Markup::Parser
else else
raise ParseError, "BUG token #{list_label}" raise ParseError, "BUG token #{list_label}"
end end
[list_type, list_label, *token_pos(pos)] [list_type, list_label, *pos]
# [text] followed by spaces or end of line => :LABEL # [text] followed by spaces or end of line => :LABEL
when @s.scan(/\[(.*?)\]( +|\r?$)/) then when @s.scan(/\[(.*?)\]( +|\r?$)/) then
[:LABEL, @s[1], *token_pos(pos)] [:LABEL, @s[1], *pos]
# text:: followed by spaces or end of line => :NOTE # text:: followed by spaces or end of line => :NOTE
when @s.scan(/(.*?)::( +|\r?$)/) then when @s.scan(/(.*?)::( +|\r?$)/) then
[:NOTE, @s[1], *token_pos(pos)] [:NOTE, @s[1], *pos]
# >>> followed by end of line => :BLOCKQUOTE # >>> followed by end of line => :BLOCKQUOTE
when @s.scan(/>>> *(\w+)?$/) then when @s.scan(/>>> *(\w+)?$/) then
[:BLOCKQUOTE, @s[1], *token_pos(pos)] [:BLOCKQUOTE, @s[1], *pos]
# anything else: :TEXT # anything else: :TEXT
else else
@s.scan(/(.*?)( )?\r?$/) @s.scan(/(.*?)( )?\r?$/)
token = [:TEXT, @s[1], *token_pos(pos)] token = [:TEXT, @s[1], *pos]
if @s[2] then if @s[2] then
@tokens << token @tokens << token
[:BREAK, @s[2], *token_pos(pos + @s[1].length)] [:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
else else
token token
end end
@ -536,16 +563,6 @@ class RDoc::Markup::Parser
self self
end end
##
# Returns <tt>[column, line]</tt> for the token at +byte_offset+.  The
# column is the character offset of +byte_offset+ (via #char_pos) minus
# <tt>@line_pos</tt>; the line is the current value of <tt>@line</tt>.

def token_pos byte_offset
  column = char_pos(byte_offset) - @line_pos
  [column, @line]
end
## ##
# Returns the current token to the token stream # Returns the current token to the token stream

View file

@ -242,19 +242,18 @@ class RDoc::TomDoc < RDoc::Markup::Parser
@tokens << case @tokens << case
when @s.scan(/\r?\n/) then when @s.scan(/\r?\n/) then
token = [:NEWLINE, @s.matched, *token_pos(pos)] token = [:NEWLINE, @s.matched, *pos]
@line_pos = char_pos @s.pos @s.newline!
@line += 1
token token
when @s.scan(/(Examples|Signature)$/) then when @s.scan(/(Examples|Signature)$/) then
@tokens << [:HEADER, 3, *token_pos(pos)] @tokens << [:HEADER, 3, *pos]
[:TEXT, @s[1], *token_pos(pos)] [:TEXT, @s[1], *pos]
when @s.scan(/([:\w][\w\[\]]*)[ ]+- /) then when @s.scan(/([:\w][\w\[\]]*)[ ]+- /) then
[:NOTE, @s[1], *token_pos(pos)] [:NOTE, @s[1], *pos]
else else
@s.scan(/.*/) @s.scan(/.*/)
[:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)] [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
end end
end end

View file

@ -22,15 +22,6 @@ class TestRDocMarkupParser < RDoc::TestCase
assert_equal @RM::Heading.new(3, 'heading three'), parser.build_heading(3) assert_equal @RM::Heading.new(3, 'heading three'), parser.build_heading(3)
end end
def test_char_pos
  parser = @RMP.new

  # 'cät' holds a multibyte character, so the scanner's byte position after
  # consuming the word differs from the expected character count of 3.
  scanner = parser.setup_scanner 'cät'
  scanner.scan(/\S+/)

  assert_equal 3, parser.char_pos(scanner.pos)
end
def test_get def test_get
parser = util_parser parser = util_parser
@ -1647,15 +1638,6 @@ Example heading:
assert_equal expected, @RMP.tokenize(str) assert_equal expected, @RMP.tokenize(str)
end end
def test_token_pos
  parser = @RMP.new

  # After scanning the whole word on the first line, the expected position
  # is column 3 (in characters) of line 0.
  scanner = parser.setup_scanner 'cät'
  scanner.scan(/\S+/)

  assert_equal [3, 0], parser.token_pos(scanner.pos)
end
# HACK move to Verbatim test case # HACK move to Verbatim test case
def test_verbatim_normalize def test_verbatim_normalize
v = @RM::Verbatim.new "foo\n", "\n", "\n", "bar\n" v = @RM::Verbatim.new "foo\n", "\n", "\n", "bar\n"