mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00

This change introduces a wrapper around StringScanner that is aware of the current position (column and lineno). It has two advantages: it is faster and more modular. The old code frequently ran `@input.byteslice(0, byte_offset).length` to get the current position, which was painfully slow. This change keeps track of the position at each scan, which roughly halves the time spent in "Generating RI format into ..." during Ruby's `make rdoc` (5.5 sec -> 3.0 sec). The old code also used four instance variables (`@input`, `@line`, `@line_pos`, and `@s`) to track the position. This change factors them out into MyStringScanner, so now only one variable (`@s`) is needed.
576 lines
14 KiB
Ruby
576 lines
14 KiB
Ruby
# frozen_string_literal: true
|
|
require 'strscan'
|
|
|
|
##
|
|
# A recursive-descent parser for RDoc markup.
|
|
#
|
|
# The parser tokenizes an input string then parses the tokens into a Document.
|
|
# Documents can be converted into output formats by writing a visitor like
|
|
# RDoc::Markup::ToHTML.
|
|
#
|
|
# The parser only handles the block-level constructs Paragraph, List,
|
|
# ListItem, Heading, Verbatim, BlankLine, Rule and BlockQuote.
|
|
# Inline markup such as <tt>\+blah\+</tt> is handled separately by
|
|
# RDoc::Markup::AttributeManager.
|
|
#
|
|
# To see what markup the Parser implements read RDoc. To see how to use
|
|
# RDoc markup to format text in your program read RDoc::Markup.
|
|
|
|
class RDoc::Markup::Parser
|
|
|
|
include RDoc::Text
|
|
|
|
##
# List token types
#
# Token types that introduce a list item.  The constant is frozen so the
# shared array cannot be modified by accident.

LIST_TOKENS = %i[
  BULLET
  LABEL
  LALPHA
  NOTE
  NUMBER
  UALPHA
].freeze
|
|
|
|
##
# Parser error subclass

class Error < RuntimeError
end

##
# Raised when the parser is unable to handle the given markup

class ParseError < Error
end
|
|
|
|
##
# Enables display of debugging information.  When true, the parser prints
# its tokens and block boundaries with Kernel#p as it works.

attr_accessor :debug

##
# Token accessor.  Returns the array of tokens not yet consumed.

attr_reader :tokens
|
|
|
|
##
# Parses +str+ into a Document.
#
# A new parser tokenizes +str+, then parses the tokens into a new
# RDoc::Markup::Document.  Returns whatever #parse returns for that document.
#
# Use RDoc::Markup#parse instead of this method.

def self.parse(str)
  parser = new
  parser.tokenize str
  parser.parse RDoc::Markup::Document.new
end
|
|
|
|
##
# Returns a token stream for +str+, for testing
#
# A new parser tokenizes +str+ and its token array is returned without
# being parsed.

def self.tokenize(str)
  parser = new
  parser.tokenize str
  parser.tokens
end
|
|
|
|
##
# Creates a new Parser.  See also ::parse
#
# The parser starts with an empty token list, no scanner, no current token
# and debugging output disabled.

def initialize
  @binary_input  = nil
  @current_token = nil
  @debug         = false
  @s             = nil # scanner, created by #setup_scanner
  @tokens        = []
end
|
|
|
|
##
# Builds a Heading of +level+
#
# If the next token is TEXT it becomes the heading text and the NEWLINE that
# follows it is consumed.  Otherwise the token is put back and the heading
# text is empty.

def build_heading(level)
  type, text, = get

  if type == :TEXT
    skip :NEWLINE
  else
    # not heading text: leave the token for the caller
    unget
    text = ''
  end

  RDoc::Markup::Heading.new level, text
end
|
|
|
|
##
# Builds a List flush to +margin+
#
# Consumes consecutive list-item tokens of a single type that are not to the
# left of +margin+, parsing the body of each item with #parse.  Stops at the
# first token that is not a list marker, is left of +margin+, or is a list
# marker of a different type; that token is put back.
#
# Returns the List, or +nil+ when no item could be built.

def build_list(margin)
  p :list_start => margin if @debug

  list = RDoc::Markup::List.new
  label = nil # labels collected from LABEL/NOTE items with no description

  until @tokens.empty?
    type, data, column, = get

    # anything that is not a list marker ends the list
    unless LIST_TOKENS.include? type
      unget
      break
    end

    # a marker left of the margin or of another type belongs to another list
    if column < margin || (list.type && list.type != type)
      unget
      break
    end

    list.type = type
    peek_type, _, column, = peek_token

    if type == :NOTE || type == :LABEL
      label ||= []

      if peek_type == :NEWLINE
        # description not on the same line as LABEL/NOTE
        # skip the trailing newline & any blank lines below
        while peek_type == :NEWLINE
          get
          peek_type, _, column, = peek_token
        end

        # we may be:
        #   - at end of stream
        #   - at a column < margin (e.g. "[text]" followed by flush text)
        #   - at the same column, but with a different type of list item
        #     (e.g. "[text]" followed by "* blah blah")
        #   - at the same column, with the same type of list item
        #     (e.g. "[one]" followed by "[two]")
        # In all cases, we have an empty description.
        # In the last case only, we continue.
        empty =
          if peek_type.nil? || column < margin
            true
          elsif column == margin
            case peek_type
            when type         then :continue
            when *LIST_TOKENS then true
            else                   false
            end
          else
            false
          end

        if empty
          label << data
          next if empty == :continue
          break
        end
      end
    else
      # BULLET, NUMBER, LALPHA and UALPHA items carry no label
      data = nil
    end

    if label
      data = label << data
      label = nil
    end

    list_item = RDoc::Markup::ListItem.new data
    parse list_item, column
    list << list_item
  end

  p :list_end => margin if @debug

  if list.empty?
    # only labels without any description were seen
    return nil unless label
    return nil unless [:LABEL, :NOTE].include? list.type

    list_item = RDoc::Markup::ListItem.new label, RDoc::Markup::BlankLine.new
    list << list_item
  end

  list
end
|
|
|
|
##
# Builds a Paragraph that is flush to +margin+
#
# Consumes consecutive TEXT tokens whose column equals +margin+.  Each line
# except the last gets a single trailing space so joined lines stay
# separated.  Stops before a BREAK token, or at the first token that is not
# TEXT at +margin+ (that token is put back).
#
# Returns the Paragraph, which has no parts if no TEXT token was consumed.

def build_paragraph(margin)
  p :paragraph_start => margin if @debug

  paragraph = RDoc::Markup::Paragraph.new

  until @tokens.empty?
    type, data, column, = get

    unless type == :TEXT && column == margin
      unget
      break
    end

    paragraph << data

    # a hard line break ends the paragraph; the BREAK stays on the stream
    break if peek_token.first == :BREAK

    # data is the string just added above, so this adds the joining space
    data << ' ' if skip :NEWLINE
  end

  # cleanup: drop the joining space after the final line.  The paragraph may
  # be empty when the first token was not TEXT at +margin+.
  paragraph.parts.last&.sub!(/ \z/, '')

  p :paragraph_end => margin if @debug

  paragraph
end
|
|
|
|
##
# Builds a Verbatim that is indented from +margin+.
#
# The verbatim block is shifted left (the least indented lines start in
# column 0).  Each part of the verbatim is one line of text, always
# terminated by a newline.  Blank lines always consist of a single newline
# character, and there is never a single newline at the end of the verbatim.
#
# Tokens are turned back into their source text.  Stops at the first
# non-NEWLINE token whose column is not greater than +margin+; that token is
# put back.

def build_verbatim(margin)
  p :verbatim_begin => margin if @debug
  verbatim = RDoc::Markup::Verbatim.new

  min_indent = nil # smallest indentation seen, relative to margin
  generate_leading_spaces = true
  line = ''.dup

  until @tokens.empty?
    type, data, column, = get

    if type == :NEWLINE
      line << data
      verbatim << line
      line = ''.dup
      generate_leading_spaces = true
      next
    end

    if column <= margin
      unget
      break
    end

    # first token on a line: rebuild its indentation
    if generate_leading_spaces
      indent = column - margin
      line << ' ' * indent
      min_indent = indent if min_indent.nil? || indent < min_indent
      generate_leading_spaces = false
    end

    case type
    when :HEADER
      line << '=' * data
      # pad up to the column of the next token
      _, _, peek_column, = peek_token
      peek_column ||= column + data
      line << ' ' * (peek_column - column - data)
    when :RULE
      width = 2 + data
      line << '-' * width
      _, _, peek_column, = peek_token
      peek_column ||= column + width
      line << ' ' * (peek_column - column - width)
    when :BREAK, :TEXT
      line << data
    else # *LIST_TOKENS
      list_marker = case type
                    when :BULLET then data
                    when :LABEL  then "[#{data}]"
                    when :NOTE   then "#{data}::"
                    else # :LALPHA, :NUMBER, :UALPHA
                      "#{data}."
                    end
      line << list_marker
      peek_type, _, peek_column = peek_token
      unless peek_type == :NEWLINE
        peek_column ||= column + list_marker.length
        line << ' ' * (peek_column - column - list_marker.length)
      end
    end
  end

  verbatim << line << "\n" unless line.empty?

  # min_indent is still nil when only NEWLINE tokens (or none) were consumed
  if min_indent && min_indent > 0
    verbatim.parts.each do |part|
      part.slice!(0, min_indent) unless part == "\n"
    end
  end

  verbatim.normalize

  p :verbatim_end => margin if @debug

  verbatim
end
|
|
|
|
##
# Pulls the next token from the stream.
#
# The token is removed from the front of the token list, remembered as the
# current token (so #unget can put it back) and returned.  Returns +nil+
# when the stream is empty.

def get
  @current_token = @tokens.shift
  p :get => @current_token if @debug
  @current_token
end
|
|
|
|
##
# Parses the tokens into an array of RDoc::Markup::XXX objects,
# and appends them to the passed +parent+ RDoc::Markup::YYY object.
#
# Exits at the end of the token stream, or when it encounters a token
# in a column less than +indent+ (unless it is a NEWLINE).
#
# Raises ParseError for a token type it does not know how to handle.
#
# Returns +parent+.

def parse(parent, indent = 0)
  p :parse_start => indent if @debug

  until @tokens.empty?
    type, data, column, = get

    # BREAK, or a NEWLINE seen here, is a blank line: trailing newlines are
    # consumed by the builders below
    if type == :BREAK || type == :NEWLINE
      parent << RDoc::Markup::BlankLine.new
      skip :NEWLINE, false
      next
    end

    # indentation change: break or verbatim
    if column < indent
      unget
      break
    elsif column > indent
      unget
      parent << build_verbatim(indent)
      next
    end

    # indentation is the same
    case type
    when :HEADER
      parent << build_heading(data)
    when :RULE
      parent << RDoc::Markup::Rule.new(data)
      skip :NEWLINE
    when :TEXT
      unget
      parse_text parent, indent
    when :BLOCKQUOTE
      # look past one optional NEWLINE to find the column of the quoted text
      type, _, column = get
      type, _, column = get if type == :NEWLINE
      unget if type
      bq = RDoc::Markup::BlockQuote.new
      p :blockquote_start => [data, column] if @debug
      parse bq, column
      p :blockquote_end => indent if @debug
      parent << bq
    when *LIST_TOKENS
      unget
      parent << build_list(indent)
    else
      type, data, column, line = @current_token
      raise ParseError, "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
    end
  end

  p :parse_end => indent if @debug

  parent
end
|
|
|
|
##
# Small hook that is overridden by RDoc::TomDoc
#
# Appends the paragraph built at +indent+ to +parent+.

def parse_text(parent, indent) # :nodoc:
  parent << build_paragraph(indent)
end
|
|
|
|
##
# Returns the next token on the stream without modifying the stream
#
# Returns an empty array at the end of the stream, so callers can
# destructure the result without a nil check.

def peek_token
  token = @tokens.first || []
  p :peek => token if @debug
  token
end
|
|
|
|
##
# A simple wrapper of StringScanner that is aware of the current column and lineno
#
# The column advances by the character length of every successful #scan.
# The line number only changes when the caller invokes #newline!.

class MyStringScanner
  ##
  # Wraps +input+ in a StringScanner positioned at column 0, line 0.

  def initialize(input)
    @line = @column = 0
    @s = StringScanner.new input
  end

  ##
  # Scans +re+ at the current position.  On a match, advances the column by
  # the length of the matched text in characters and returns the text;
  # returns +nil+ otherwise.

  def scan(re)
    ret = @s.scan(re)
    @column += ret.length if ret
    ret
  end

  ##
  # Moves the scanner back over +s+, which must be the text that was just
  # scanned.  The scan pointer is a byte offset, so it is rewound by the
  # byte size of +s+, while the column is rewound by its character length.

  def unscan(s)
    @s.pos -= s.bytesize
    @column -= s.length
  end

  ##
  # Returns the current position as <tt>[column, line]</tt>.

  def pos
    [@column, @line]
  end

  ##
  # Records that a line ended: resets the column and increments the line.

  def newline!
    @column = 0
    @line += 1
  end

  ##
  # True when the whole input has been scanned.

  def eos?
    @s.eos?
  end

  ##
  # The text of the last match.

  def matched
    @s.matched
  end

  ##
  # Capture group +i+ of the last match.

  def [](i)
    @s[i]
  end
end
|
|
|
|
##
# Creates the StringScanner
#
# Stores a MyStringScanner over +input+ in <tt>@s</tt> and returns it.

def setup_scanner(input)
  @s = MyStringScanner.new input
end
|
|
|
|
##
# Skips the next token if its type is +token_type+.
#
# Returns the skipped token, or +nil+ at the end of the stream.  A token of
# another type is put back on the stream.
#
# Optionally raises an error if the next token is not of the expected type.

def skip(token_type, error = true)
  type, = get
  return unless type # end of stream
  return @current_token if token_type == type

  unget
  raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if error
end
|
|
|
|
##
# Turns text +input+ into a stream of tokens
#
# Each token is an array <tt>[type, data, column, line]</tt>, where column
# and line are the scanner position at which the token started.  Tokens are
# appended to the token list.  The order of the patterns below matters: the
# first one that matches wins.
#
# Returns +self+.

def tokenize(input)
  setup_scanner input

  until @s.eos?
    pos = @s.pos

    # leading spaces will be reflected by the column of the next token
    # the only thing we lose are trailing spaces at the end of the file
    next if @s.scan(/ +/)

    # note: after BULLET, LABEL, etc.,
    # indent will be the column of the next non-newline token

    @tokens << case
               # [CR]LF => :NEWLINE
               when @s.scan(/\r?\n/)
                 token = [:NEWLINE, @s.matched, *pos]
                 @s.newline!
                 token
               # === text => :HEADER then :TEXT
               when @s.scan(/(=+)(\s*)/)
                 level = @s[1].length
                 header = [:HEADER, level, *pos]

                 if @s[2] =~ /^\r?\n/
                   # heading without text: give the line ending back
                   @s.unscan(@s[2])
                   header
                 else
                   pos = @s.pos
                   @s.scan(/.*/)
                   @tokens << header
                   [:TEXT, @s.matched.sub(/\r$/, ''), *pos]
                 end
               # --- (at least 3) and nothing else on the line => :RULE
               when @s.scan(/(-{3,}) *\r?$/)
                 [:RULE, @s[1].length - 2, *pos]
               # * or - followed by white space and text => :BULLET
               when @s.scan(/([*-]) +(\S)/)
                 # the first character of the text belongs to the next token
                 @s.unscan(@s[2])
                 [:BULLET, @s[1], *pos]
               # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
               when @s.scan(/([a-z]|\d+)\. +(\S)/i)
                 # FIXME if tab(s), the column will be wrong
                 # either support tabs everywhere by first expanding them to
                 # spaces, or assume that they will have been replaced
                 # before (and provide a check for that at least in debug
                 # mode)
                 list_label = @s[1]
                 @s.unscan(@s[2])
                 list_type =
                   case list_label
                   when /[a-z]/ then :LALPHA
                   when /[A-Z]/ then :UALPHA
                   when /\d/    then :NUMBER
                   else
                     raise ParseError, "BUG token #{list_label}"
                   end
                 [list_type, list_label, *pos]
               # [text] followed by spaces or end of line => :LABEL
               when @s.scan(/\[(.*?)\]( +|\r?$)/)
                 [:LABEL, @s[1], *pos]
               # text:: followed by spaces or end of line => :NOTE
               when @s.scan(/(.*?)::( +|\r?$)/)
                 [:NOTE, @s[1], *pos]
               # >>> followed by end of line => :BLOCKQUOTE
               when @s.scan(/>>> *(\w+)?$/)
                 [:BLOCKQUOTE, @s[1], *pos]
               # anything else: :TEXT
               else
                 # NOTE(review): the optional group below is what produces
                 # BREAK tokens; its whitespace may have been collapsed when
                 # this file was extracted - confirm against upstream.
                 @s.scan(/(.*?)( )?\r?$/)
                 token = [:TEXT, @s[1], *pos]

                 if @s[2]
                   @tokens << token
                   [:BREAK, @s[2], pos[0] + @s[1].length, pos[1]]
                 else
                   token
                 end
               end
  end

  self
end
|
|
|
|
##
# Returns the current token to the token stream
#
# Raises Error if the current token is already at the front of the stream,
# which means #unget was called twice without a #get in between.  This
# includes the case where both the current token and the stream are empty.

def unget
  token = @current_token
  p :unget => token if @debug
  raise Error, 'too many #ungets' if token == @tokens.first
  @tokens.unshift token if token
end
|
|
|
|
end
|