require 'strscan'
require 'rdoc/text'

##
# A recursive-descent parser for RDoc markup.
#
# The parser tokenizes an input string then parses the tokens into a Document.
# Documents can be converted into output formats by writing a visitor like
# RDoc::Markup::ToHTML.
#
# The parser only handles the block-level constructs Paragraph, List,
# ListItem, Heading, Verbatim, BlankLine and Rule.  Inline markup such as
# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
#
# To see what markup the Parser implements read RDoc.  To see how to use
# RDoc markup to format text in your program read RDoc::Markup.
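#
# A minimal usage sketch (the sample markup string is only illustrative):
#
#   document = RDoc::Markup::Parser.parse "= Heading\n\nSome paragraph text."
#   # => an RDoc::Markup::Document containing a Heading, a BlankLine and a
#   #    Paragraph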

class RDoc::Markup::Parser

  include RDoc::Text

  ##
  # List token types

  LIST_TOKENS = [
    :BULLET,
    :LABEL,
    :LALPHA,
    :NOTE,
    :NUMBER,
    :UALPHA,
  ]

  ##
  # Parser error subclass

  class Error < RuntimeError; end

  ##
  # Raised when the parser is unable to handle the given markup

  class ParseError < Error; end

  ##
  # Enables display of debugging information

  attr_accessor :debug

  ##
  # Token accessor

  attr_reader :tokens

  ##
  # Parses +str+ into a Document

  def self.parse str
    parser = new
    #parser.debug = true
    parser.tokenize str
    RDoc::Markup::Document.new(*parser.parse)
  end

  ##
  # Returns a token stream for +str+, for testing
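  #
  # Each token is an array of the form <tt>[type, data, column, line]</tt>.
  # For example (the input string here is only illustrative):
  #
  #   RDoc::Markup::Parser.tokenize "hello\nworld"
  #   # => [[:TEXT, "hello", 0, 0], [:NEWLINE, "\n", 5, 0],
  #   #     [:TEXT, "world", 0, 1]]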

  def self.tokenize str
    parser = new
    parser.tokenize str
    parser.tokens
  end

  ##
  # Creates a new Parser.  See also ::parse

  def initialize
    @tokens = []
    @current_token = nil
    @debug = false

    @line = 0
    @line_pos = 0
  end

  ##
  # Builds a Heading of +level+

  def build_heading level
    heading = RDoc::Markup::Heading.new level, text
    skip :NEWLINE

    heading
  end

  ##
  # Builds a List flush to +margin+

  def build_list margin
    p :list_start => margin if @debug

    list = RDoc::Markup::List.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :BULLET, :LABEL, :LALPHA, :NOTE, :NUMBER, :UALPHA then
        list_type = type

        if column < margin then
          unget
          break
        end

        if list.type and list.type != list_type then
          unget
          break
        end

        list.type = list_type

        case type
        when :NOTE, :LABEL then
          _, indent, = get # SPACE
          if :NEWLINE == peek_token.first then
            get
            peek_type, new_indent, peek_column, = peek_token
            indent = new_indent if
              peek_type == :INDENT and peek_column >= column
            unget
          end
        else
          data = nil
          _, indent, = get
        end

        list_item = build_list_item(margin + indent, data)

        list << list_item if list_item
      else
        unget
        break
      end
    end

    p :list_end => margin if @debug

    return nil if list.empty?

    list
  end

  ##
  # Builds a ListItem that is flush to +indent+ with type +item_type+

  def build_list_item indent, item_type = nil
    p :list_item_start => [indent, item_type] if @debug

    list_item = RDoc::Markup::ListItem.new item_type

    until @tokens.empty? do
      type, data, column = get

      if column < indent and
         not type == :NEWLINE and
         (type != :INDENT or data < indent) then
        unget
        break
      end

      case type
      when :INDENT then
        unget
        list_item.push(*parse(indent))
      when :TEXT then
        unget
        list_item << build_paragraph(indent)
      when :HEADER then
        list_item << build_heading(data)
      when :NEWLINE then
        list_item << RDoc::Markup::BlankLine.new
      when *LIST_TOKENS then
        unget
        list_item << build_list(column)
      else
        raise ParseError, "Unhandled token #{@current_token.inspect}"
      end
    end

    p :list_item_end => [indent, item_type] if @debug

    return nil if list_item.empty?

    list_item.parts.shift if
      RDoc::Markup::BlankLine === list_item.parts.first and
      list_item.length > 1

    list_item
  end

  ##
  # Builds a Paragraph that is flush to +margin+

  def build_paragraph margin
    p :paragraph_start => margin if @debug

    paragraph = RDoc::Markup::Paragraph.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        next if data == margin and peek_token[0] == :TEXT

        unget
        break
      when :TEXT then
        if column != margin then
          unget
          break
        end

        paragraph << data
        skip :NEWLINE
      else
        unget
        break
      end
    end

    p :paragraph_end => margin if @debug

    paragraph
  end

  ##
  # Builds a Verbatim that is flush to +margin+

  def build_verbatim margin
    p :verbatim_begin => margin if @debug
    verbatim = RDoc::Markup::Verbatim.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        if margin >= data then
          unget
          break
        end

        indent = data - margin

        verbatim << ' ' * indent
      when :HEADER then
        verbatim << '=' * data

        _, _, peek_column, = peek_token
        peek_column ||= column + data
        verbatim << ' ' * (peek_column - column - data)
      when :RULE then
        width = 2 + data
        verbatim << '-' * width

        _, _, peek_column, = peek_token
        peek_column ||= column + data + 2
        verbatim << ' ' * (peek_column - column - width)
      when :TEXT then
        verbatim << data
      when *LIST_TOKENS then
        if column <= margin then
          unget
          break
        end

        list_marker = case type
                      when :BULLET then '*'
                      when :LABEL then "[#{data}]"
                      when :LALPHA, :NUMBER, :UALPHA then "#{data}."
                      when :NOTE then "#{data}::"
                      end

        verbatim << list_marker

        _, data, = get

        verbatim << ' ' * (data - list_marker.length)
      when :NEWLINE then
        verbatim << data
        break unless [:INDENT, :NEWLINE].include? peek_token[0]
      else
        unget
        break
      end
    end

    verbatim.normalize

    p :verbatim_end => margin if @debug

    verbatim
  end

  ##
  # Pulls the next token from the stream.

  def get
    @current_token = @tokens.shift
    p :get => @current_token if @debug
    @current_token
  end

  ##
  # Parses the tokens into a Document

  def parse indent = 0
    p :parse_start => indent if @debug

    document = []

    until @tokens.empty? do
      type, data, column, = get

      if type != :INDENT and column < indent then
        unget
        break
      end

      case type
      when :HEADER then
        document << build_heading(data)
      when :INDENT then
        if indent > data then
          unget
          break
        elsif indent == data then
          next
        end

        unget
        document << build_verbatim(indent)
      when :NEWLINE then
        document << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
      when :RULE then
        document << RDoc::Markup::Rule.new(data)
        skip :NEWLINE
      when :TEXT then
        unget
        document << build_paragraph(indent)

        # we're done with this paragraph (indent mismatch)
        break if peek_token[0] == :TEXT
      when *LIST_TOKENS then
        unget

        list = build_list(indent)

        document << list if list

        # we're done with this list (indent mismatch)
        break if LIST_TOKENS.include? peek_token.first and indent > 0
      else
        type, data, column, line = @current_token
        raise ParseError,
              "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
      end
    end

    p :parse_end => indent if @debug

    document
  end

  ##
  # Returns the next token on the stream without modifying the stream

  def peek_token
    token = @tokens.first || []
    p :peek => token if @debug
    token
  end

  ##
  # Skips a token of +token_type+, optionally raising an error.

  def skip token_type, error = true
    type, data, = get

    return unless type # end of stream

    return @current_token if token_type == type

    unget

    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if
      error
  end

  ##
  # Consumes tokens until NEWLINE and turns them back into text

  def text
    text = ''

    loop do
      type, data, = get

      text << case type
              when :BULLET then
                _, space, = get # SPACE
                "*#{' ' * (space - 1)}"
              when :LABEL then
                _, space, = get # SPACE
                "[#{data}]#{' ' * (space - data.length - 2)}"
              when :LALPHA, :NUMBER, :UALPHA then
                _, space, = get # SPACE
                "#{data}.#{' ' * (space - 2)}"
              when :NOTE then
                _, space = get # SPACE
                "#{data}::#{' ' * (space - data.length - 2)}"
              when :TEXT then
                data
              when :NEWLINE then
                unget
                break
              when nil then
                break
              else
                raise ParseError, "unhandled token #{@current_token.inspect}"
              end
    end

    text
  end

  ##
  # Calculates the column and line of the current token based on +offset+.

  def token_pos offset
    [offset - @line_pos, @line]
  end

  ##
  # Turns text +input+ into a stream of tokens
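  #
  # Each token pushed onto <tt>@tokens</tt> has the form
  # <tt>[type, data, column, line]</tt>, where the trailing column and line
  # come from #token_pos (see ::tokenize for a sample stream).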

  def tokenize input
    s = StringScanner.new input

    @line = 0
    @line_pos = 0

    until s.eos? do
      pos = s.pos

      @tokens << case
                 when s.scan(/\r?\n/) then
                   token = [:NEWLINE, s.matched, *token_pos(pos)]
                   @line_pos = s.pos
                   @line += 1
                   token
                 when s.scan(/ +/) then
                   [:INDENT, s.matched_size, *token_pos(pos)]
                 when s.scan(/(=+)\s+/) then
                   level = s[1].length
                   level = 6 if level > 6
                   @tokens << [:HEADER, level, *token_pos(pos)]

                   pos = s.pos
                   s.scan(/.*/)
                   [:TEXT, s.matched, *token_pos(pos)]
                 when s.scan(/^(-{3,}) *$/) then
                   [:RULE, s[1].length - 2, *token_pos(pos)]
                 when s.scan(/([*-])\s+/) then
                   @tokens << [:BULLET, :BULLET, *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 when s.scan(/([a-z]|\d+)\.[ \t]+\S/i) then
                   list_label = s[1]
                   width = s.matched_size - 1

                   s.pos -= 1 # unget \S

                   list_type = case list_label
                               when /[a-z]/ then :LALPHA
                               when /[A-Z]/ then :UALPHA
                               when /\d/ then :NUMBER
                               else
                                 raise ParseError, "BUG token #{list_label}"
                               end

                   @tokens << [list_type, list_label, *token_pos(pos)]
                   [:SPACE, width, *token_pos(pos)]
                 when s.scan(/\[(.*?)\]( +|$)/) then
                   @tokens << [:LABEL, s[1], *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 when s.scan(/(.*?)::( +|$)/) then
                   @tokens << [:NOTE, s[1], *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 else s.scan(/.*/)
                   [:TEXT, s.matched, *token_pos(pos)]
                 end
    end

    self
  end

  ##
  # Returns the current token or +token+ to the token stream

  def unget token = @current_token
    p :unget => token if @debug
    raise Error, 'too many #ungets' if token == @tokens.first
    @tokens.unshift token if token
  end

end

require 'rdoc/markup/blank_line'
require 'rdoc/markup/document'
require 'rdoc/markup/heading'
require 'rdoc/markup/list'
require 'rdoc/markup/list_item'
require 'rdoc/markup/paragraph'
require 'rdoc/markup/rule'
require 'rdoc/markup/verbatim'