require 'strscan'
require 'rdoc/text'

##
# A recursive-descent parser for RDoc markup.
#
# The parser tokenizes an input string then parses the tokens into a Document.
# Documents can be converted into output formats by writing a visitor like
# RDoc::Markup::ToHTML.
#
# The parser only handles the block-level constructs Paragraph, List,
# ListItem, Heading, Verbatim, BlankLine and Rule.  Inline markup such as
# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
#
# To see what markup the Parser implements read RDoc.  To see how to use
# RDoc markup to format text in your program read RDoc::Markup.
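#
# A minimal usage sketch (the sample markup string is only illustrative):
#
#   document = RDoc::Markup::Parser.parse "= Heading\n\nSome paragraph text."
#   # => an RDoc::Markup::Document containing a Heading, a BlankLine and a
#   #    Paragraph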

class RDoc::Markup::Parser

  include RDoc::Text

  ##
  # List token types

  LIST_TOKENS = [
    :BULLET,
    :LABEL,
    :LALPHA,
    :NOTE,
    :NUMBER,
    :UALPHA,
  ]

  ##
  # Parser error subclass

  class Error < RuntimeError; end

  ##
  # Raised when the parser is unable to handle the given markup

  class ParseError < Error; end

  ##
  # Enables display of debugging information

  attr_accessor :debug

  ##
  # Token accessor

  attr_reader :tokens

  ##
  # Parses +str+ into a Document

  def self.parse str
    parser = new
    #parser.debug = true
    parser.tokenize str
    RDoc::Markup::Document.new(*parser.parse)
  end

  ##
  # Returns a token stream for +str+, for testing
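  #
  # Each token is an array of the form <tt>[type, data, column, line]</tt>.
  # For example (the input string here is only illustrative):
  #
  #   RDoc::Markup::Parser.tokenize "hello\nworld"
  #   # => [[:TEXT, "hello", 0, 0], [:NEWLINE, "\n", 5, 0],
  #   #     [:TEXT, "world", 0, 1]]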

  def self.tokenize str
    parser = new
    parser.tokenize str
    parser.tokens
  end

  ##
  # Creates a new Parser.  See also ::parse

  def initialize
    @tokens = []
    @current_token = nil
    @debug = false

    @line = 0
    @line_pos = 0
  end

  ##
  # Builds a Heading of +level+

  def build_heading level
    heading = RDoc::Markup::Heading.new level, text
    skip :NEWLINE

    heading
  end

  ##
  # Builds a List flush to +margin+

  def build_list margin
    p :list_start => margin if @debug

    list = RDoc::Markup::List.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :BULLET, :LABEL, :LALPHA, :NOTE, :NUMBER, :UALPHA then
        list_type = type

        if column < margin then
          unget
          break
        end

        if list.type and list.type != list_type then
          unget
          break
        end

        list.type = list_type

        case type
        when :NOTE, :LABEL then
          _, indent, = get # SPACE
          if :NEWLINE == peek_token.first then
            get
            peek_type, new_indent, peek_column, = peek_token
            indent = new_indent if
              peek_type == :INDENT and peek_column >= column
            unget
          end
        else
          data = nil
          _, indent, = get
        end

        list_item = build_list_item(margin + indent, data)

        list << list_item if list_item
      else
        unget
        break
      end
    end

    p :list_end => margin if @debug

    return nil if list.empty?

    list
  end

  ##
  # Builds a ListItem that is flush to +indent+ with type +item_type+

  def build_list_item indent, item_type = nil
    p :list_item_start => [indent, item_type] if @debug

    list_item = RDoc::Markup::ListItem.new item_type

    until @tokens.empty? do
      type, data, column = get

      if column < indent and
         not type == :NEWLINE and
         (type != :INDENT or data < indent) then
        unget
        break
      end

      case type
      when :INDENT then
        unget
        list_item.push(*parse(indent))
      when :TEXT then
        unget
        list_item << build_paragraph(indent)
      when :HEADER then
        list_item << build_heading(data)
      when :NEWLINE then
        list_item << RDoc::Markup::BlankLine.new
      when *LIST_TOKENS then
        unget
        list_item << build_list(column)
      else
        raise ParseError, "Unhandled token #{@current_token.inspect}"
      end
    end

    p :list_item_end => [indent, item_type] if @debug

    return nil if list_item.empty?

    list_item.parts.shift if
      RDoc::Markup::BlankLine === list_item.parts.first and
      list_item.length > 1

    list_item
  end

  ##
  # Builds a Paragraph that is flush to +margin+

  def build_paragraph margin
    p :paragraph_start => margin if @debug

    paragraph = RDoc::Markup::Paragraph.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        next if data == margin and peek_token[0] == :TEXT

        unget
        break
      when :TEXT then
        if column != margin then
          unget
          break
        end

        paragraph << data
        skip :NEWLINE
      else
        unget
        break
      end
    end

    p :paragraph_end => margin if @debug

    paragraph
  end

  ##
  # Builds a Verbatim that is flush to +margin+

  def build_verbatim margin
    p :verbatim_begin => margin if @debug
    verbatim = RDoc::Markup::Verbatim.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        if margin >= data then
          unget
          break
        end

        indent = data - margin

        verbatim << ' ' * indent
      when :HEADER then
        verbatim << '=' * data

        _, _, peek_column, = peek_token
        peek_column ||= column + data
        verbatim << ' ' * (peek_column - column - data)
      when :RULE then
        width = 2 + data
        verbatim << '-' * width

        _, _, peek_column, = peek_token
        peek_column ||= column + data + 2
        verbatim << ' ' * (peek_column - column - width)
      when :TEXT then
        verbatim << data
      when *LIST_TOKENS then
        if column <= margin then
          unget
          break
        end

        list_marker = case type
                      when :BULLET then '*'
                      when :LABEL then "[#{data}]"
                      when :LALPHA, :NUMBER, :UALPHA then "#{data}."
                      when :NOTE then "#{data}::"
                      end

        verbatim << list_marker

        _, data, = get

        verbatim << ' ' * (data - list_marker.length)
      when :NEWLINE then
        verbatim << data
        break unless [:INDENT, :NEWLINE].include? peek_token[0]
      else
        unget
        break
      end
    end

    verbatim.normalize

    p :verbatim_end => margin if @debug

    verbatim
  end

  ##
  # Pulls the next token from the stream.

  def get
    @current_token = @tokens.shift
    p :get => @current_token if @debug
    @current_token
  end

  ##
  # Parses the tokens into a Document

  def parse indent = 0
    p :parse_start => indent if @debug

    document = []

    until @tokens.empty? do
      type, data, column, = get

      if type != :INDENT and column < indent then
        unget
        break
      end

      case type
      when :HEADER then
        document << build_heading(data)
      when :INDENT then
        if indent > data then
          unget
          break
        elsif indent == data then
          next
        end

        unget
        document << build_verbatim(indent)
      when :NEWLINE then
        document << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
      when :RULE then
        document << RDoc::Markup::Rule.new(data)
        skip :NEWLINE
      when :TEXT then
        unget
        document << build_paragraph(indent)

        # we're done with this paragraph (indent mismatch)
        break if peek_token[0] == :TEXT
      when *LIST_TOKENS then
        unget

        list = build_list(indent)

        document << list if list

        # we're done with this list (indent mismatch)
        break if LIST_TOKENS.include? peek_token.first and indent > 0
      else
        type, data, column, line = @current_token
        raise ParseError,
              "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
      end
    end

    p :parse_end => indent if @debug

    document
  end

  ##
  # Returns the next token on the stream without modifying the stream

  def peek_token
    token = @tokens.first || []
    p :peek => token if @debug
    token
  end

  ##
  # Skips a token of +token_type+, optionally raising an error.

  def skip token_type, error = true
    type, data, = get

    return unless type # end of stream

    return @current_token if token_type == type

    unget

    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if
      error
  end

  ##
  # Consumes tokens until NEWLINE and turns them back into text

  def text
    text = ''

    loop do
      type, data, = get

      text << case type
              when :BULLET then
                _, space, = get # SPACE
                "*#{' ' * (space - 1)}"
              when :LABEL then
                _, space, = get # SPACE
                "[#{data}]#{' ' * (space - data.length - 2)}"
              when :LALPHA, :NUMBER, :UALPHA then
                _, space, = get # SPACE
                "#{data}.#{' ' * (space - 2)}"
              when :NOTE then
                _, space = get # SPACE
                "#{data}::#{' ' * (space - data.length - 2)}"
              when :TEXT then
                data
              when :NEWLINE then
                unget
                break
              when nil then
                break
              else
                raise ParseError, "unhandled token #{@current_token.inspect}"
              end
    end

    text
  end

  ##
  # Calculates the column and line of the current token based on +offset+.

  def token_pos offset
    [offset - @line_pos, @line]
  end

  ##
  # Turns text +input+ into a stream of tokens
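  #
  # Each token pushed onto <tt>@tokens</tt> has the form
  # <tt>[type, data, column, line]</tt>, where the trailing column and line
  # come from #token_pos (see ::tokenize for a sample stream).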

  def tokenize input
    s = StringScanner.new input

    @line = 0
    @line_pos = 0

    until s.eos? do
      pos = s.pos

      @tokens << case
                 when s.scan(/\r?\n/) then
                   token = [:NEWLINE, s.matched, *token_pos(pos)]
                   @line_pos = s.pos
                   @line += 1
                   token
                 when s.scan(/ +/) then
                   [:INDENT, s.matched_size, *token_pos(pos)]
                 when s.scan(/(=+)\s+/) then
                   level = s[1].length
                   level = 6 if level > 6
                   @tokens << [:HEADER, level, *token_pos(pos)]

                   pos = s.pos
                   s.scan(/.*/)
                   [:TEXT, s.matched, *token_pos(pos)]
                 when s.scan(/^(-{3,}) *$/) then
                   [:RULE, s[1].length - 2, *token_pos(pos)]
                 when s.scan(/([*-])\s+/) then
                   @tokens << [:BULLET, :BULLET, *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 when s.scan(/([a-z]|\d+)\.[ \t]+\S/i) then
                   list_label = s[1]
                   width = s.matched_size - 1

                   s.pos -= 1 # unget \S

                   list_type = case list_label
                               when /[a-z]/ then :LALPHA
                               when /[A-Z]/ then :UALPHA
                               when /\d/ then :NUMBER
                               else
                                 raise ParseError, "BUG token #{list_label}"
                               end

                   @tokens << [list_type, list_label, *token_pos(pos)]
                   [:SPACE, width, *token_pos(pos)]
                 when s.scan(/\[(.*?)\]( +|$)/) then
                   @tokens << [:LABEL, s[1], *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 when s.scan(/(.*?)::( +|$)/) then
                   @tokens << [:NOTE, s[1], *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 else s.scan(/.*/)
                   [:TEXT, s.matched, *token_pos(pos)]
                 end
    end

    self
  end

  ##
  # Returns the current token or +token+ to the token stream

  def unget token = @current_token
    p :unget => token if @debug
    raise Error, 'too many #ungets' if token == @tokens.first
    @tokens.unshift token if token
  end

end

require 'rdoc/markup/blank_line'
require 'rdoc/markup/document'
require 'rdoc/markup/heading'
require 'rdoc/markup/list'
require 'rdoc/markup/list_item'
require 'rdoc/markup/paragraph'
require 'rdoc/markup/rule'
require 'rdoc/markup/verbatim'