ruby--ruby/lib/rdoc/markup/parser.rb

require 'strscan'
require 'rdoc/text'

##
# A recursive-descent parser for RDoc markup.
#
# The parser tokenizes an input string then parses the tokens into a Document.
# Documents can be converted into output formats by writing a visitor like
# RDoc::Markup::ToHTML.
#
# The parser only handles the block-level constructs Paragraph, List,
# ListItem, Heading, Verbatim, BlankLine and Rule.  Inline markup such as
# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
#
# To see what markup the Parser implements read RDoc.  To see how to use
# RDoc markup to format text in your program read RDoc::Markup.

class RDoc::Markup::Parser

  include RDoc::Text

  ##
  # List token types

  LIST_TOKENS = [
    :BULLET,
    :LABEL,
    :LALPHA,
    :NOTE,
    :NUMBER,
    :UALPHA,
  ]

  ##
  # Parser error subclass

  class Error < RuntimeError; end

  ##
  # Raised when the parser is unable to handle the given markup

  class ParseError < Error; end

  ##
  # Enables display of debugging information

  attr_accessor :debug

  ##
  # Token accessor

  attr_reader :tokens

  ##
  # Parses +str+ into a Document

  def self.parse str
    parser = new
    parser.tokenize str
    doc = RDoc::Markup::Document.new
    parser.parse doc
  end

  ##
  # Returns a token stream for +str+, for testing

  def self.tokenize str
    parser = new
    parser.tokenize str
    parser.tokens
  end

  ##
  # Creates a new Parser.  See also ::parse

  def initialize
    @tokens = []
    @current_token = nil
    @debug = false

    @line = 0
    @line_pos = 0
  end

  ##
  # Builds a Heading of +level+

  def build_heading level
    type, text, = get

    text = case type
           when :TEXT then
             skip :NEWLINE
             text
           else
             unget
             ''
           end

    RDoc::Markup::Heading.new level, text
  end

  ##
  # Builds a List flush to +margin+

  def build_list margin
    p :list_start => margin if @debug

    list = RDoc::Markup::List.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :BULLET, :LABEL, :LALPHA, :NOTE, :NUMBER, :UALPHA then

        if column < margin || (list.type && list.type != type) then
          unget
          break
        end

        list.type = type
        peek_type, _, column, = peek_token

        case type
        when :NOTE, :LABEL then
          if peek_type == :NEWLINE then
            # description not on the same line as LABEL/NOTE
            # skip the trailing newline & any blank lines below
            while peek_type == :NEWLINE
              get
              peek_type, _, column, = peek_token
            end

            # we may be:
            #   - at end of stream
            #   - at a column < margin:
            #         [text]
            #       blah blah blah
            #   - at the same column, but with a different type of list item
            #       [text]
            #       * blah blah
            #   - at the same column, with the same type of list item
            #       [one]
            #       [two]
            # In all cases, we have an empty description.
            # In the last case only, we continue.
            if peek_type.nil? || column < margin then
              empty = 1
            elsif column == margin then
              case peek_type
              when type
                empty = 2 # continue
              when *LIST_TOKENS
                empty = 1
              else
                empty = 0
              end
            else
              empty = 0
            end

            if empty > 0 then
              item = RDoc::Markup::ListItem.new(data)
              item << RDoc::Markup::BlankLine.new
              list << item
              break if empty == 1
              next
            end
          end
        else
          data = nil
        end

        list_item = RDoc::Markup::ListItem.new data
        parse list_item, column
        list << list_item

      else
        unget
        break
      end
    end

    p :list_end => margin if @debug

    return nil if list.empty?

    list
  end

  ##
  # Builds a Paragraph that is flush to +margin+

  def build_paragraph margin
    p :paragraph_start => margin if @debug

    paragraph = RDoc::Markup::Paragraph.new

    until @tokens.empty? do
      type, data, column, = get

      if type == :TEXT && column == margin then
        paragraph << data
        skip :NEWLINE
      else
        unget
        break
      end
    end

    p :paragraph_end => margin if @debug

    paragraph
  end

  ##
  # Builds a Verbatim that is indented from +margin+.
  #
  # The verbatim block is shifted left (the least indented lines start in
  # column 0).  Each part of the verbatim is one line of text, always
  # terminated by a newline.  Blank lines always consist of a single newline
  # character, and there is never a single newline at the end of the verbatim.

  def build_verbatim margin
    p :verbatim_begin => margin if @debug
    verbatim = RDoc::Markup::Verbatim.new

    min_indent = nil
    generate_leading_spaces = true
    line = ''

    until @tokens.empty? do
      type, data, column, = get

      if type == :NEWLINE then
        line << data
        verbatim << line
        line = ''
        generate_leading_spaces = true
        next
      end

      if column <= margin
        unget
        break
      end

      if generate_leading_spaces then
        indent = column - margin
        line << ' ' * indent
        min_indent = indent if min_indent.nil? || indent < min_indent
        generate_leading_spaces = false
      end

      case type
      when :HEADER then
        line << '=' * data
        _, _, peek_column, = peek_token
        peek_column ||= column + data
        indent = peek_column - column - data
        line << ' ' * indent
      when :RULE then
        width = 2 + data
        line << '-' * width
        _, _, peek_column, = peek_token
        peek_column ||= column + width
        indent = peek_column - column - width
        line << ' ' * indent
      when :TEXT then
        line << data
      else # *LIST_TOKENS
        list_marker = case type
                      when :BULLET then data
                      when :LABEL  then "[#{data}]"
                      when :NOTE   then "#{data}::"
                      else # :LALPHA, :NUMBER, :UALPHA
                        "#{data}."
                      end
        line << list_marker
        peek_type, _, peek_column = peek_token
        unless peek_type == :NEWLINE then
          peek_column ||= column + list_marker.length
          indent = peek_column - column - list_marker.length
          line << ' ' * indent
        end
      end

    end

    verbatim << line << "\n" unless line.empty?
    verbatim.parts.each { |p| p.slice!(0, min_indent) unless p == "\n" } if min_indent > 0
    verbatim.normalize

    p :verbatim_end => margin if @debug

    verbatim
  end

  ##
  # Pulls the next token from the stream.

  def get
    @current_token = @tokens.shift
    p :get => @current_token if @debug
    @current_token
  end

  ##
  # Parses the tokens into an array of RDoc::Markup::XXX objects,
  # and appends them to the passed +parent+ RDoc::Markup::YYY object.
  #
  # Exits at the end of the token stream, or when it encounters a token
  # in a column less than +indent+ (unless it is a NEWLINE).
  #
  # Returns +parent+.

  def parse parent, indent = 0
    p :parse_start => indent if @debug

    until @tokens.empty? do
      type, data, column, = get

      if type == :NEWLINE then
        # trailing newlines are skipped below, so this is a blank line
        parent << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
        next
      end

      # indentation change: break or verbatim
      if column < indent then
        unget
        break
      elsif column > indent then
        unget
        parent << build_verbatim(indent)
        next
      end

      # indentation is the same
      case type
      when :HEADER then
        parent << build_heading(data)
      when :RULE then
        parent << RDoc::Markup::Rule.new(data)
        skip :NEWLINE
      when :TEXT then
        unget
        parent << build_paragraph(indent)
      when *LIST_TOKENS then
        unget
        parent << build_list(indent)
      else
        type, data, column, line = @current_token
        raise ParseError, "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
      end
    end

    p :parse_end => indent if @debug

    parent

  end

  ##
  # Returns the next token on the stream without modifying the stream

  def peek_token
    token = @tokens.first || []
    p :peek => token if @debug
    token
  end

  ##
  # Skips the next token if its type is +token_type+.
  #
  # Optionally raises an error if the next token is not of the expected type.

  def skip token_type, error = true
    type, = get
    return unless type # end of stream
    return @current_token if token_type == type
    unget
    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if error
  end

  ##
  # Turns text +input+ into a stream of tokens

  def tokenize input
    s = StringScanner.new input

    @line = 0
    @line_pos = 0

    until s.eos? do
      pos = s.pos

      # leading spaces will be reflected by the column of the next token
      # the only thing we loose are trailing spaces at the end of the file
      next if s.scan(/ +/)

      # note: after BULLET, LABEL, etc.,
      # indent will be the column of the next non-newline token

      @tokens << case
                 # [CR]LF => :NEWLINE
                 when s.scan(/\r?\n/) then
                   token = [:NEWLINE, s.matched, *token_pos(pos)]
                   @line_pos = s.pos
                   @line += 1
                   token
                 # === text => :HEADER then :TEXT
                 when s.scan(/(=+)(\s*)/) then
                   level = s[1].length
                   header = [:HEADER, level, *token_pos(pos)]

                   if s[2] =~ /^\r?\n/ then
                     s.pos -= s[2].length
                     header
                   else
                     pos = s.pos
                     s.scan(/.*/)
                     @tokens << header
                     [:TEXT, s.matched.sub(/\r$/, ''), *token_pos(pos)]
                   end
                 # --- (at least 3) and nothing else on the line => :RULE
                 when s.scan(/(-{3,}) *$/) then
                   [:RULE, s[1].length - 2, *token_pos(pos)]
                 # * or - followed by white space and text => :BULLET
                 when s.scan(/([*-]) +(\S)/) then
                   s.pos -= s[2].bytesize # unget \S
                   [:BULLET, s[1], *token_pos(pos)]
                 # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
                 when s.scan(/([a-z]|\d+)\. +(\S)/i) then
                   # FIXME if tab(s), the column will be wrong
                   # either support tabs everywhere by first expanding them to
                   # spaces, or assume that they will have been replaced
                   # before (and provide a check for that at least in debug
                   # mode)
                   list_label = s[1]
                   s.pos -= s[2].bytesize # unget \S
                   list_type =
                     case list_label
                     when /[a-z]/ then :LALPHA
                     when /[A-Z]/ then :UALPHA
                     when /\d/    then :NUMBER
                     else
                       raise ParseError, "BUG token #{list_label}"
                     end
                   [list_type, list_label, *token_pos(pos)]
                 # [text] followed by spaces or end of line => :LABEL
                 when s.scan(/\[(.*?)\]( +|$)/) then
                   [:LABEL, s[1], *token_pos(pos)]
                 # text:: followed by spaces or end of line => :NOTE
                 when s.scan(/(.*?)::( +|$)/) then
                   [:NOTE, s[1], *token_pos(pos)]
                 # anything else: :TEXT
                 else s.scan(/.*/)
                   [:TEXT, s.matched.sub(/\r$/, ''), *token_pos(pos)]
                 end
    end

    self
  end

  ##
  # Calculates the column and line of the current token based on +offset+.

  def token_pos offset
    [offset - @line_pos, @line]
  end

  ##
  # Returns the current token to the token stream

  def unget
    token = @current_token
    p :unget => token if @debug
    raise Error, 'too many #ungets' if token == @tokens.first
    @tokens.unshift token if token
  end

end

require 'rdoc/markup/blank_line'
require 'rdoc/markup/document'
require 'rdoc/markup/heading'
require 'rdoc/markup/list'
require 'rdoc/markup/list_item'
require 'rdoc/markup/raw'
require 'rdoc/markup/paragraph'
require 'rdoc/markup/indented_paragraph'
require 'rdoc/markup/rule'
require 'rdoc/markup/verbatim'