ruby--ruby/test/racc/assets/namae.y

# -*- ruby -*-
# vi: set ft=ruby :

# Copyright (C) 2012 President and Fellows of Harvard College
# Copyright (C) 2013-2014 Sylvester Keil
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#  1. Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#
#  2. Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of the copyright holder.

# Racc grammar for Namae::Parser. The semantic actions below build Name
# objects; the Name class itself is defined outside this file.
class Namae::Parser

# Terminal symbols; they are produced by #next_token in the inner section.
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX

# Declare that the grammar is expected to have no shift/reduce conflicts.
expect 0

rule

  # Start symbol: a list of names. Empty input yields an empty list; every
  # further name following an AND token is appended to the list built so far.
  names :                { result = [] }
        | name           { result = [val[0]] }
        | names AND name { result = val[0] << val[2] }

  # A single name. A lone word is stored as the given name, whereas a lone
  # word following an honorific is stored as the family name. An honorific in
  # front of a display-order name is merged into that name. Alternatives
  # without an action pass their first value through unchanged.
  name : word            { result = Name.new(:given => val[0]) }
       | display_order
       | honorific word          { result = val[0].merge(:family => val[1]) }
       | honorific display_order { result = val[1].merge(val[0]) }
       | sort_order

  # A leading APPELLATION or TITLE token, wrapped in a Name that carries only
  # the corresponding :appellation or :title field.
  honorific : APPELLATION { result = Name.new(:appellation => val[0]) }
            | TITLE       { result = Name.new(:title => val[0]) }

  # A name written in display order (given part first, family part last).
  # Depending on the alternative, a nickname (NICK) and/or a particle (von)
  # may appear between the given and the family part; the last alternative
  # has no given part at all. Only the first three alternatives accept
  # trailing suffixes and titles.
  display_order : u_words word opt_suffices opt_titles
       {
         result = Name.new(:given => val[0], :family => val[1],
           :suffix => val[2], :title => val[3])
       }
       | u_words NICK last opt_suffices opt_titles
       {
         result = Name.new(:given => val[0], :nick => val[1],
           :family => val[2], :suffix => val[3], :title => val[4])
       }
       | u_words NICK von last opt_suffices opt_titles
       {
         result = Name.new(:given => val[0], :nick => val[1],
           :particle => val[2], :family => val[3],
           :suffix => val[4], :title => val[5])
       }
       | u_words von last
       {
         result = Name.new(:given => val[0], :particle => val[1],
          :family => val[2])
       }
       | von last
       {
         result = Name.new(:particle => val[0], :family => val[1])
       }

  # A name written in sort order: the family part (optionally preceded by a
  # particle) comes first, followed by a COMMA and the `first` part. `first`
  # yields a [suffix, given] pair, hence the val[n][0] / val[n][1] lookups.
  # In the third alternative the leading u_words are joined with the von part
  # and stored together as the particle.
  # NOTE(review): the second argument to Name.new is true exactly when a
  # suffix was found; its meaning is defined by Name, which is not visible
  # in this file.
  sort_order : last COMMA first
       {
         result = Name.new({ :family => val[0], :suffix => val[2][0],
           :given => val[2][1] }, !!val[2][0])
       }
       | von last COMMA first
       {
         result = Name.new({ :particle => val[0], :family => val[1],
           :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
       }
       | u_words von last COMMA first
       {
         result = Name.new({ :particle => val[0,2].join(' '), :family => val[2],
           :suffix => val[4][0], :given => val[4][1] }, !!val[4][0])
       }
       ;

  # A particle: starts with an LWORD and may be extended by further LWORDs,
  # optionally with u_words in between, as long as it ends in an LWORD. The
  # parts are joined with single spaces.
  von : LWORD
      | von LWORD         { result = val.join(' ') }
      | von u_words LWORD { result = val.join(' ') }

  # The family part of a name: either a single LWORD or a run of u_words.
  last : LWORD | u_words

  # The part following the comma in a sort-order name. Every alternative
  # yields a two-element array [suffix, given]; either element may be nil.
  # Suffixes may come before or after the given words.
  first : opt_words                 { result = [nil,val[0]] }
        | words opt_comma suffices  { result = [val[2],val[0]] }
        | suffices                  { result = [val[0],nil] }
        | suffices COMMA words      { result = [val[0],val[2]] }

  # One or more u_word tokens, joined with single spaces.
  u_words : u_word
          | u_words u_word { result = val.join(' ') }

  # A word token that is not an LWORD: either a UWORD or a PWORD.
  u_word : UWORD | PWORD

  # One or more words of any kind, joined with single spaces.
  words : word
        | words word { result = val.join(' ') }

  # Optional variants: each may match nothing at all.
  opt_comma : /* empty */ | COMMA
  opt_words : /* empty */ | words

  # Any of the three word token types.
  word : LWORD | UWORD | PWORD

  # Zero or more SUFFIX tokens.
  opt_suffices : /* empty */ | suffices

  # One or more SUFFIX tokens, joined with single spaces.
  suffices : SUFFIX
           | suffices SUFFIX { result = val.join(' ') }

  # Zero or more TITLE tokens.
  opt_titles : /* empty */ | titles

  # One or more TITLE tokens, joined with single spaces.
  titles : TITLE
         | titles TITLE { result = val.join(' ') }

---- header
require 'singleton'
require 'strscan'

---- inner

  # The parser is a singleton: a single shared instance holds the scanner
  # and the option table.
  include Singleton

  # options - Hash of tokenizer settings (see #initialize for the defaults).
  # input   - the StringScanner that #next_token reads from.
  attr_reader :options, :input

  # Creates the parser state: an empty scanner plus the default options.
  #
  # Default options:
  #   :debug                     - false; see #debug?
  #   :prefer_comma_as_separator - false; see #seen_full_name?
  #   :comma                     - text interpolated into the comma pattern
  #   :stops                     - characters that terminate a word
  #   :separator                 - pattern separating two names
  #   :title, :suffix, :appellation - patterns for the respective tokens
  def initialize
    @input = StringScanner.new('')
    @options = {
      :debug => false,
      :prefer_comma_as_separator => false,
      :comma => ',',
      :stops => ',;',
      :separator => /\s*(\band\b|\&|;)\s*/i,
      :title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
      :suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
      :appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
    }
  end

  # Tells whether debug output is wanted: either the :debug option is set or
  # the DEBUG environment variable is present. Returns the first truthy of
  # the two values (so possibly the environment string), otherwise nil.
  def debug?
    requested = options[:debug]
    requested || ENV['DEBUG']
  end

  # Returns the :separator option. #next_token scans for it to detect the
  # boundary between two names.
  def separator
    options[:separator]
  end

  # Returns the :comma option. Its value is interpolated into the pattern
  # that #next_token uses to recognise a comma.
  def comma
    options[:comma]
  end

  # Returns the :stops option. Its value is interpolated into the negated
  # character classes of the word patterns in #next_token, i.e. these
  # characters end a word.
  def stops
    options[:stops]
  end

  # Returns the :title option, the pattern #next_token scans for to emit a
  # TITLE token.
  def title
    options[:title]
  end

  # Returns the :suffix option, the pattern used by #next_token to emit a
  # SUFFIX token and by #will_see_suffix? to look ahead for one.
  def suffix
    options[:suffix]
  end

  # Returns the :appellation option, the pattern #next_token scans for to
  # emit an APPELLATION token.
  def appellation
    options[:appellation]
  end

  # Returns the :prefer_comma_as_separator option. When it is falsy,
  # #seen_full_name? never reports a complete name.
  def prefer_comma_as_separator?
    options[:prefer_comma_as_separator]
  end

  # Lenient front end to #parse!: parses +input+ and returns the result.
  # Any StandardError raised while parsing is swallowed and an empty array
  # is returned instead; the error message is printed via warn only when
  # #debug? is truthy.
  def parse(input)
    begin
      parse!(input)
    rescue StandardError => error
      warn(error.message) if debug?
      []
    end
  end

  # Strict parse: loads the normalized +string+ into the scanner, resets the
  # tokenizer counters and runs the parser (do_parse). Returns whatever
  # do_parse returns; errors raised along the way are not rescued here.
  def parse!(string)
    cleaned = normalize(string)
    input.string = cleaned
    reset
    do_parse
  end

  # Returns a copy of +string+ with leading and trailing whitespace removed;
  # the argument itself is left untouched.
  def normalize(string)
    string.strip
  end

  # Clears the per-name tokenizer counters, refreshes @yydebug from #debug?
  # and returns the parser itself.
  def reset
    @commas = @words = @initials = @suffices = 0
    @yydebug = debug?
    self
  end

  private

  # Returns the parser's value stack: @vstack if set, else @racc_vstack,
  # else a fresh empty array.
  def stack
    return @vstack if @vstack
    return @racc_vstack if @racc_vstack

    []
  end

  # Returns the topmost entry of the value stack, or nil if it is empty.
  def last_token
    stack.last
  end

  # Handles a separator between names. If the previous token already was a
  # separator, this one is skipped and the following token is returned.
  # Otherwise the per-name counters are cleared and an AND token is emitted.
  def consume_separator
    if seen_separator?
      next_token
    else
      @commas = @words = @initials = @suffices = 0
      [:AND, :AND]
    end
  end

  # Counts one more comma for the current name and returns the COMMA token
  # pair.
  def consume_comma
    @commas += 1
    [:COMMA, :COMMA]
  end

  # Registers a word-like token and returns it as a [type, word] pair.
  #
  # Every call increments the word counter. A :UWORD that starts with a run
  # of uppercase letters ending at a word boundary (e.g. "J." or "JR") also
  # counts as an initial; a :SUFFIX also increments the suffix counter.
  def consume_word(type, word)
    @words += 1

    if type == :UWORD
      @initials += 1 if word =~ /^[[:upper:]]+\b/
    elsif type == :SUFFIX
      @suffices += 1
    end

    [type, word]
  end

  # True when the value stack is non-empty and its top entry is :AND, i.e.
  # the previous token was a separator.
  def seen_separator?
    return false if stack.empty?

    last_token == :AND
  end

  # Truthy when a suffix has already been consumed for the current name or
  # when one is coming up next in the input (see #will_see_suffix?).
  def suffix?
    return true unless @suffices.zero?

    will_see_suffix?
  end

  # Looks ahead (without consuming input) at the next 8 characters and
  # matches the first whitespace-delimited chunk against the suffix pattern.
  # Returns the match offset, or nil when there is no match.
  def will_see_suffix?
    lookahead = input.peek(8).to_s.strip
    upcoming = lookahead.split(/\s+/).first
    upcoming =~ suffix
  end

  # Looks ahead (without consuming input) at the next 6 characters and
  # checks whether the first whitespace-delimited chunk starts with a run of
  # uppercase letters ending at a word boundary. Returns the match offset,
  # or nil when there is no match.
  def will_see_initial?
    lookahead = input.peek(6).to_s.strip
    upcoming = lookahead.split(/\s+/).first
    upcoming =~ /^[[:upper:]]+\b/
  end

  # Decides whether the tokens read so far already form a complete name, so
  # that a following comma can act as a separator. This requires all of:
  #   * the :prefer_comma_as_separator option is truthy,
  #   * more than one word has been consumed,
  #   * an initial has been seen, or none is coming up next,
  #   * no suffix is coming up next.
  # When the option is falsy, its value is returned as is.
  def seen_full_name?
    preferred = prefer_comma_as_separator?
    return preferred unless preferred
    return false unless @words > 1
    return false unless @initials > 0 || !will_see_initial?

    !will_see_suffix?
  end

  # Tokenizer: returns the next [type, value] pair read from the scanner, or
  # nil once there is no scanner or the input is exhausted. The branches are
  # tried strictly in order, so earlier patterns take precedence over later
  # ones.
  #
  # Raises ArgumentError when nothing matches at the current position.
  def next_token
    case
    when input.nil?, input.eos?
      nil
    when input.scan(separator)
      consume_separator
    when input.scan(/\s*#{comma}\s*/)
      # A comma yields a COMMA token only if it is the first comma of a name
      # that is not yet considered complete, or the second comma when a
      # suffix has been seen or is coming up. Any other comma is treated as
      # a separator between names.
      if @commas.zero? && !seen_full_name? || @commas == 1 && suffix?
        consume_comma
      else
        consume_separator
      end
    when input.scan(/\s+/)
      # Skip plain whitespace and continue with the following token.
      next_token
    when input.scan(title)
      consume_word(:TITLE, input.matched.strip)
    when input.scan(suffix)
      consume_word(:SUFFIX, input.matched.strip)
    when input.scan(appellation)
      # Unlike titles and suffixes, appellations bypass consume_word and
      # therefore do not increment the word counter.
      [:APPELLATION, input.matched.strip]
    # UWORD / LWORD: optional brace groups (each optionally preceded by a
    # backslash command), then an uppercase / lowercase letter, then
    # everything up to the next whitespace or stop character.
    when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/)
      consume_word(:UWORD, input.matched)
    when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/)
      consume_word(:LWORD, input.matched)
    # PWORD: a word starting with a single brace group that matched neither
    # of the two patterns above.
    when input.scan(/(\\\w+)?\{[^\}]*\}[^\s#{stops}]*/)
      consume_word(:PWORD, input.matched)
    # NICK: text in single or double quotes; the quotes are dropped.
    when input.scan(/('[^'\n]+')|("[^"\n]+")/)
      consume_word(:NICK, input.matched[1...-1])
    else
      raise ArgumentError,
        "Failed to parse name #{input.string.inspect}: unmatched data at offset #{input.pos}"
    end
  end

  # Error hook invoked by the generated parser on an unexpected token.
  # Always raises ArgumentError, naming the offending +value+ and the
  # contents of +stack+; +tid+ is not used.
  def on_error(tid, value, stack)
    message = "Failed to parse name: unexpected '#{value}' at #{stack.inspect}"
    raise ArgumentError, message
  end

# -*- racc -*-
Backport racc-1.4.15 from upstream. 2019-05-13 08:25:22 -04:00			`# -- ruby --`
			`# vi: set ft=ruby :`

			`# Copyright (C) 2012 President and Fellows of Harvard College`
			`# Copyright (C) 2013-2014 Sylvester Keil`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# 1. Redistributions of source code must retain the above copyright notice,`
			`# this list of conditions and the following disclaimer.`
			`#`
			`# 2. Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions and the following disclaimer in the documentation`
			`# and/or other materials provided with the distribution.`
			`#`
			# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
			`# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF`
			`# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO`
			`# EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,`
			`# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,`
			`# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,`
			`# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING`
			`# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,`
			`# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`#`
			`# The views and conclusions contained in the software and documentation are`
			`# those of the authors and should not be interpreted as representing official`
			`# policies, either expressed or implied, of the copyright holder.`

			`class Namae::Parser`

			`token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX`

			`expect 0`

			`rule`

			`names : { result = [] }`
			`\| name { result = [val[0]] }`
			`\| names AND name { result = val[0] << val[2] }`

			`name : word { result = Name.new(:given => val[0]) }`
			`\| display_order`
			`\| honorific word { result = val[0].merge(:family => val[1]) }`
			`\| honorific display_order { result = val[1].merge(val[0]) }`
			`\| sort_order`

			`honorific : APPELLATION { result = Name.new(:appellation => val[0]) }`
			`\| TITLE { result = Name.new(:title => val[0]) }`

			`display_order : u_words word opt_suffices opt_titles`
			`{`
			`result = Name.new(:given => val[0], :family => val[1],`
			`:suffix => val[2], :title => val[3])`
			`}`
			`\| u_words NICK last opt_suffices opt_titles`
			`{`
			`result = Name.new(:given => val[0], :nick => val[1],`
			`:family => val[2], :suffix => val[3], :title => val[4])`
			`}`
			`\| u_words NICK von last opt_suffices opt_titles`
			`{`
			`result = Name.new(:given => val[0], :nick => val[1],`
			`:particle => val[2], :family => val[3],`
			`:suffix => val[4], :title => val[5])`
			`}`
			`\| u_words von last`
			`{`
			`result = Name.new(:given => val[0], :particle => val[1],`
			`:family => val[2])`
			`}`
			`\| von last`
			`{`
			`result = Name.new(:particle => val[0], :family => val[1])`
			`}`

			`sort_order : last COMMA first`
			`{`
			`result = Name.new({ :family => val[0], :suffix => val[2][0],`
			`:given => val[2][1] }, !!val[2][0])`
			`}`
			`\| von last COMMA first`
			`{`
			`result = Name.new({ :particle => val[0], :family => val[1],`
			`:suffix => val[3][0], :given => val[3][1] }, !!val[3][0])`
			`}`
			`\| u_words von last COMMA first`
			`{`
			`result = Name.new({ :particle => val[0,2].join(' '), :family => val[2],`
			`:suffix => val[4][0], :given => val[4][1] }, !!val[4][0])`
			`}`
			`;`

			`von : LWORD`
			`\| von LWORD { result = val.join(' ') }`
			`\| von u_words LWORD { result = val.join(' ') }`

			`last : LWORD \| u_words`

			`first : opt_words { result = [nil,val[0]] }`
			`\| words opt_comma suffices { result = [val[2],val[0]] }`
			`\| suffices { result = [val[0],nil] }`
			`\| suffices COMMA words { result = [val[0],val[2]] }`

			`u_words : u_word`
			`\| u_words u_word { result = val.join(' ') }`

			`u_word : UWORD \| PWORD`

			`words : word`
			`\| words word { result = val.join(' ') }`

			`opt_comma : /* empty */ \| COMMA`
			`opt_words : /* empty */ \| words`

			`word : LWORD \| UWORD \| PWORD`

			`opt_suffices : /* empty */ \| suffices`

			`suffices : SUFFIX`
			`\| suffices SUFFIX { result = val.join(' ') }`

			`opt_titles : /* empty */ \| titles`

			`titles : TITLE`
			`\| titles TITLE { result = val.join(' ') }`

			`---- header`
			`require 'singleton'`
			`require 'strscan'`

			`---- inner`

			`include Singleton`

			`attr_reader :options, :input`

			`def initialize`
			`@input, @options = StringScanner.new(''), {`
			`:debug => false,`
			`:prefer_comma_as_separator => false,`
			`:comma => ',',`
			`:stops => ',;',`
			`:separator => /\s(\band\b\|\&\|;)\s/i,`
			`:title => /\s*\b(sir\|lord\|count(ess)?\|(gen\|adm\|col\|maj\|capt\|cmdr\|lt\|sgt\|cpl\|pvt\|prof\|dr\|md\|ph\.?d)\.?)(\s+\|$)/i,`
			`:suffix => /\s*\b(JR\|Jr\|jr\|SR\|Sr\|sr\|[IVX]{2,})(\.\|\b)/,`
			`:appellation => /\s*\b((mrs?\|ms\|fr\|hr)\.?\|miss\|herr\|frau)(\s+\|$)/i`
			`}`
			`end`

			`def debug?`
			`options[:debug] \|\| ENV['DEBUG']`
			`end`

			`def separator`
			`options[:separator]`
			`end`

			`def comma`
			`options[:comma]`
			`end`

			`def stops`
			`options[:stops]`
			`end`

			`def title`
			`options[:title]`
			`end`

			`def suffix`
			`options[:suffix]`
			`end`

			`def appellation`
			`options[:appellation]`
			`end`

			`def prefer_comma_as_separator?`
			`options[:prefer_comma_as_separator]`
			`end`

			`def parse(input)`
			`parse!(input)`
			`rescue => e`
			`warn e.message if debug?`
			`[]`
			`end`

			`def parse!(string)`
			`input.string = normalize(string)`
			`reset`
			`do_parse`
			`end`

			`def normalize(string)`
			`string = string.strip`
			`string`
			`end`

			`def reset`
			`@commas, @words, @initials, @suffices, @yydebug = 0, 0, 0, 0, debug?`
			`self`
			`end`

			`private`

			`def stack`
			`@vstack \|\| @racc_vstack \|\| []`
			`end`

			`def last_token`
			`stack[-1]`
			`end`

			`def consume_separator`
			`return next_token if seen_separator?`
			`@commas, @words, @initials, @suffices = 0, 0, 0, 0`
			`[:AND, :AND]`
			`end`

			`def consume_comma`
			`@commas += 1`
			`[:COMMA, :COMMA]`
			`end`

			`def consume_word(type, word)`
			`@words += 1`

			`case type`
			`when :UWORD`
			`@initials += 1 if word =~ /^[[:upper:]]+\b/`
			`when :SUFFIX`
			`@suffices += 1`
			`end`

			`[type, word]`
			`end`

			`def seen_separator?`
			`!stack.empty? && last_token == :AND`
			`end`

			`def suffix?`
			`!@suffices.zero? \|\| will_see_suffix?`
			`end`

			`def will_see_suffix?`
			`input.peek(8).to_s.strip.split(/\s+/)[0] =~ suffix`
			`end`

			`def will_see_initial?`
			`input.peek(6).to_s.strip.split(/\s+/)[0] =~ /^[[:upper:]]+\b/`
			`end`

			`def seen_full_name?`
			`prefer_comma_as_separator? && @words > 1 &&`
			`(@initials > 0 \|\| !will_see_initial?) && !will_see_suffix?`
			`end`

			`def next_token`
			`case`
			`when input.nil?, input.eos?`
			`nil`
			`when input.scan(separator)`
			`consume_separator`
			`when input.scan(/\s#{comma}\s/)`
			`if @commas.zero? && !seen_full_name? \|\| @commas == 1 && suffix?`
			`consume_comma`
			`else`
			`consume_separator`
			`end`
			`when input.scan(/\s+/)`
			`next_token`
			`when input.scan(title)`
			`consume_word(:TITLE, input.matched.strip)`
			`when input.scan(suffix)`
			`consume_word(:SUFFIX, input.matched.strip)`
			`when input.scan(appellation)`
			`[:APPELLATION, input.matched.strip]`
			`when input.scan(/((\\\w+)?\{[^\}]\})[[:upper:]][^\s#{stops}]*/)`
			`consume_word(:UWORD, input.matched)`
			`when input.scan(/((\\\w+)?\{[^\}]\})[[:lower:]][^\s#{stops}]*/)`
			`consume_word(:LWORD, input.matched)`
			`when input.scan(/(\\\w+)?\{[^\}]\}[^\s#{stops}]/)`
			`consume_word(:PWORD, input.matched)`
			`when input.scan(/('[^'\n]+')\|("[^"\n]+")/)`
			`consume_word(:NICK, input.matched[1...-1])`
			`else`
			`raise ArgumentError,`
			`"Failed to parse name #{input.string.inspect}: unmatched data at offset #{input.pos}"`
			`end`
			`end`

			`def on_error(tid, value, stack)`
			`raise ArgumentError,`
			`"Failed to parse name: unexpected '#{value}' at #{stack.inspect}"`
			`end`

			`# -- racc --`