mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
302 lines
8 KiB
Text
302 lines
8 KiB
Text
# -*- ruby -*-
# vi: set ft=ruby :

# Copyright (C) 2012 President and Fellows of Harvard College
# Copyright (C) 2013-2014 Sylvester Keil
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of the copyright holder.

# Racc grammar for personal names; the generated parser class is Namae::Parser.
class Namae::Parser

# Terminal symbols; these are the token types returned by next_token.
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX

# Declare that the grammar is expected to contain no shift/reduce conflicts.
expect 0

rule

# A list of names: nothing at all (empty array), a single name, or a
# list followed by AND and one more name (appended to the list).
names : { result = [] }
      | name { result = [val[0]] }
      | names AND name { result = val[0] << val[2] }

# A single name. A lone word is taken as the given name; a lone word after
# an honorific is taken as the family name. Honorific values are merged
# into the Name built by display_order.
name : word { result = Name.new(:given => val[0]) }
     | display_order
     | honorific word { result = val[0].merge(:family => val[1]) }
     | honorific display_order { result = val[1].merge(val[0]) }
     | sort_order

# A leading appellation or title, wrapped in a Name so it can be merged.
honorific : APPELLATION { result = Name.new(:appellation => val[0]) }
          | TITLE { result = Name.new(:title => val[0]) }

# Names written given-name first, optionally with a nickname, a
# particle (von), trailing suffixes and trailing titles.
display_order : u_words word opt_suffices opt_titles
    {
      result = Name.new(:given => val[0], :family => val[1],
        :suffix => val[2], :title => val[3])
    }
  | u_words NICK last opt_suffices opt_titles
    {
      result = Name.new(:given => val[0], :nick => val[1],
        :family => val[2], :suffix => val[3], :title => val[4])
    }
  | u_words NICK von last opt_suffices opt_titles
    {
      result = Name.new(:given => val[0], :nick => val[1],
        :particle => val[2], :family => val[3],
        :suffix => val[4], :title => val[5])
    }
  | u_words von last
    {
      result = Name.new(:given => val[0], :particle => val[1],
        :family => val[2])
    }
  | von last
    {
      result = Name.new(:particle => val[0], :family => val[1])
    }

# Names written family-name first, separated from the rest by a COMMA.
# `first` yields a [suffix, given] pair; the second argument to Name.new
# is true exactly when a suffix was found.
sort_order : last COMMA first
    {
      result = Name.new({ :family => val[0], :suffix => val[2][0],
        :given => val[2][1] }, !!val[2][0])
    }
  | von last COMMA first
    {
      result = Name.new({ :particle => val[0], :family => val[1],
        :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
    }
  | u_words von last COMMA first
    {
      # The leading upper-case words and the particle are joined into
      # a single particle string.
      result = Name.new({ :particle => val[0,2].join(' '), :family => val[2],
        :suffix => val[4][0], :given => val[4][1] }, !!val[4][0])
    }
  ;

# A particle: one or more lower-case words, possibly with upper-case
# words in between; the parts are joined with single spaces.
von : LWORD
    | von LWORD { result = val.join(' ') }
    | von u_words LWORD { result = val.join(' ') }

# A family name: a single lower-case word or a run of upper-case words.
last : LWORD | u_words

# The part after the comma in sort order. Always yields a two-element
# array [suffix, given]; either element may be nil.
first : opt_words { result = [nil,val[0]] }
      | words opt_comma suffices { result = [val[2],val[0]] }
      | suffices { result = [val[0],nil] }
      | suffices COMMA words { result = [val[0],val[2]] }

# One or more u_word tokens, joined with single spaces.
u_words : u_word
        | u_words u_word { result = val.join(' ') }

# A single UWORD or PWORD token.
u_word : UWORD | PWORD

# One or more words of any kind, joined with single spaces.
words : word
      | words word { result = val.join(' ') }

# An optional COMMA.
opt_comma : /* empty */ | COMMA
# An optional run of words.
opt_words : /* empty */ | words

# Any single word token.
word : LWORD | UWORD | PWORD

# An optional run of suffixes.
opt_suffices : /* empty */ | suffices

# One or more SUFFIX tokens, joined with single spaces.
suffices : SUFFIX
         | suffices SUFFIX { result = val.join(' ') }

# An optional run of titles.
opt_titles : /* empty */ | titles

# One or more TITLE tokens, joined with single spaces.
titles : TITLE
       | titles TITLE { result = val.join(' ') }

---- header
require 'singleton'
require 'strscan'

---- inner

# The parser is a Singleton.
include Singleton

# options: the Hash of parser settings; input: the StringScanner being read.
attr_reader :options, :input

# Creates the parser with an empty scanner and the default options.
#
# Option keys set here:
#   :debug                     - enables debug output (see debug?)
#   :prefer_comma_as_separator - see seen_full_name?
#   :comma                     - text interpolated into the comma pattern
#   :stops                     - characters that terminate a word token
#   :separator                 - pattern separating one name from the next
#   :title, :suffix, :appellation - patterns for the respective tokens
def initialize
  @input = StringScanner.new('')
  @options = {
    :debug => false,
    :prefer_comma_as_separator => false,
    :comma => ',',
    :stops => ',;',
    :separator => /\s*(\band\b|\&|;)\s*/i,
    :title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
    :suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
    :appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
  }
end

# Truthy when the :debug option is set or the DEBUG environment
# variable is present.
def debug?
  options[:debug] || ENV['DEBUG']
end

# The configured :separator option.
def separator
  options[:separator]
end

# The configured :comma option.
def comma
  options[:comma]
end

# The configured :stops option.
def stops
  options[:stops]
end

# The configured :title option.
def title
  options[:title]
end

# The configured :suffix option.
def suffix
  options[:suffix]
end

# The configured :appellation option.
def appellation
  options[:appellation]
end

# The configured :prefer_comma_as_separator option.
def prefer_comma_as_separator?
  options[:prefer_comma_as_separator]
end

# Parses +input+ via parse! without raising.
#
# Returns whatever parse! returns. If parse! raises a StandardError,
# returns an empty Array instead; the error message is passed to
# +warn+ only when debug? is truthy.
def parse(input)
  parse!(input)
rescue => e
  warn e.message if debug?
  []
end

# Parses +string+, letting any parse error propagate.
#
# Loads the normalized string into the scanner, resets the tokenizer
# state and returns the result of do_parse.
def parse!(string)
  input.string = normalize(string)
  reset
  do_parse
end

# Returns a copy of +string+ with leading and trailing whitespace removed.
def normalize(string)
  string.strip
end

# Zeroes the comma/word/initial/suffix counters and sets @yydebug
# from debug?. Returns self.
def reset
  @commas, @words, @initials, @suffices, @yydebug = 0, 0, 0, 0, debug?
  self
end

# Everything defined below this marker is private.
private

# The parser's value stack: @vstack if set, otherwise @racc_vstack,
# otherwise an empty Array.
# NOTE(review): presumably the two names cover different Racc runtime
# versions - confirm.
def stack
  @vstack || @racc_vstack || []
end

# The top element of the value stack, or nil when the stack is empty.
def last_token
  stack.last
end

# Emits an AND token and zeroes the per-name counters.
#
# If a separator was already seen (seen_separator?), no second AND is
# emitted; the next token is returned instead and the counters are
# left untouched.
def consume_separator
  return next_token if seen_separator?

  @commas, @words, @initials, @suffices = 0, 0, 0, 0
  [:AND, :AND]
end

# Counts one more comma and emits a COMMA token.
def consume_comma
  @commas += 1
  [:COMMA, :COMMA]
end

# Counts a word and returns the [type, word] token pair.
#
# Every call increments @words. A :UWORD consisting of upper-case
# letters up to a word boundary (e.g. "J.") also increments @initials;
# a :SUFFIX also increments @suffices.
def consume_word(type, word)
  @words += 1

  case type
  when :UWORD
    @initials += 1 if word =~ /^[[:upper:]]+\b/
  when :SUFFIX
    @suffices += 1
  end

  [type, word]
end

# True when the value stack is non-empty and its top element is :AND.
def seen_separator?
  !stack.empty? && last_token == :AND
end

# Truthy when a suffix has already been counted or the upcoming
# input starts with one (will_see_suffix?).
def suffix?
  !@suffices.zero? || will_see_suffix?
end

# Looks ahead (without advancing the scanner) at the next 8 characters
# and tests their first whitespace-separated word against the suffix
# pattern. Returns the match index or nil.
def will_see_suffix?
  input.peek(8).to_s.strip.split(/\s+/)[0] =~ suffix
end

# Looks ahead (without advancing the scanner) at the next 6 characters
# and tests whether their first whitespace-separated word starts with
# upper-case letters followed by a word boundary (e.g. "J."). Returns
# the match index or nil.
def will_see_initial?
  input.peek(6).to_s.strip.split(/\s+/)[0] =~ /^[[:upper:]]+\b/
end

# Decides whether the words read so far already form a complete name,
# so that a following comma can be treated as a name separator.
#
# Requires the :prefer_comma_as_separator option, more than one word
# read, either an initial already counted or no initial coming up, and
# no suffix coming up.
def seen_full_name?
  prefer_comma_as_separator? && @words > 1 &&
    (@initials > 0 || !will_see_initial?) && !will_see_suffix?
end

# Scans the next token from the input.
#
# Returns nil when there is no input or it is exhausted; otherwise a
# two-element Array [type, value]. The branches are tried in order, so
# separators and commas win over whitespace, and title/suffix/appellation
# patterns win over the generic word patterns.
#
# Raises ArgumentError when nothing matches at the current position.
def next_token
  case
  when input.nil?, input.eos?
    nil
  when input.scan(separator)
    consume_separator
  when input.scan(/\s*#{comma}\s*/)
    # A comma is kept as a COMMA token when it is the first one and the
    # name is not complete yet, or when it is the second one and a suffix
    # is involved; any other comma separates two names.
    if (@commas.zero? && !seen_full_name?) || (@commas == 1 && suffix?)
      consume_comma
    else
      consume_separator
    end
  when input.scan(/\s+/)
    # Skip whitespace and try again.
    next_token
  when input.scan(title)
    consume_word(:TITLE, input.matched.strip)
  when input.scan(suffix)
    consume_word(:SUFFIX, input.matched.strip)
  when input.scan(appellation)
    # Appellations are not passed through consume_word, so they are
    # not counted as words.
    [:APPELLATION, input.matched.strip]
  when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/)
    # Optional brace groups followed by an upper-case letter.
    consume_word(:UWORD, input.matched)
  when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/)
    # Optional brace groups followed by a lower-case letter.
    consume_word(:LWORD, input.matched)
  when input.scan(/(\\\w+)?\{[^\}]*\}[^\s#{stops}]*/)
    # A brace group not followed by a letter of either case.
    consume_word(:PWORD, input.matched)
  when input.scan(/('[^'\n]+')|("[^"\n]+")/)
    # Quoted nickname; the surrounding quotes are dropped.
    consume_word(:NICK, input.matched[1...-1])
  else
    raise ArgumentError,
      "Failed to parse name #{input.string.inspect}: unmatched data at offset #{input.pos}"
  end
end

# Error hook: raises ArgumentError naming the unexpected value and
# the given stack. +tid+ is not used in the message.
def on_error(tid, value, stack)
  raise ArgumentError,
    "Failed to parse name: unexpected '#{value}' at #{stack.inspect}"
end

# -*- racc -*-