1
0
Fork 0
mirror of https://github.com/rails/rails.git synced 2022-11-09 12:12:34 -05:00

Improve reliability of Inflector.transliterate. [#4374 state:resolved]

Signed-off-by: Jeremy Kemper <jeremy@bitsweat.net>
This commit is contained in:
Norman Clarke 2010-04-12 12:44:25 -03:00 committed by Jeremy Kemper
parent 36f3634a6a
commit dceef0828a
4 changed files with 94 additions and 26 deletions

View file

@ -1,5 +1,7 @@
*Rails 3.0.0 [beta 3] (pending)*
* Improve transliteration quality. #4374 [Norman Clarke]
* Speed up and add Ruby 1.9 support for ActiveSupport::Multibyte::Chars#tidy_bytes. #4350 [Norman Clarke]

View file

@ -1,32 +1,47 @@
# encoding: utf-8
require 'iconv'
require 'kconv'
require 'active_support/core_ext/string/multibyte'
module ActiveSupport
module Inflector
extend self
# Replaces accented characters with their ascii equivalents.
# Unicode code point => ASCII approximation, given as code point(s).
# A value is a single Integer for a one-character replacement, or an Array of
# Integers when the approximation spans several characters (e.g. Æ => "AE").
ASCII_APPROXIMATIONS = {
198 => [65, 69], # Æ => AE
208 => 68, # Ð => D
216 => 79, # Ø => O
222 => [84, 104], # Þ => Th
223 => [115, 115], # ß => ss
230 => [97, 101], # æ => ae
240 => 100, # ð => d
248 => 111, # ø => o
254 => [116, 104], # þ => th
272 => 68, # Đ => D
273 => 100, # đ => d
294 => 72, # Ħ => H
295 => 104, # ħ => h
305 => 105, # ı => i
306 => [73, 74], # Ĳ => IJ
307 => [105, 106], # ĳ => ij
312 => 107, # ĸ => k
319 => 76, # Ŀ => L
320 => 108, # ŀ => l
321 => 76, # Ł => L
322 => 108, # ł => l
329 => 110, # ŉ => n
330 => [78, 71], # Ŋ => NG
331 => [110, 103], # ŋ => ng
338 => [79, 69], # Œ => OE
339 => [111, 101], # œ => oe
358 => 84, # Ŧ => T
359 => 116 # ŧ => t
}
# Replaces accented characters with an ASCII approximation, or deletes them if none exists.
def transliterate(string)
Iconv.iconv('ascii//ignore//translit', 'utf-8', string).to_s
end
if RUBY_VERSION >= '1.9'
undef_method :transliterate
def transliterate(string)
proxy = ActiveSupport::Multibyte.proxy_class.new(string)
proxy.normalize(:kd).gsub(/[^\x00-\x7F]+/, '')
end
# The iconv transliteration code doesn't function correctly
# on some platforms, but it's very fast where it does function.
elsif "foo" != (Inflector.transliterate("föö") rescue nil)
undef_method :transliterate
def transliterate(string)
string.mb_chars.normalize(:kd). # Decompose accented characters
gsub(/[^\x00-\x7F]+/, '') # Remove anything non-ASCII entirely (e.g. diacritics).
end
ActiveSupport::Multibyte::Chars.new(string).tidy_bytes.normalize(:d).unpack("U*").map do |char|
ASCII_APPROXIMATIONS[char] || (char if char < 128)
end.compact.flatten.pack("U*")
end
# Replaces special characters in a string so that it may be used as part of a 'pretty' URL.
@ -45,8 +60,6 @@ module ActiveSupport
# <%= link_to(@person.name, person_path(@person)) %>
# # => <a href="/person/1-donald-e-knuth">Donald E. Knuth</a>
def parameterize(string, sep = '-')
# remove malformed utf8 characters
string = string.toutf8 unless string.is_utf8?
# replace accented chars with their ascii equivalents
parameterized_string = transliterate(string)
# Turn unwanted chars into the separator
@ -59,6 +72,6 @@ module ActiveSupport
parameterized_string.gsub!(/^#{re_sep}|#{re_sep}$/i, '')
end
parameterized_string.downcase
end
end
end
end

View file

@ -188,7 +188,10 @@ module InflectorTestCases
StringToParameterizedAndNormalized = {
"Malmö" => "malmo",
"Garçons" => "garcons",
"Ops\331" => "ops"
"Ops\331" => "opsu",
"Ærøskøbing" => "aeroskobing",
"Aßlar" => "asslar",
"Japanese: 日本語" => "japanese"
}
UnderscoreToHuman = {

View file

@ -0,0 +1,50 @@
# encoding: utf-8
require 'abstract_unit'
require 'active_support/inflector/transliterate'
# Exercises ActiveSupport::Inflector.transliterate with plain ASCII input and
# with a table of accented and special Latin characters.
class TransliterateTest < Test::Unit::TestCase
  # Input character => expected ASCII transliteration, in ascending code point
  # order. The ligatures Ĳ (U+0132), ĳ (U+0133) and ŉ (U+0149) are written as
  # single code points on purpose: spelling them as the ASCII pairs "IJ"/"ij"
  # or as an apostrophe followed by "n" would not exercise their mappings.
  APPROXIMATIONS = {
    "À"=>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE",
    "Ç"=>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I",
    "Î"=>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O",
    "Õ"=>"O", "Ö"=>"O", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U", "Ü"=>"U",
    "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a", "ã"=>"a",
    "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e", "ê"=>"e",
    "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d", "ñ"=>"n",
    "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o", "ù"=>"u",
    "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y", "Ā"=>"A",
    "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C", "ć"=>"c",
    "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c", "Ď"=>"D",
    "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E", "ĕ"=>"e",
    "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e", "Ĝ"=>"G",
    "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G", "ģ"=>"g",
    "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i", "Ī"=>"I",
    "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I", "ı"=>"i",
    "Ĳ"=>"IJ", "ĳ"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k", "ĸ"=>"k",
    "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l", "Ŀ"=>"L",
    "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N", "ņ"=>"n",
    "Ň"=>"N", "ň"=>"n", "ŉ"=>"n", "Ŋ"=>"NG", "ŋ"=>"ng", "Ō"=>"O", "ō"=>"o",
    "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE", "œ"=>"oe", "Ŕ"=>"R",
    "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r", "Ś"=>"S", "ś"=>"s",
    "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S", "š"=>"s", "Ţ"=>"T",
    "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t", "Ũ"=>"U", "ũ"=>"u",
    "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U", "ů"=>"u", "Ű"=>"U",
    "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w", "Ŷ"=>"Y", "ŷ"=>"y",
    "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z", "Ž"=>"Z", "ž"=>"z"
  }.freeze

  # Every 7-bit ASCII character (code points 0..127) must come back unchanged.
  def test_transliterate_should_not_change_ascii_chars
    (0..127).each do |byte|
      char = [byte].pack("U")
      assert_equal char, ActiveSupport::Inflector.transliterate(char),
        "ASCII code point #{byte} should be left untouched"
    end
  end

  # Each entry of APPROXIMATIONS must transliterate to its expected ASCII form.
  def test_should_convert_accented_chars_to_approximate_ascii_chars
    APPROXIMATIONS.each do |given, expected|
      assert_equal expected, ActiveSupport::Inflector.transliterate(given),
        "unexpected transliteration of #{given.inspect}"
    end
  end
end