mirror of
https://github.com/rails/rails.git
synced 2022-11-09 12:12:34 -05:00
Improve reliability of Inflector.transliterate. [#4374 state:resolved]
Signed-off-by: Jeremy Kemper <jeremy@bitsweat.net>
This commit is contained in:
parent
36f3634a6a
commit
dceef0828a
4 changed files with 94 additions and 26 deletions
|
@ -1,5 +1,7 @@
|
|||
*Rails 3.0.0 [beta 3] (pending)*
|
||||
|
||||
* Improve transliteration quality. #4374 [Norman Clarke]
|
||||
|
||||
* Speed up and add Ruby 1.9 support for ActiveSupport::Multibyte::Chars#tidy_bytes. #4350 [Norman Clarke]
|
||||
|
||||
|
||||
|
|
|
@ -1,32 +1,47 @@
|
|||
# encoding: utf-8
|
||||
require 'iconv'
|
||||
require 'kconv'
|
||||
require 'active_support/core_ext/string/multibyte'
|
||||
|
||||
module ActiveSupport
|
||||
module Inflector
|
||||
extend self
|
||||
|
||||
# Replaces accented characters with their ascii equivalents.
|
||||
|
||||
# UTF-8 byte => ASCII approximate UTF-8 byte(s)
|
||||
ASCII_APPROXIMATIONS = {
|
||||
198 => [65, 69], # Æ => AE
|
||||
208 => 68, # Ð => D
|
||||
216 => 79, # Ø => O
|
||||
222 => [84, 104], # Þ => Þ
|
||||
223 => [115, 115], # ß => ss
|
||||
230 => [97, 101], # æ => ae
|
||||
240 => 100, # ð => d
|
||||
248 => 111, # ø => o
|
||||
254 => [116, 104], # þ => th
|
||||
272 => 68, # Đ => D
|
||||
273 => 100, # đ => đ
|
||||
294 => 72, # Ħ => H
|
||||
295 => 104, # ħ => h
|
||||
305 => 105, # ı => i
|
||||
306 => [73, 74], # IJ =>IJ
|
||||
307 => [105, 106], # ij => ij
|
||||
312 => 107, # ĸ => k
|
||||
319 => 76, # Ŀ => L
|
||||
320 => 108, # ŀ => l
|
||||
321 => 76, # Ł => L
|
||||
322 => 108, # ł => l
|
||||
329 => 110, # ʼn => n
|
||||
330 => [78, 71], # Ŋ => NG
|
||||
331 => [110, 103], # ŋ => ng
|
||||
338 => [79, 69], # Œ => OE
|
||||
339 => [111, 101], # œ => oe
|
||||
358 => 84, # Ŧ => T
|
||||
359 => 116 # ŧ => t
|
||||
}
|
||||
|
||||
# Replaces accented characters with an ASCII approximation, or deletes it if none exsits.
|
||||
def transliterate(string)
|
||||
Iconv.iconv('ascii//ignore//translit', 'utf-8', string).to_s
|
||||
end
|
||||
|
||||
if RUBY_VERSION >= '1.9'
|
||||
undef_method :transliterate
|
||||
def transliterate(string)
|
||||
proxy = ActiveSupport::Multibyte.proxy_class.new(string)
|
||||
proxy.normalize(:kd).gsub(/[^\x00-\x7F]+/, '')
|
||||
end
|
||||
|
||||
# The iconv transliteration code doesn't function correctly
|
||||
# on some platforms, but it's very fast where it does function.
|
||||
elsif "foo" != (Inflector.transliterate("föö") rescue nil)
|
||||
undef_method :transliterate
|
||||
def transliterate(string)
|
||||
string.mb_chars.normalize(:kd). # Decompose accented characters
|
||||
gsub(/[^\x00-\x7F]+/, '') # Remove anything non-ASCII entirely (e.g. diacritics).
|
||||
end
|
||||
ActiveSupport::Multibyte::Chars.new(string).tidy_bytes.normalize(:d).unpack("U*").map do |char|
|
||||
ASCII_APPROXIMATIONS[char] || (char if char < 128)
|
||||
end.compact.flatten.pack("U*")
|
||||
end
|
||||
|
||||
# Replaces special characters in a string so that it may be used as part of a 'pretty' URL.
|
||||
|
@ -45,8 +60,6 @@ module ActiveSupport
|
|||
# <%= link_to(@person.name, person_path(@person)) %>
|
||||
# # => <a href="/person/1-donald-e-knuth">Donald E. Knuth</a>
|
||||
def parameterize(string, sep = '-')
|
||||
# remove malformed utf8 characters
|
||||
string = string.toutf8 unless string.is_utf8?
|
||||
# replace accented chars with their ascii equivalents
|
||||
parameterized_string = transliterate(string)
|
||||
# Turn unwanted chars into the separator
|
||||
|
@ -59,6 +72,6 @@ module ActiveSupport
|
|||
parameterized_string.gsub!(/^#{re_sep}|#{re_sep}$/i, '')
|
||||
end
|
||||
parameterized_string.downcase
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -188,7 +188,10 @@ module InflectorTestCases
|
|||
StringToParameterizedAndNormalized = {
|
||||
"Malmö" => "malmo",
|
||||
"Garçons" => "garcons",
|
||||
"Ops\331" => "ops"
|
||||
"Ops\331" => "opsu",
|
||||
"Ærøskøbing" => "aeroskobing",
|
||||
"Aßlar" => "asslar",
|
||||
"Japanese: 日本語" => "japanese"
|
||||
}
|
||||
|
||||
UnderscoreToHuman = {
|
||||
|
|
50
activesupport/test/transliterate_test.rb
Normal file
50
activesupport/test/transliterate_test.rb
Normal file
|
@ -0,0 +1,50 @@
|
|||
# encoding: utf-8
|
||||
require 'abstract_unit'
|
||||
require 'active_support/inflector/transliterate'
|
||||
|
||||
class TransliterateTest < Test::Unit::TestCase
|
||||
|
||||
APPROXIMATIONS = {
|
||||
"À"=>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE",
|
||||
"Ç"=>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I",
|
||||
"Î"=>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O",
|
||||
"Õ"=>"O", "Ö"=>"O", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U", "Ü"=>"U",
|
||||
"Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a", "ã"=>"a",
|
||||
"ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e", "ê"=>"e",
|
||||
"ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d", "ñ"=>"n",
|
||||
"ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o", "ù"=>"u",
|
||||
"ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y", "Ā"=>"A",
|
||||
"ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C", "ć"=>"c",
|
||||
"Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c", "Ď"=>"D",
|
||||
"ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E", "ĕ"=>"e",
|
||||
"Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e", "Ĝ"=>"G",
|
||||
"ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G", "ģ"=>"g",
|
||||
"Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i", "Ī"=>"I",
|
||||
"ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I", "ı"=>"i",
|
||||
"IJ"=>"IJ", "ij"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k", "ĸ"=>"k",
|
||||
"Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l", "Ŀ"=>"L",
|
||||
"ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N", "ņ"=>"n",
|
||||
"Ň"=>"N", "ň"=>"n", "ʼn"=>"n", "Ŋ"=>"NG", "ŋ"=>"ng", "Ō"=>"O", "ō"=>"o",
|
||||
"Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE", "œ"=>"oe", "Ŕ"=>"R",
|
||||
"ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r", "Ś"=>"S", "ś"=>"s",
|
||||
"Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S", "š"=>"s", "Ţ"=>"T",
|
||||
"ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t", "Ũ"=>"U", "ũ"=>"u",
|
||||
"Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U", "ů"=>"u", "Ű"=>"U",
|
||||
"ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w", "Ŷ"=>"Y", "ŷ"=>"y",
|
||||
"Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z", "Ž"=>"Z", "ž"=>"z"
|
||||
}
|
||||
|
||||
def test_transliterate_should_not_change_ascii_chars
|
||||
(0..127).each do |byte|
|
||||
char = [byte].pack("U")
|
||||
assert_equal char, ActiveSupport::Inflector.transliterate(char)
|
||||
end
|
||||
end
|
||||
|
||||
def test_should_convert_accented_chars_to_approximate_ascii_chars
|
||||
APPROXIMATIONS.each do |given, expected|
|
||||
assert_equal expected, ActiveSupport::Inflector.transliterate(given)
|
||||
end
|
||||
end
|
||||
|
||||
end
|
Loading…
Reference in a new issue