Delegate Inflector.transliterate to i18n. [#4508 state:resolved]

Ancillary changes: Moved Chars#normalize into a class method; removed unused UTF_PAT constant. Signed-off-by: José Valim <jose.valim@gmail.com>
2010-04-14 11:12:07 -03:00 · 2010-04-14 11:12:07 -03:00 · f0e754e713
parent 60504e62c8
commit f0e754e713
3 changed files with 101 additions and 89 deletions
--- a/activesupport/lib/active_support/inflector/transliterate.rb
+++ b/activesupport/lib/active_support/inflector/transliterate.rb
@ -3,45 +3,62 @@ require 'active_support/core_ext/string/multibyte'

 module ActiveSupport
  module Inflector
-    extend self

-    # UTF-8 byte => ASCII approximate UTF-8 byte(s)
-    ASCII_APPROXIMATIONS = {
-      198 => [65, 69],   # Æ => AE
-      208 => 68,         # Ð => D
-      216 => 79,         # Ø => O
-      222 => [84, 104],  # Þ => Þ
-      223 => [115, 115], # ß => ss
-      230 => [97, 101],  # æ => ae
-      240 => 100,        # ð => d
-      248 => 111,        # ø => o
-      254 => [116, 104], # þ => th
-      272 => 68,         # Đ => D
-      273 => 100,        # đ => đ
-      294 => 72,         # Ħ => H
-      295 => 104,        # ħ => h
-      305 => 105,        # ı => i
-      306 => [73, 74],   # Ĳ =>IJ
-      307 => [105, 106], # ĳ => ij
-      312 => 107,        # ĸ => k
-      319 => 76,         # Ŀ => L
-      320 => 108,        # ŀ => l
-      321 => 76,         # Ł => L
-      322 => 108,        # ł => l
-      329 => 110,        # ŉ => n
-      330 => [78, 71],   # Ŋ => NG
-      331 => [110, 103], # ŋ => ng
-      338 => [79, 69],   # Œ => OE
-      339 => [111, 101], # œ => oe
-      358 => 84,         # Ŧ => T
-      359 => 116         # ŧ => t
-    }
-
-    # Replaces accented characters with an ASCII approximation, or deletes it if none exsits.
-    def transliterate(string)
-      ActiveSupport::Multibyte::Chars.new(string).tidy_bytes.normalize(:d).unpack("U*").map do |char|
-        ASCII_APPROXIMATIONS[char] || (char if char < 128)
-      end.compact.flatten.pack("U*")
+    # Replaces non-ASCII characters with an ASCII approximation, or if none
+    # exists, a replacement character which defaults to "?".
+    #
+    #    transliterate("Ærøskøbing")
+    #    # => "AEroskobing"
+    #
+    # Default approximations are provided for Western/Latin characters,
+    # e.g, "ø", "ñ", "é", "ß", etc.
+    #
+    # This method is I18n aware, so you can set up custom approximations for a
+    # locale. This can be useful, for example, to transliterate German's "ü"
+    # and "ö" to "ue" and "oe", or to add support for transliterating Russian
+    # to ASCII.
+    #
+    # In order to make your custom transliterations available, you must set
+    # them as the <tt>i18n.transliterate.rule</tt> i18n key:
+    #
+    #   # Store the transliterations in locales/de.yml
+    #   i18n:
+    #     transliterate:
+    #       ü: "ue"
+    #       ö: "oe"
+    #
+    #   # Or set them using Ruby
+    #   I18n.backend.store_translations(:de, :i18n => {
+    #     :transliterate => {
+    #       :rule => {
+    #         "ü" => "ue",
+    #         "ö" => "oe"
+    #       }
+    #     }
+    #   })
+    #
+    # The value for <tt>i18n.transliterate.rule</tt> can be a simple Hash that maps
+    # characters to ASCII approximations as shown above, or, for more complex
+    # requirements, a Proc:
+    #
+    #   I18n.backend.store_translations(:de, :i18n => {
+    #     :transliterate => {
+    #       :rule => lambda {|string| MyTransliterator.transliterate(string)}
+    #     }
+    #   })
+    #
+    # Now you can have different transliterations for each locale:
+    #
+    #   I18n.locale = :en
+    #   transliterate("Jürgen")
+    #   # => "Jurgen"
+    #
+    #   I18n.locale = :de
+    #   transliterate("Jürgen")
+    #   # => "Juergen"
+    def transliterate(string, replacement = "?")
+      I18n.transliterate(Multibyte::Chars.normalize(
+        Multibyte::Chars.tidy_bytes(string), :c), :replacement => replacement)
    end

    # Replaces special characters in a string so that it may be used as part of a 'pretty' URL.
@ -73,5 +90,6 @@ module ActiveSupport
      end
      parameterized_string.downcase
    end
+
  end
 end
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@ -75,8 +75,6 @@ module ActiveSupport #:nodoc:
      UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/u
      UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/u

-      UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
-
      attr_reader :wrapped_string
      alias to_s wrapped_string
      alias to_str wrapped_string
@ -409,25 +407,11 @@ module ActiveSupport #:nodoc:
      # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
      # passing strings to databases and validations.
      #
-      # * <tt>str</tt> - The string to perform normalization on.
      # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
      #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
      #   ActiveSupport::Multibyte.default_normalization_form
      def normalize(form=ActiveSupport::Multibyte.default_normalization_form)
-        # See http://www.unicode.org/reports/tr15, Table 1
-        codepoints = self.class.u_unpack(@wrapped_string)
-        chars(case form
-          when :d
-            self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints))
-          when :c
-            self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:canonical, codepoints)))
-          when :kd
-            self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints))
-          when :kc
-            self.class.compose_codepoints(self.class.reorder_characters(self.class.decompose_codepoints(:compatability, codepoints)))
-          else
-            raise ArgumentError, "#{form} is not a valid normalization variant", caller
-        end.pack('U*'))
+        chars(self.class.normalize(@wrapped_string, form))
      end

      # Performs canonical decomposition on all the characters.
@ -659,7 +643,7 @@ module ActiveSupport #:nodoc:

        # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
        #
-        # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP-1252 or ISO-8859-1.
+        # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
        def tidy_bytes(string, force = false)
          if force
            return string.unpack("C*").map do |b|
@ -708,6 +692,31 @@ module ActiveSupport #:nodoc:
          end
          bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
        end
+
+        # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
+        # passing strings to databases and validations.
+        #
+        # * <tt>string</tt> - The string to perform normalization on.
+        # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
+        #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
+        #   ActiveSupport::Multibyte.default_normalization_form
+        def normalize(string, form=ActiveSupport::Multibyte.default_normalization_form)
+          # See http://www.unicode.org/reports/tr15, Table 1
+          codepoints = u_unpack(string)
+          case form
+            when :d
+              reorder_characters(decompose_codepoints(:canonical, codepoints))
+            when :c
+              compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints)))
+            when :kd
+              reorder_characters(decompose_codepoints(:compatability, codepoints))
+            when :kc
+              compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints)))
+            else
+              raise ArgumentError, "#{form} is not a valid normalization variant", caller
+          end.pack('U*')
+        end
+
      end

      protected
--- a/activesupport/test/transliterate_test.rb
+++ b/activesupport/test/transliterate_test.rb
@ -4,36 +4,6 @@ require 'active_support/inflector/transliterate'

 class TransliterateTest < Test::Unit::TestCase

-  APPROXIMATIONS = {
-    "À"=>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE",
-    "Ç"=>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I",
-    "Î"=>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O",
-    "Õ"=>"O", "Ö"=>"O", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U", "Ü"=>"U",
-    "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a", "ã"=>"a",
-    "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e", "ê"=>"e",
-    "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d", "ñ"=>"n",
-    "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o", "ù"=>"u",
-    "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y", "Ā"=>"A",
-    "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C", "ć"=>"c",
-    "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c", "Ď"=>"D",
-    "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E", "ĕ"=>"e",
-    "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e", "Ĝ"=>"G",
-    "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G", "ģ"=>"g",
-    "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i", "Ī"=>"I",
-    "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I", "ı"=>"i",
-    "Ĳ"=>"IJ", "ĳ"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k", "ĸ"=>"k",
-    "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l", "Ŀ"=>"L",
-    "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N", "ņ"=>"n",
-    "Ň"=>"N", "ň"=>"n", "ŉ"=>"n", "Ŋ"=>"NG", "ŋ"=>"ng", "Ō"=>"O", "ō"=>"o",
-    "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE", "œ"=>"oe", "Ŕ"=>"R",
-    "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r", "Ś"=>"S", "ś"=>"s",
-    "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S", "š"=>"s", "Ţ"=>"T",
-    "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t", "Ũ"=>"U", "ũ"=>"u",
-    "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U", "ů"=>"u", "Ű"=>"U",
-    "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w", "Ŷ"=>"Y", "ŷ"=>"y",
-    "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z", "Ž"=>"Z", "ž"=>"z"
-  }
-
  def test_transliterate_should_not_change_ascii_chars
    (0..127).each do |byte|
      char = [byte].pack("U")
@ -41,10 +11,25 @@ class TransliterateTest < Test::Unit::TestCase
    end
  end

-  def test_should_convert_accented_chars_to_approximate_ascii_chars
-    APPROXIMATIONS.each do |given, expected|
-      assert_equal expected, ActiveSupport::Inflector.transliterate(given)
+  def test_transliterate_should_approximate_ascii
+    # create string with range of Unicode"s western characters with
+    # diacritics, excluding the division and multiplication signs which for
+    # some reason or other are floating in the middle of all the letters.
+    string = (0xC0..0x17E).to_a.reject {|c| [0xD7, 0xF7].include? c}.pack("U*")
+    string.each_char do |char|
+      assert_match %r{^[a-zA-Z']*$}, ActiveSupport::Inflector.transliterate(string)
    end
  end

+  def test_transliterate_should_work_with_custom_i18n_rules_and_uncomposed_utf8
+    char = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS
+    I18n.backend.store_translations(:de, :i18n => {:transliterate => {:rule => {"ü" => "ue"}}})
+    I18n.locale = :de
+    assert_equal "ue", ActiveSupport::Inflector.transliterate(char)
+  end
+
+  def test_transliterate_should_allow_a_custom_replacement_char
+    assert_equal "a*b", ActiveSupport::Inflector.transliterate("a索b", "*")
+  end
+
 end