Improve reliability of Inflector.transliterate. [#4374 state:resolved]

Signed-off-by: N Jeremy Kemper <jeremy@bitsweat.net>

Improve reliability of Inflector.transliterate. [#4374 state:resolved]
Signed-off-by: N Jeremy Kemper <jeremy@bitsweat.net>
dceef082 · Norman Clarke · Jeremy Kemper · 36f3634a · dceef082 · dceef082
4 changed files
--- a/activesupport/CHANGELOG
+++ b/activesupport/CHANGELOG
 *Rails 3.0.0 [beta 3] (pending)*

+* Improve transliteration quality.  #4374 [Norman Clarke]
+
 * Speed up and add Ruby 1.9 support for ActiveSupport::Multibyte::Chars#tidy_bytes.  #4350 [Norman Clarke]



--- a/activesupport/lib/active_support/inflector/transliterate.rb
+++ b/activesupport/lib/active_support/inflector/transliterate.rb
 # encoding: utf-8
-require 'iconv'
-require 'kconv'
 require 'active_support/core_ext/string/multibyte'

 module ActiveSupport
  module Inflector
    extend self
-    
-    # Replaces accented characters with their ascii equivalents.
-    def transliterate(string)
-      Iconv.iconv('ascii//ignore//translit', 'utf-8', string).to_s
-    end

-    if RUBY_VERSION >= '1.9'
-      undef_method :transliterate
-      def transliterate(string)
-        proxy = ActiveSupport::Multibyte.proxy_class.new(string)
-        proxy.normalize(:kd).gsub(/[^\x00-\x7F]+/, '')
-      end
+    # Unicode code point => ASCII approximation (a code point, or an array of code points)
+    ASCII_APPROXIMATIONS = {
+      198 => [65, 69],   # Æ => AE
+      208 => 68,         # Ð => D
+      216 => 79,         # Ø => O
+      222 => [84, 104],  # Þ => Th
+      223 => [115, 115], # ß => ss
+      230 => [97, 101],  # æ => ae
+      240 => 100,        # ð => d
+      248 => 111,        # ø => o
+      254 => [116, 104], # þ => th
+      272 => 68,         # Đ => D
+      273 => 100,        # đ => d
+      294 => 72,         # Ħ => H
+      295 => 104,        # ħ => h
+      305 => 105,        # ı => i
+      306 => [73, 74],   # Ĳ => IJ
+      307 => [105, 106], # ĳ => ij
+      312 => 107,        # ĸ => k
+      319 => 76,         # Ŀ => L
+      320 => 108,        # ŀ => l
+      321 => 76,         # Ł => L
+      322 => 108,        # ł => l
+      329 => 110,        # ŉ => n
+      330 => [78, 71],   # Ŋ => NG
+      331 => [110, 103], # ŋ => ng
+      338 => [79, 69],   # Œ => OE
+      339 => [111, 101], # œ => oe
+      358 => 84,         # Ŧ => T
+      359 => 116         # ŧ => t
+    }

-    # The iconv transliteration code doesn't function correctly
-    # on some platforms, but it's very fast where it does function.
-    elsif "foo" != (Inflector.transliterate("föö") rescue nil)
-      undef_method :transliterate
-      def transliterate(string)
-        string.mb_chars.normalize(:kd). # Decompose accented characters
-          gsub(/[^\x00-\x7F]+/, '')     # Remove anything non-ASCII entirely (e.g. diacritics).
-      end
+    # Replaces accented characters with an ASCII approximation, or deletes them if none exists.
+    def transliterate(string)
+      ActiveSupport::Multibyte::Chars.new(string).tidy_bytes.normalize(:d).unpack("U*").map do |char|
+        ASCII_APPROXIMATIONS[char] || (char if char < 128)
+      end.compact.flatten.pack("U*")
    end

    # Replaces special characters in a string so that it may be used as part of a 'pretty' URL.
@@ -45,8 +60,6 @@ def transliterate(string)
    #   <%= link_to(@person.name, person_path(@person)) %>
    #   # => <a href="/person/1-donald-e-knuth">Donald E. Knuth</a>
    def parameterize(string, sep = '-')
-      # remove malformed utf8 characters
-      string = string.toutf8 unless string.is_utf8?
      # replace accented chars with their ascii equivalents
      parameterized_string = transliterate(string)
      # Turn unwanted chars into the separator
@@ -59,6 +72,6 @@ def parameterize(string, sep = '-')
        parameterized_string.gsub!(/^#{re_sep}|#{re_sep}$/i, '')
      end
      parameterized_string.downcase
-    end    
+    end
  end
 end
--- a/activesupport/test/inflector_test_cases.rb
+++ b/activesupport/test/inflector_test_cases.rb
@@ -188,7 +188,10 @@ module InflectorTestCases
  StringToParameterizedAndNormalized = {
    "Malmö"                               => "malmo",
    "Garçons"                             => "garcons",
-    "Ops\331"                            => "ops"
+    "Ops\331"                             => "opsu",
+    "Ærøskøbing"                          => "aeroskobing",
+    "Aßlar"                               => "asslar",
+    "Japanese: 日本語"                    => "japanese"
  }

  UnderscoreToHuman = {

--- a/activesupport/test/transliterate_test.rb
+++ b/activesupport/test/transliterate_test.rb
+# encoding: utf-8
+require 'abstract_unit'
+require 'active_support/inflector/transliterate'
+
+class TransliterateTest < Test::Unit::TestCase
+
+  APPROXIMATIONS = {
+    "À"=>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE",
+    "Ç"=>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I",
+    "Î"=>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O",
+    "Õ"=>"O", "Ö"=>"O", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U", "Ü"=>"U",
+    "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a", "ã"=>"a",
+    "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e", "ê"=>"e",
+    "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d", "ñ"=>"n",
+    "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o", "ù"=>"u",
+    "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y", "Ā"=>"A",
+    "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C", "ć"=>"c",
+    "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c", "Ď"=>"D",
+    "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E", "ĕ"=>"e",
+    "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e", "Ĝ"=>"G",
+    "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G", "ģ"=>"g",
+    "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i", "Ī"=>"I",
+    "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I", "ı"=>"i",
+    "Ĳ"=>"IJ", "ĳ"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k", "ĸ"=>"k",
+    "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l", "Ŀ"=>"L",
+    "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N", "ņ"=>"n",
+    "Ň"=>"N", "ň"=>"n", "ŉ"=>"n", "Ŋ"=>"NG", "ŋ"=>"ng", "Ō"=>"O", "ō"=>"o",
+    "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE", "œ"=>"oe", "Ŕ"=>"R",
+    "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r", "Ś"=>"S", "ś"=>"s",
+    "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S", "š"=>"s", "Ţ"=>"T",
+    "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t", "Ũ"=>"U", "ũ"=>"u",
+    "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U", "ů"=>"u", "Ű"=>"U",
+    "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w", "Ŷ"=>"Y", "ŷ"=>"y",
+    "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z", "Ž"=>"Z", "ž"=>"z"
+  }
+
+  def test_transliterate_should_not_change_ascii_chars
+    (0..127).each do |byte|
+      char = [byte].pack("U")
+      assert_equal char, ActiveSupport::Inflector.transliterate(char)
+    end
+  end
+
+  def test_should_convert_accented_chars_to_approximate_ascii_chars
+    APPROXIMATIONS.each do |given, expected|
+      assert_equal expected, ActiveSupport::Inflector.transliterate(given)
+    end
+  end
+
+end