Use multibyte proxy class on 1.9, refactor Unicode.

Makes String#mb_chars on Ruby 1.9 return an instance of ActiveSupport::Multibyte::Chars to work around 1.9's lack of Unicode case folding. Refactors class methods from ActiveSupport::Multibyte::Chars into new Unicode module, adding other related functionality for consistency. [#4594 state:resolved] Signed-off-by: N Jeremy Kemper <jeremy@bitsweat.net>

Use multibyte proxy class on 1.9, refactor Unicode.
Makes String#mb_chars on Ruby 1.9 return an instance of ActiveSupport::Multibyte::Chars to work around 1.9's lack of Unicode case folding. Refactors class methods from ActiveSupport::Multibyte::Chars into new Unicode module, adding other related functionality for consistency. [#4594 state:resolved] Signed-off-by: N Jeremy Kemper <jeremy@bitsweat.net>
f3abc8ac · Norman Clarke · Jeremy Kemper · ad4be3d7 · f3abc8ac · f3abc8ac
13 changed files
--- a/activesupport/CHANGELOG
+++ b/activesupport/CHANGELOG
 *Rails 3.0.0 [beta 4/release candidate] (unreleased)*

+* Ruby 1.9: support UTF-8 case folding.  #4595 [Norman Clarke]
+
 * Renames Array#rand -> Array#random_element. [Santiago Pastorino, Rizwan Reza]

-* 1.9 compat: Renames last_(month|year) to prev_(month|year) in Date and Time. [fxn]
+* Ruby 1.9: Renames last_(month|year) to prev_(month|year) in Date and Time. [fxn]

 * Aliases Date#sunday to Date#end_of_week. [fxn]


--- a/activesupport/bin/generate_tables
+++ b/activesupport/bin/generate_tables
@@ -11,126 +11,129 @@ require 'tmpdir'

 module ActiveSupport
  module Multibyte
-    class UnicodeDatabase
-      def load; end
-    end
-    
-    class UnicodeDatabaseGenerator
-      BASE_URI = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd/"
-      SOURCES = {
-        :codepoints => BASE_URI + 'UnicodeData.txt',
-        :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
-        :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
-        :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
-      }
-
-      def initialize
-        @ucd = UnicodeDatabase.new
-
-        default = Codepoint.new
-        default.combining_class = 0
-        default.uppercase_mapping = 0
-        default.lowercase_mapping = 0
-        @ucd.codepoints = Hash.new(default)
-      end
+    module Unicode

-      def parse_codepoints(line)
-        codepoint = Codepoint.new
-        raise "Could not parse input." unless line =~ /^
-          ([0-9A-F]+);        # code
-          ([^;]+);            # name
-          ([A-Z]+);           # general category
-          ([0-9]+);           # canonical combining class
-          ([A-Z]+);           # bidi class
-          (<([A-Z]*)>)?       # decomposition type
-          ((\ ?[0-9A-F]+)*);  # decompomposition mapping
-          ([0-9]*);           # decimal digit
-          ([0-9]*);           # digit
-          ([^;]*);            # numeric
-          ([YN]*);            # bidi mirrored
-          ([^;]*);            # unicode 1.0 name
-          ([^;]*);            # iso comment
-          ([0-9A-F]*);        # simple uppercase mapping
-          ([0-9A-F]*);        # simple lowercase mapping
-          ([0-9A-F]*)$/ix     # simple titlecase mapping
-        codepoint.code              = $1.hex
-        #codepoint.name              = $2
-        #codepoint.category          = $3
-        codepoint.combining_class   = Integer($4)
-        #codepoint.bidi_class        = $5
-        codepoint.decomp_type       = $7
-        codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
-        #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
-        codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
-        codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
-        #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
-        @ucd.codepoints[codepoint.code] = codepoint
+      class UnicodeDatabase
+        def load; end
      end

-      def parse_grapheme_break_property(line)
-        if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
-          type = $2.downcase.intern
-          @ucd.boundary[type] ||= []
-          if $1.include? '..'
-            parts = $1.split '..'
-            @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
-          else
-            @ucd.boundary[type] << $1.hex
+      class DatabaseGenerator
+        BASE_URI = "http://www.unicode.org/Public/#{UNICODE_VERSION}/ucd/"
+        SOURCES = {
+          :codepoints => BASE_URI + 'UnicodeData.txt',
+          :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
+          :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
+          :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
+        }
+
+        def initialize
+          @ucd = Unicode::UnicodeDatabase.new
+
+          default = Codepoint.new
+          default.combining_class = 0
+          default.uppercase_mapping = 0
+          default.lowercase_mapping = 0
+          @ucd.codepoints = Hash.new(default)
+        end
+
+        def parse_codepoints(line)
+          codepoint = Codepoint.new
+          raise "Could not parse input." unless line =~ /^
+            ([0-9A-F]+);        # code
+            ([^;]+);            # name
+            ([A-Z]+);           # general category
+            ([0-9]+);           # canonical combining class
+            ([A-Z]+);           # bidi class
+            (<([A-Z]*)>)?       # decomposition type
+            ((\ ?[0-9A-F]+)*);  # decompomposition mapping
+            ([0-9]*);           # decimal digit
+            ([0-9]*);           # digit
+            ([^;]*);            # numeric
+            ([YN]*);            # bidi mirrored
+            ([^;]*);            # unicode 1.0 name
+            ([^;]*);            # iso comment
+            ([0-9A-F]*);        # simple uppercase mapping
+            ([0-9A-F]*);        # simple lowercase mapping
+            ([0-9A-F]*)$/ix     # simple titlecase mapping
+          codepoint.code              = $1.hex
+          #codepoint.name              = $2
+          #codepoint.category          = $3
+          codepoint.combining_class   = Integer($4)
+          #codepoint.bidi_class        = $5
+          codepoint.decomp_type       = $7
+          codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex }
+          #codepoint.bidi_mirrored     = ($13=='Y') ? true : false
+          codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex
+          codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex
+          #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex
+          @ucd.codepoints[codepoint.code] = codepoint
+        end
+
+        def parse_grapheme_break_property(line)
+          if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/
+            type = $2.downcase.intern
+            @ucd.boundary[type] ||= []
+            if $1.include? '..'
+              parts = $1.split '..'
+              @ucd.boundary[type] << (parts[0].hex..parts[1].hex)
+            else
+              @ucd.boundary[type] << $1.hex
+            end
          end
        end
-      end

-      def parse_composition_exclusion(line)
-        if line =~ /^([0-9A-F]+)/i
-          @ucd.composition_exclusion << $1.hex
+        def parse_composition_exclusion(line)
+          if line =~ /^([0-9A-F]+)/i
+            @ucd.composition_exclusion << $1.hex
+          end
        end
-      end

-      def parse_cp1252(line)
-        if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
-          @ucd.cp1252[$1.hex] = $2.hex
+        def parse_cp1252(line)
+          if line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i
+            @ucd.cp1252[$1.hex] = $2.hex
+          end
        end
-      end

-      def create_composition_map
-        @ucd.codepoints.each do |_, cp|
-          if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
-            @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
-            @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+        def create_composition_map
+          @ucd.codepoints.each do |_, cp|
+            if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd.codepoints[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
+              @ucd.composition_map[cp.decomp_mapping[0]] ||= {}
+              @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code
+            end
          end
        end
-      end

-      def normalize_boundary_map
-        @ucd.boundary.each do |k,v|
-          if [:lf, :cr].include? k
-            @ucd.boundary[k] = v[0]
+        def normalize_boundary_map
+          @ucd.boundary.each do |k,v|
+            if [:lf, :cr].include? k
+              @ucd.boundary[k] = v[0]
+            end
          end
        end
-      end

-      def parse
-        SOURCES.each do |type, url|
-          filename =  File.join(Dir.tmpdir, "#{url.split('/').last}")
-          unless File.exist?(filename)
-            $stderr.puts "Downloading #{url.split('/').last}"
-            File.open(filename, 'wb') do |target|
-              open(url) do |source|
-                source.each_line { |line| target.write line }
+        def parse
+          SOURCES.each do |type, url|
+            filename =  File.join(Dir.tmpdir, "#{url.split('/').last}")
+            unless File.exist?(filename)
+              $stderr.puts "Downloading #{url.split('/').last}"
+              File.open(filename, 'wb') do |target|
+                open(url) do |source|
+                  source.each_line { |line| target.write line }
+                end
              end
            end
+            File.open(filename) do |file|
+              file.each_line { |line| send "parse_#{type}".intern, line }
+            end
          end
-          File.open(filename) do |file|
-            file.each_line { |line| send "parse_#{type}".intern, line }
-          end
+          create_composition_map
+          normalize_boundary_map
        end
-        create_composition_map
-        normalize_boundary_map
-      end

-      def dump_to(filename)
-        File.open(filename, 'wb') do |f|
-          f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+        def dump_to(filename)
+          File.open(filename, 'wb') do |f|
+            f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary, @ucd.cp1252])
+          end
        end
      end
    end
@@ -138,8 +141,8 @@ module ActiveSupport
 end

 if __FILE__ == $0
-  filename = ActiveSupport::Multibyte::UnicodeDatabase.filename
-  generator = ActiveSupport::Multibyte::UnicodeDatabaseGenerator.new
+  filename = ActiveSupport::Multibyte::Unicode::UnicodeDatabase.filename
+  generator = ActiveSupport::Multibyte::Unicode::DatabaseGenerator.new
  generator.parse
  print "Writing to: #{filename}"
  generator.dump_to filename

--- a/activesupport/lib/active_support/core_ext/string/multibyte.rb
+++ b/activesupport/lib/active_support/core_ext/string/multibyte.rb
@@ -2,7 +2,7 @@
 require 'active_support/multibyte'

 class String
-  unless '1.9'.respond_to?(:force_encoding)
+  if '1.9'.respond_to?(:force_encoding)
    # == Multibyte proxy
    #
    # +mb_chars+ is a multibyte safe proxy for string methods.
@@ -37,23 +37,13 @@ class String
    # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars. For
    # information about how to change the default Multibyte behaviour see ActiveSupport::Multibyte.
    def mb_chars
-      if ActiveSupport::Multibyte.proxy_class.wants?(self)
+      if ActiveSupport::Multibyte.proxy_class.consumes?(self)
        ActiveSupport::Multibyte.proxy_class.new(self)
      else
        self
      end
    end
-    
-    # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
-    # them), returns false otherwise.
-    def is_utf8?
-      ActiveSupport::Multibyte::Chars.consumes?(self)
-    end
-  else
-    def mb_chars #:nodoc
-      self
-    end
-    
+
    def is_utf8? #:nodoc
      case encoding
      when Encoding::UTF_8
@@ -64,5 +54,19 @@ def is_utf8? #:nodoc
        false
      end
    end
+  else
+    def mb_chars
+      if ActiveSupport::Multibyte.proxy_class.wants?(self)
+        ActiveSupport::Multibyte.proxy_class.new(self)
+      else
+        self
+      end
+    end
+
+    # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have
+    # them), returns false otherwise.
+    def is_utf8?
+      ActiveSupport::Multibyte::Chars.consumes?(self)
+    end
  end
 end
--- a/activesupport/lib/active_support/inflector/transliterate.rb
+++ b/activesupport/lib/active_support/inflector/transliterate.rb
@@ -58,8 +58,9 @@ module Inflector
    #   transliterate("Jürgen")
    #   # => "Juergen"
    def transliterate(string, replacement = "?")
-      I18n.transliterate(Multibyte::Chars.normalize(
-        Multibyte::Chars.tidy_bytes(string), :c), :replacement => replacement)
+      I18n.transliterate(ActiveSupport::Multibyte::Unicode.normalize(
+        ActiveSupport::Multibyte::Unicode.tidy_bytes(string), :c),
+          :replacement => replacement)
    end

    # Replaces special characters in a string so that it may be used as part of a 'pretty' URL.

--- a/activesupport/lib/active_support/multibyte.rb
+++ b/activesupport/lib/active_support/multibyte.rb
 # encoding: utf-8
-
 require 'active_support/core_ext/module/attribute_accessors'

 module ActiveSupport #:nodoc:
  module Multibyte
    autoload :EncodingError, 'active_support/multibyte/exceptions'
    autoload :Chars, 'active_support/multibyte/chars'
-    autoload :UnicodeDatabase, 'active_support/multibyte/unicode_database'
-    autoload :Codepoint, 'active_support/multibyte/unicode_database'
-    autoload :UCD, 'active_support/multibyte/unicode_database'
+    autoload :Unicode, 'active_support/multibyte/unicode'
    
-    # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
-    # information about normalization.
-    NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
-
-    # The Unicode version that is supported by the implementation
-    UNICODE_VERSION = '5.1.0'
-
-    # The default normalization used for operations that require normalization. It can be set to any of the
-    # normalizations in NORMALIZATION_FORMS.
-    #
-    # Example:
-    #   ActiveSupport::Multibyte.default_normalization_form = :c
-    mattr_accessor :default_normalization_form
-    self.default_normalization_form = :kc
-
    # The proxy class returned when calling mb_chars. You can use this accessor to configure your own proxy
    # class so you can support other encodings. See the ActiveSupport::Multibyte::Chars implementation for
    # an example how to do this.

--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
--- a/activesupport/lib/active_support/multibyte/unicode.rb
+++ b/activesupport/lib/active_support/multibyte/unicode.rb
+module ActiveSupport
+  module Multibyte
+    module Unicode
+
+      extend self
+
+      # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
+      # information about normalization.
+      NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
+
+      # The Unicode version that is supported by the implementation
+      UNICODE_VERSION = '5.1.0'
+
+      # The default normalization used for operations that require normalization. It can be set to any of the
+      # normalizations in NORMALIZATION_FORMS.
+      #
+      # Example:
+      #   ActiveSupport::Multibyte::Unicode.default_normalization_form = :c
+      attr_accessor :default_normalization_form
+      @default_normalization_form = :kc
+
+      # Hangul character boundaries and properties
+      HANGUL_SBASE = 0xAC00
+      HANGUL_LBASE = 0x1100
+      HANGUL_VBASE = 0x1161
+      HANGUL_TBASE = 0x11A7
+      HANGUL_LCOUNT = 19
+      HANGUL_VCOUNT = 21
+      HANGUL_TCOUNT = 28
+      HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
+      HANGUL_SCOUNT = 11172
+      HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
+      HANGUL_JAMO_FIRST = 0x1100
+      HANGUL_JAMO_LAST = 0x11FF
+
+      # All the unicode whitespace
+      WHITESPACE = [
+        (0x0009..0x000D).to_a, # White_Space # Cc   [5] <control-0009>..<control-000D>
+        0x0020,                # White_Space # Zs       SPACE
+        0x0085,                # White_Space # Cc       <control-0085>
+        0x00A0,                # White_Space # Zs       NO-BREAK SPACE
+        0x1680,                # White_Space # Zs       OGHAM SPACE MARK
+        0x180E,                # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+        (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+        0x2028,                # White_Space # Zl       LINE SEPARATOR
+        0x2029,                # White_Space # Zp       PARAGRAPH SEPARATOR
+        0x202F,                # White_Space # Zs       NARROW NO-BREAK SPACE
+        0x205F,                # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+        0x3000,                # White_Space # Zs       IDEOGRAPHIC SPACE
+      ].flatten.freeze
+
+      # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
+      # between little and big endian. This is not an issue in utf-8, so it must be ignored.
+      LEADERS_AND_TRAILERS = WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
+
+      # Returns a regular expression pattern that matches the passed Unicode codepoints
+      def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
+        array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')
+      end
+      TRAILERS_PAT = /(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+\Z/u
+      LEADERS_PAT = /\A(#{codepoints_to_pattern(LEADERS_AND_TRAILERS)})+/u
+
+      # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't
+      # valid UTF-8.
+      #
+      # Example:
+      #   Unicode.u_unpack('Café') #=> [67, 97, 102, 233]
+      def u_unpack(string)
+        begin
+          string.unpack 'U*'
+        rescue ArgumentError
+          raise EncodingError, 'malformed UTF-8 character'
+        end
+      end
+
+      # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified
+      # character class and +false+ otherwise. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>,
+      # <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>.
+      #
+      # Primarily used by the grapheme cluster support.
+      def in_char_class?(codepoint, classes)
+        classes.detect { |c| database.boundary[c] === codepoint } ? true : false
+      end
+
+      # Unpack the string at grapheme boundaries. Returns a list of character lists.
+      #
+      # Example:
+      #   Unicode.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]]
+      #   Unicode.g_unpack('Café') #=> [[67], [97], [102], [233]]
+      def g_unpack(string)
+        codepoints = u_unpack(string)
+        unpacked = []
+        pos = 0
+        marker = 0
+        eoc = codepoints.length
+        while(pos < eoc)
+          pos += 1
+          previous = codepoints[pos-1]
+          current = codepoints[pos]
+          if (
+              # CR X LF
+              one = ( previous == database.boundary[:cr] and current == database.boundary[:lf] ) or
+              # L X (L|V|LV|LVT)
+              two = ( database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or
+              # (LV|V) X (V|T)
+              three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or
+              # (LVT|T) X (T)
+              four = ( in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current ) or
+              # X Extend
+              five = (database.boundary[:extend] === current)
+            )
+          else
+            unpacked << codepoints[marker..pos-1]
+            marker = pos
+          end
+        end
+        unpacked
+      end
+
+      # Reverse operation of g_unpack.
+      #
+      # Example:
+      #   Unicode.g_pack(Unicode.g_unpack('क्षि')) #=> 'क्षि'
+      def g_pack(unpacked)
+        (unpacked.flatten).pack('U*')
+      end
+
+      # Re-order codepoints so the string becomes canonical.
+      def reorder_characters(codepoints)
+        length = codepoints.length- 1
+        pos = 0
+        while pos < length do
+          cp1, cp2 = database.codepoints[codepoints[pos]], database.codepoints[codepoints[pos+1]]
+          if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
+            codepoints[pos..pos+1] = cp2.code, cp1.code
+            pos += (pos > 0 ? -1 : 1)
+          else
+            pos += 1
+          end
+        end
+        codepoints
+      end
+
+      # Decompose composed characters to the decomposed form.
+      def decompose_codepoints(type, codepoints)
+        codepoints.inject([]) do |decomposed, cp|
+          # if it's a hangul syllable starter character
+          if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
+            sindex = cp - HANGUL_SBASE
+            ncp = [] # new codepoints
+            ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
+            ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
+            tindex = sindex % HANGUL_TCOUNT
+            ncp << (HANGUL_TBASE + tindex) unless tindex == 0
+            decomposed.concat ncp
+          # if the codepoint is decomposable with the current decomposition type
+          elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatability)
+            decomposed.concat decompose_codepoints(type, ncp.dup)
+          else
+            decomposed << cp
+          end
+        end
+      end
+
+      # Compose decomposed characters to the composed form.
+      def compose_codepoints(codepoints)
+        pos = 0
+        eoa = codepoints.length - 1
+        starter_pos = 0
+        starter_char = codepoints[0]
+        previous_combining_class = -1
+        while pos < eoa
+          pos += 1
+          lindex = starter_char - HANGUL_LBASE
+          # -- Hangul
+          if 0 <= lindex and lindex < HANGUL_LCOUNT
+            vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
+            if 0 <= vindex and vindex < HANGUL_VCOUNT
+              tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
+              if 0 <= tindex and tindex < HANGUL_TCOUNT
+                j = starter_pos + 2
+                eoa -= 2
+              else
+                tindex = 0
+                j = starter_pos + 1
+                eoa -= 1
+              end
+              codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
+            end
+            starter_pos += 1
+            starter_char = codepoints[starter_pos]
+          # -- Other characters
+          else
+            current_char = codepoints[pos]
+            current = database.codepoints[current_char]
+            if current.combining_class > previous_combining_class
+              if ref = database.composition_map[starter_char]
+                composition = ref[current_char]
+              else
+                composition = nil
+              end
+              unless composition.nil?
+                codepoints[starter_pos] = composition
+                starter_char = composition
+                codepoints.delete_at pos
+                eoa -= 1
+                pos -= 1
+                previous_combining_class = -1
+              else
+                previous_combining_class = current.combining_class
+              end
+            else
+              previous_combining_class = current.combining_class
+            end
+            if current.combining_class == 0
+              starter_pos = pos
+              starter_char = codepoints[pos]
+            end
+          end
+        end
+        codepoints
+      end
+
+      # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
+      #
+      # Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.
+      def tidy_bytes(string, force = false)
+        if force
+          return string.unpack("C*").map do |b|
+            tidy_byte(b)
+          end.flatten.compact.pack("C*").unpack("U*").pack("U*")
+        end
+
+        bytes = string.unpack("C*")
+        conts_expected = 0
+        last_lead = 0
+
+        bytes.each_index do |i|
+
+          byte          = bytes[i]
+          is_ascii      = byte < 128
+          is_cont       = byte > 127 && byte < 192
+          is_lead       = byte > 191 && byte < 245
+          is_unused     = byte > 240
+          is_restricted = byte > 244
+
+          # Impossible or highly unlikely byte? Clean it.
+          if is_unused || is_restricted
+            bytes[i] = tidy_byte(byte)
+          elsif is_cont
+            # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
+            conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
+          else
+            if conts_expected > 0
+              # Expected continuation, but got ASCII or leading? Clean backwards up to
+              # the leading byte.
+              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
+              conts_expected = 0
+            end
+            if is_lead
+              # Final byte is leading? Clean it.
+              if i == bytes.length - 1
+                bytes[i] = tidy_byte(bytes.last)
+              else
+                # Valid leading byte? Expect continuations determined by position of
+                # first zero bit, with max of 3.
+                conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
+                last_lead = i
+              end
+            end
+          end
+        end
+        bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
+      end
+
+      # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
+      # passing strings to databases and validations.
+      #
+      # * <tt>string</tt> - The string to perform normalization on.
+      # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
+      #   <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
+      #   ActiveSupport::Multibyte::Unicode.default_normalization_form
+      def normalize(string, form=nil)
+        form ||= @default_normalization_form
+        # See http://www.unicode.org/reports/tr15, Table 1
+        codepoints = u_unpack(string)
+        case form
+          when :d
+            reorder_characters(decompose_codepoints(:canonical, codepoints))
+          when :c
+            compose_codepoints(reorder_characters(decompose_codepoints(:canonical, codepoints)))
+          when :kd
+            reorder_characters(decompose_codepoints(:compatability, codepoints))
+          when :kc
+            compose_codepoints(reorder_characters(decompose_codepoints(:compatability, codepoints)))
+          else
+            raise ArgumentError, "#{form} is not a valid normalization variant", caller
+        end.pack('U*')
+      end
+
+      def apply_mapping(string, mapping) #:nodoc:
+        u_unpack(string).map do |codepoint|
+          cp = database.codepoints[codepoint]
+          if cp and (ncp = cp.send(mapping)) and ncp > 0
+            ncp
+          else
+            codepoint
+          end
+        end.pack('U*')
+      end
+
+      # Holds data about a codepoint in the Unicode database
+      class Codepoint
+        attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
+      end
+
+      # Holds static data from the Unicode database
+      class UnicodeDatabase
+        ATTRIBUTES = :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
+
+        attr_writer(*ATTRIBUTES)
+
+        def initialize
+          @codepoints = Hash.new(Codepoint.new)
+          @composition_exclusion = []
+          @composition_map = {}
+          @boundary = {}
+          @cp1252 = {}
+        end
+
+        # Lazy load the Unicode database so it's only loaded when it's actually used
+        ATTRIBUTES.each do |attr_name|
+          class_eval(<<-EOS, __FILE__, __LINE__ + 1)
+            def #{attr_name}     # def codepoints
+              load               #   load
+              @#{attr_name}      #   @codepoints
+            end                  # end
+          EOS
+        end
+
+        # Loads the Unicode database from the data file into the internal objects of UnicodeDatabase.
+        def load
+          begin
+            @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read }
+          rescue Exception => e
+              raise IOError.new("Couldn't load the Unicode tables for UTF8Handler (#{e.message}), ActiveSupport::Multibyte is unusable")
+          end
+
+          # Redefine the === method so we can write shorter rules for grapheme cluster breaks
+          @boundary.each do |k,_|
+            @boundary[k].instance_eval do
+              def ===(other)
+                detect { |i| i === other } ? true : false
+              end
+            end if @boundary[k].kind_of?(Array)
+          end
+
+          # Define plain attr_reader methods on this instance so later reads skip the lazy load
+          class << self
+            attr_reader(*ATTRIBUTES)
+          end
+        end
+
+        # Returns the directory in which the data files are stored
+        def self.dirname
+          File.dirname(__FILE__) + '/../values/'
+        end
+
+        # Returns the filename for the data file for this version
+        def self.filename
+          File.expand_path File.join(dirname, "unicode_tables.dat")
+        end
+      end
+
+      private
+
+      def tidy_byte(byte)
+        if byte < 160
+          [database.cp1252[byte] || byte].pack("U").unpack("C*")
+        elsif byte < 192
+          [194, byte]
+        else
+          [195, byte - 64]
+        end
+      end
+
+      def database
+        @database ||= UnicodeDatabase.new
+      end
+
+    end
+  end
+end
--- a/activesupport/lib/active_support/multibyte/unicode_database.rb
+++ b/activesupport/lib/active_support/multibyte/unicode_database.rb
-# encoding: utf-8
-
-module ActiveSupport #:nodoc:
-  module Multibyte #:nodoc:
-    # Holds data about a codepoint in the Unicode database
-    class Codepoint
-      attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping
-    end
-
-    # Holds static data from the Unicode database
-    class UnicodeDatabase
-      ATTRIBUTES = :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
-
-      attr_writer(*ATTRIBUTES)
-
-      def initialize
-        @codepoints = Hash.new(Codepoint.new)
-        @composition_exclusion = []
-        @composition_map = {}
-        @boundary = {}
-        @cp1252 = {}
-      end
-
-      # Lazy load the Unicode database so it's only loaded when it's actually used
-      ATTRIBUTES.each do |attr_name|
-        class_eval(<<-EOS, __FILE__, __LINE__ + 1)
-          def #{attr_name}     # def codepoints
-            load               #   load
-            @#{attr_name}      #   @codepoints
-          end                  # end
-        EOS
-      end
-
-      # Loads the Unicode database and returns all the internal objects of UnicodeDatabase.
-      def load
-        begin
-          @codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = File.open(self.class.filename, 'rb') { |f| Marshal.load f.read }
-        rescue Exception => e
-            raise IOError.new("Couldn't load the Unicode tables for UTF8Handler (#{e.message}), ActiveSupport::Multibyte is unusable")
-        end
-
-        # Redefine the === method so we can write shorter rules for grapheme cluster breaks
-        @boundary.each do |k,_|
-          @boundary[k].instance_eval do
-            def ===(other)
-              detect { |i| i === other } ? true : false
-            end
-          end if @boundary[k].kind_of?(Array)
-        end
-
-        # define attr_reader methods for the instance variables
-        class << self
-          attr_reader(*ATTRIBUTES)
-        end
-      end
-
-      # Returns the directory in which the data files are stored
-      def self.dirname
-        File.dirname(__FILE__) + '/../values/'
-      end
-
-      # Returns the filename for the data file for this version
-      def self.filename
-        File.expand_path File.join(dirname, "unicode_tables.dat")
-      end
-    end
-
-    # UniCode Database
-    UCD = UnicodeDatabase.new
-  end
-end
\ No newline at end of file
--- a/activesupport/lib/active_support/values/unicode_tables.dat
+++ b/activesupport/lib/active_support/values/unicode_tables.dat
--- a/activesupport/test/core_ext/string_ext_test.rb
+++ b/activesupport/test/core_ext/string_ext_test.rb
@@ -245,11 +245,9 @@ def test_mb_chars_returns_an_instance_of_the_chars_proxy_when_kcode_utf8
        assert_kind_of ActiveSupport::Multibyte.proxy_class, UNICODE_STRING.mb_chars
      end
    end
-  end
-
-  if RUBY_VERSION >= '1.9'
-    def test_mb_chars_returns_string
-      assert_kind_of String, UNICODE_STRING.mb_chars
+  else
+    def test_mb_chars_returns_instance_of_proxy_class
+      assert_kind_of ActiveSupport::Multibyte.proxy_class, UNICODE_STRING.mb_chars
    end
  end
 end

--- a/activesupport/test/multibyte_chars_test.rb
+++ b/activesupport/test/multibyte_chars_test.rb
@@ -36,7 +36,7 @@ def test_forwarded_method_calls_should_return_new_chars_instance
  end

  def test_forwarded_bang_method_calls_should_return_the_original_chars_instance
-    assert_kind_of @proxy_class, @chars.__method_for_multibyte_testing! 
+    assert_kind_of @proxy_class, @chars.__method_for_multibyte_testing!
    assert_equal @chars.object_id, @chars.__method_for_multibyte_testing!.object_id
  end

@@ -65,33 +65,32 @@ def test_consumes_utf8_strings
  end

  def test_unpack_utf8_strings
-    assert_equal 4, @proxy_class.u_unpack(UNICODE_STRING).length
-    assert_equal 5, @proxy_class.u_unpack(ASCII_STRING).length
+    assert_equal 4, ActiveSupport::Multibyte::Unicode.u_unpack(UNICODE_STRING).length
+    assert_equal 5, ActiveSupport::Multibyte::Unicode.u_unpack(ASCII_STRING).length
  end

  def test_unpack_raises_encoding_error_on_broken_strings
    assert_raise(ActiveSupport::Multibyte::EncodingError) do
-      @proxy_class.u_unpack(BYTE_STRING)
+      ActiveSupport::Multibyte::Unicode.u_unpack(BYTE_STRING)
    end
  end

-  if RUBY_VERSION < '1.9'
-    def test_concatenation_should_return_a_proxy_class_instance
-      assert_equal ActiveSupport::Multibyte.proxy_class, ('a'.mb_chars + 'b').class
-      assert_equal ActiveSupport::Multibyte.proxy_class, ('a'.mb_chars << 'b').class
-    end
+  def test_concatenation_should_return_a_proxy_class_instance
+    assert_equal ActiveSupport::Multibyte.proxy_class, ('a'.mb_chars + 'b').class
+    assert_equal ActiveSupport::Multibyte.proxy_class, ('a'.mb_chars << 'b').class
+  end

-    def test_ascii_strings_are_treated_at_utf8_strings
-      assert_equal ActiveSupport::Multibyte.proxy_class, ASCII_STRING.mb_chars.class
-    end
+  def test_ascii_strings_are_treated_at_utf8_strings
+    assert_equal ActiveSupport::Multibyte.proxy_class, ASCII_STRING.mb_chars.class
+  end

-    def test_concatenate_should_return_proxy_instance
-      assert(('a'.mb_chars + 'b').kind_of?(@proxy_class))
-      assert(('a'.mb_chars + 'b'.mb_chars).kind_of?(@proxy_class))
-      assert(('a'.mb_chars << 'b').kind_of?(@proxy_class))
-      assert(('a'.mb_chars << 'b'.mb_chars).kind_of?(@proxy_class))
-    end
+  def test_concatenate_should_return_proxy_instance
+    assert(('a'.mb_chars + 'b').kind_of?(@proxy_class))
+    assert(('a'.mb_chars + 'b'.mb_chars).kind_of?(@proxy_class))
+    assert(('a'.mb_chars << 'b').kind_of?(@proxy_class))
+    assert(('a'.mb_chars << 'b'.mb_chars).kind_of?(@proxy_class))
  end
+
 end

 class MultibyteCharsUTF8BehaviourTest < Test::Unit::TestCase
@@ -111,35 +110,33 @@ def setup
    @byte_order_mark = [65279].pack('U')
  end

-  if RUBY_VERSION < '1.9'
-    def test_split_should_return_an_array_of_chars_instances
-      @chars.split(//).each do |character|
-        assert_kind_of ActiveSupport::Multibyte.proxy_class, character
-      end
+  def test_split_should_return_an_array_of_chars_instances
+    @chars.split(//).each do |character|
+      assert_kind_of ActiveSupport::Multibyte.proxy_class, character
    end
+  end

-    def test_indexed_insert_accepts_fixnums
-      @chars[2] = 32
-      assert_equal 'こに わ', @chars
-    end
+  def test_indexed_insert_accepts_fixnums
+    @chars[2] = 32
+    assert_equal 'こに わ', @chars
+  end

-    def test_overridden_bang_methods_return_self
-      [:rstrip!, :lstrip!, :strip!, :reverse!, :upcase!, :downcase!, :capitalize!].each do |method|
-        assert_equal @chars.object_id, @chars.send(method).object_id
-      end
+  def test_overridden_bang_methods_return_self
+    [:rstrip!, :lstrip!, :strip!, :reverse!, :upcase!, :downcase!, :capitalize!].each do |method|
+      assert_equal @chars.object_id, @chars.send(method).object_id
    end
+  end

-    def test_overridden_bang_methods_change_wrapped_string
-      [:rstrip!, :lstrip!, :strip!, :reverse!, :upcase!, :downcase!].each do |method|
-        original = ' Café '
-        proxy = chars(original.dup)
-        proxy.send(method)
-        assert_not_equal original, proxy.to_s
-      end
-      proxy = chars('òu')
-      proxy.capitalize!
-      assert_equal 'Òu', proxy.to_s
+  def test_overridden_bang_methods_change_wrapped_string
+    [:rstrip!, :lstrip!, :strip!, :reverse!, :upcase!, :downcase!].each do |method|
+      original = ' Café '
+      proxy = chars(original.dup)
+      proxy.send(method)
+      assert_not_equal original, proxy.to_s
    end
+    proxy = chars('òu')
+    proxy.capitalize!
+    assert_equal 'Òu', proxy.to_s
  end

  if RUBY_VERSION >= '1.9'
@@ -151,11 +148,7 @@ def test_unicode_string_should_have_utf8_encoding
  def test_identity
    assert_equal @chars, @chars
    assert @chars.eql?(@chars)
-    if RUBY_VERSION <= '1.9'
-      assert !@chars.eql?(UNICODE_STRING)
-    else
-      assert @chars.eql?(UNICODE_STRING)
-    end
+    assert !@chars.eql?(UNICODE_STRING)
  end

  def test_string_methods_are_chainable
@@ -207,7 +200,7 @@ def test_should_use_character_offsets_for_insert_offsets
    assert_equal 'こわにちわ', @chars.insert(1, 'わ')
    assert_equal 'こわわわにちわ', @chars.insert(2, 'わわ')
    assert_equal 'わこわわわにちわ', @chars.insert(0, 'わ')
-    assert_equal 'わこわわわにちわ', @chars.wrapped_string if RUBY_VERSION < '1.9'
+    assert_equal 'わこわわわにちわ', @chars.wrapped_string
  end

  def test_insert_should_be_destructive
@@ -330,7 +323,7 @@ def test_center_should_raise_argument_errors_on_bad_arguments
    assert_raise(ArgumentError) { @chars.center }
  end

-  def test_center_should_count_charactes_instead_of_bytes
+  def test_center_should_count_characters_instead_of_bytes
    assert_equal UNICODE_STRING, @chars.center(-3)
    assert_equal UNICODE_STRING, @chars.center(0)
    assert_equal UNICODE_STRING, @chars.center(4)

--- a/activesupport/test/multibyte_conformance.rb
+++ b/activesupport/test/multibyte_conformance.rb
@@ -28,7 +28,7 @@ def self.download(from, to)
 class MultibyteConformanceTest < Test::Unit::TestCase
  include MultibyteTestHelpers
  
-  UNIDATA_URL = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::UNICODE_VERSION}/ucd"
+  UNIDATA_URL = "http://www.unicode.org/Public/#{ActiveSupport::Multibyte::Unicode::UNICODE_VERSION}/ucd"
  UNIDATA_FILE = '/NormalizationTest.txt'
  CACHE_DIR = File.join(Dir.tmpdir, 'cache')
  

--- a/activesupport/test/multibyte_unicode_database_test.rb
+++ b/activesupport/test/multibyte_unicode_database_test.rb
 # encoding: utf-8
 require 'abstract_unit'

+
 class MultibyteUnicodeDatabaseTest < Test::Unit::TestCase
+
+  include ActiveSupport::Multibyte::Unicode
+
  def setup
-    @ucd = ActiveSupport::Multibyte::UnicodeDatabase.new
+    @ucd = UnicodeDatabase.new
  end

-  ActiveSupport::Multibyte::UnicodeDatabase::ATTRIBUTES.each do |attribute|
+  UnicodeDatabase::ATTRIBUTES.each do |attribute|
    define_method "test_lazy_loading_on_attribute_access_of_#{attribute}" do
      @ucd.expects(:load)
      @ucd.send(attribute)
    end
  end
-  
+
  def test_load
    @ucd.load
-    ActiveSupport::Multibyte::UnicodeDatabase::ATTRIBUTES.each do |attribute|
+    UnicodeDatabase::ATTRIBUTES.each do |attribute|
      assert @ucd.send(attribute).length > 1
    end
  end