6755060: Collator.compare() does not compare correctly for the Thai locale

Reviewed-by: naoto

6755060: Collator.compare() does not compare correctly for the Thai locale
Reviewed-by: naoto
eeacd1a2 · yhuang · e87e49c7 · eeacd1a2 · eeacd1a2
2 changed file
--- a/src/share/classes/sun/text/resources/CollationData_th.java
+++ b/src/share/classes/sun/text/resources/CollationData_th.java
 /*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -103,18 +103,13 @@ public class CollationData_th extends ListResourceBundle {
                //
                // Normal vowels
                //
+                + "< \u0E4D "                   //  NIKHAHIT
                + "< \u0E30 "                   //  SARA A
                + "< \u0E31 "                   //  MAI HAN-AKAT
                + "< \u0E32 "                   //  SARA AA
-                // Normalizer will decompose this character to \u0e4d\u0e32.  This is
+                // Normalizer will decompose this character to \u0e4d\u0e32.
-                // a Bad Thing, because we want the separate characters to sort
+                + "< \u0E33 = \u0E4D\u0E32 "                   //  SARA AM
-                // differently than this individual one.  Since there's no public way to
-                // set the decomposition to be used when creating a collator, there's
-                // no way around this right now.
-                // It's best to go ahead and leave the character in, because it occurs
-                // this way a lot more often than it occurs as separate characters.
-                + "< \u0E33 "                   //  SARA AM
                + "< \u0E34 "                   //  SARA I
@@ -133,62 +128,58 @@ public class CollationData_th extends ListResourceBundle {
                + "< \u0E43 "                   //  SARA AI MAIMUAN
                + "< \u0E44 "                   //  SARA AI MAIMALAI
-                //
-                // Digits
-                //
-                + "< \u0E50 "                   //  DIGIT ZERO
-                + "< \u0E51 "                   //  DIGIT ONE
-                + "< \u0E52 "                   //  DIGIT TWO
-                + "< \u0E53 "                   //  DIGIT THREE
-                + "< \u0E54 "                   //  DIGIT FOUR
-                + "< \u0E55 "                   //  DIGIT FIVE
-                + "< \u0E56 "                   //  DIGIT SIX
-                + "< \u0E57 "                   //  DIGIT SEVEN
-                + "< \u0E58 "                   //  DIGIT EIGHT
-                + "< \u0E59 "                   //  DIGIT NINE
-                // Sorta tonal marks, but maybe not really
-                + "< \u0E4D "                   //  NIKHAHIT
-                //
+                //according to CLDR, it's after 0e44
-                // Thai symbols are supposed to sort "after white space".
+                + "< \u0E3A "                   //  PHINTHU
-                // I'm treating this as making them sort just after the normal Latin-1
-                // symbols, which are in turn after the white space.
-                //
-                + "&'\u007d'"  //  right-brace
+                // This rare symbol comes after all characters.
-                + "< \u0E2F "                   //  PAIYANNOI      (ellipsis, abbreviation)
-                + "< \u0E46 "                   //  MAIYAMOK
-                + "< \u0E4F "                   //  FONGMAN
-                + "< \u0E5A "                   //  ANGKHANKHU
-                + "< \u0E5B "                   //  KHOMUT
-                + "< \u0E3F "                   //  CURRENCY SYMBOL BAHT
-                // These symbols are supposed to be "after all characters"
-                + "< \u0E4E "                   //  YAMAKKAN
-                // This rare symbol also comes after all characters.  But when it is
-                // used in combination with RU and LU, the combination is treated as
-                // a separate letter, ala "CH" sorting after "C" in traditional Spanish.
                + "< \u0E45 "                   //  LAKKHANGYAO
-                + "& \u0E24 < \u0E24\u0E45 "
+                // U+0E45 has to be spelled as a complete Unicode escape.  Without the 'u' the
+                // compiler reads an octal NUL escape followed by the literal text "E45", so the
+                // rule would tailor that four-character sequence instead of LAKKHANGYAO.
+                // NOTE(review): this reset re-positions U+0E45 and presumably supersedes the
+                // primary "< U+0E45" rule just above -- confirm against RuleBasedCollator.
+                + "& \u0E32 , \u0E45 "          // According to CLDR, 0E45 is after 0E32 in tertiary level
-                + "& \u0E26 < \u0E26\u0E45 "
-                // Tonal marks are primary ignorables but are treated as secondary
-                // differences
+                // Below are Thai punctuation marks and tonal (accent) marks. According to CLDR 1.9 and
+                // ISO/IEC 14651, Annex C, C.2.1 Thai ordering principles, 0E2F to 0E5B are punctuation marks that need to be ignored
+                // in the first three levels.  0E4E to 0E4B are tonal marks to be compared in secondary level.
+                // In the real implementation, set punctuation marks in tertiary as there is no fourth level in Java.
+                // Set all these special marks after \u0301, the acute accent.
                + "& \u0301 "   // acute accent
+                //punctuation marks
+                + ", \u0E2F "                   //  PAIYANNOI      (ellipsis, abbreviation)
+                + ", \u0E46 "                   //  MAIYAMOK
+                + ", \u0E4F "                   //  FONGMAN
+                + ", \u0E5A "                   //  ANGKHANKHU
+                + ", \u0E5B "                   //  KHOMUT
+                //tonal marks
+                + "; \u0E4E "                   //  YAMAKKAN
+                + "; \u0E4C "                   //  THANTHAKHAT
                + "; \u0E47 "                   //  MAITAIKHU
                + "; \u0E48 "                   //  MAI EK
                + "; \u0E49 "                   //  MAI THO
                + "; \u0E4A "                   //  MAI TRI
                + "; \u0E4B "                   //  MAI CHATTAWA
-                + "; \u0E4C "                   //  THANTHAKHAT
-                // These are supposed to be ignored, so I'm treating them as controls
+                //
-                + "& \u0001 "
+                // Digits are equal to their corresponding Arabic digits in the first level
-                + "= \u0E3A "                   //  PHINTHU
+                //
-                + "= '.' "                      //  period
+                + "& 0 = \u0E50 "                   //  DIGIT ZERO
-                }
+                + "& 1 = \u0E51 "                   //  DIGIT ONE
+                + "& 2 = \u0E52 "                   //  DIGIT TWO
+                + "& 3 = \u0E53 "                   //  DIGIT THREE
+                + "& 4 = \u0E54 "                   //  DIGIT FOUR
+                + "& 5 = \u0E55 "                   //  DIGIT FIVE
+                + "& 6 = \u0E56 "                   //  DIGIT SIX
+                + "& 7 = \u0E57 "                   //  DIGIT SEVEN
+                + "& 8 = \u0E58 "                   //  DIGIT EIGHT
+                + "& 9 = \u0E59 "                   //  DIGIT NINE
+            }
        };
    }
 }
--- a/test/sun/text/resources/Collator/Bug6755060.java
+++ b/test/sun/text/resources/Collator/Bug6755060.java
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/*
+ * @test
+ * @bug 6755060
+ * @summary updating collation tables for thai to make it consistent with CLDR 1.9
+ */
+import java.text.*;
+import java.util.*;
+/**
+ * Regression test for bug 6755060: sorts a list of Thai strings with the
+ * default collator of the th_TH locale and verifies that the result matches
+ * the expected order.
+ */
+public class Bug6755060 {
+  /**
+   * Runs the check.  The default locale is switched to th_TH while the test
+   * runs and is restored in a finally block.
+   *
+   * @param args unused
+   * @throws RuntimeException if the sorted order differs from the expected order
+   */
+  public static void main (String[] args) {
+    Locale reservedLocale = Locale.getDefault();
+    try{
+        Locale loc = new Locale ("th", "TH");   // Thai
+        Locale.setDefault (loc);
+        Collator col = Collator.getInstance ();
+        /*
+        * "sortedData" is the list supplied by the submitter of the CR.  It is already in the
+        * order required by the Thai collation in CLDR 1.9, so it serves as the expected result.
+        * (The expected order given in the CR itself followed CLDR 1.4 and is not used here.)
+        * The input to sort is a copy of that same list: with the fix in place sorting leaves it
+        * unchanged, while the old rules (or a regression) reorder it and the comparison below fails.
+        */
+        String[] sortedData = {
+            "\u0e01", "\u0e01\u0e2f", "\u0e01\u0e46", "\u0e01\u0e4f", "\u0e01\u0e5a",
+            "\u0e01\u0e5b", "\u0e01\u0e4e", "\u0e01\u0e4c", "\u0e01\u0e48", "\u0e01\u0e01",
+            "\u0e01\u0e4b\u0e01", "\u0e01\u0e4d", "\u0e01\u0e30", "\u0e01\u0e31\u0e01", "\u0e01\u0e32",
+            "\u0e01\u0e33", "\u0e01\u0e34", "\u0e01\u0e35", "\u0e01\u0e36", "\u0e01\u0e37",
+            "\u0e01\u0e38", "\u0e01\u0e39", "\u0e40\u0e01", "\u0e40\u0e01\u0e48", "\u0e40\u0e01\u0e49",
+            "\u0e40\u0e01\u0e4b", "\u0e41\u0e01", "\u0e42\u0e01", "\u0e43\u0e01", "\u0e44\u0e01",
+            "\u0e01\u0e3a", "\u0e24\u0e32", "\u0e24\u0e45", "\u0e40\u0e25", "\u0e44\u0e26"
+        };
+        // Derive the input from the expected list so the two can never drift apart.
+        String[] data = sortedData.clone();
+        Arrays.sort (data, col);
+        System.out.println ("Using " + loc.getDisplayName());
+        int errors = 0;
+        for (int i = 0;  i < data.length;  i++) {
+            System.out.println(data[i] + "  :  " + sortedData[i]);
+            if (!sortedData[i].equals(data[i])) {
+                errors++;
+            }
+        }
+        if (errors > 0) {
+            // Build comma-separated dumps of both orders for the failure message.
+            StringBuilder expected = new StringBuilder(sortedData[0]);
+            StringBuilder actual = new StringBuilder(data[0]);
+            for (int i = 1; i < data.length; i++) {
+                expected.append(",").append(sortedData[i]);
+                actual.append(",").append(data[i]);
+            }
+            String errmsg = "Error is found in collation testing in Thai\n" + "expected order is: " + expected + "\n" + "actual order is: " + actual + "\n";
+            throw new RuntimeException(errmsg);
+        }
+    }finally{
+        // restore the reserved locale
+        Locale.setDefault(reservedLocale);
+    }
+  }//end main
+}//end class Bug6755060