7014645: Support perl style Unicode hex notation \x{...}

Summary: Added the construct \x{...} for Unicode hex notation support. Reviewed-by: alanb, okutsu

7014645: Support perl style Unicode hex notation \x{...}
Summary: Added the construct \x{...} for Unicode hex notation support. Reviewed-by: alanb, okutsu
729dd870 · sherman · 85c6e9dd · 729dd870 · 729dd870
隐藏空白更改
内联并排

Showing 2 changed files with 75 additions and 10 deletions

src/share/classes/java/util/regex/Pattern.java src/share/classes/java/util/regex/Pattern.java +22 -0

test/java/util/regex/RegExTest.java test/java/util/regex/RegExTest.java +53 -10

未找到文件。
--- a/src/share/classes/java/util/regex/Pattern.java
+++ b/src/share/classes/java/util/regex/Pattern.java
@@ -101,6 +101,11 @@ import java.util.Arrays;
 *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hh</i></td></tr>
 * <tr><td valign="top" headers="construct characters"><tt>&#92;u</tt><i>hhhh</i></td>
 *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hhhh</i></td></tr>
+ * <tr><td valign="top" headers="construct characters"><tt>&#92;x</tt><i>{h...h}</i></td>
+ *     <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>h...h</i>
+ *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
+ *         &nbsp;&lt;=&nbsp;<tt>0x</tt><i>h...h</i>&nbsp;&lt;=&nbsp;
+ *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
 * <tr><td valign="top" headers="matches"><tt>\t</tt></td>
 *     <td headers="matches">The tab character (<tt>'&#92;u0009'</tt>)</td></tr>
 * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>
@@ -529,6 +534,13 @@ import java.util.Arrays;
 * while not equal, compile into the same pattern, which matches the character
 * with hexadecimal value <tt>0x2014</tt>.
 *
+ * <p> A Unicode character can also be represented in a regular expression by
+ * using its hexadecimal code point value directly, as described for the
+ * <tt>&#92;x{...}</tt> construct. For example, the supplementary character
+ * U+2011F can be specified as <tt>&#92;x{2011F}</tt> instead of the two
+ * consecutive Unicode escape sequences of its surrogate pair,
+ * <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.
+ *
 * <a name="ubc">
 * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
 * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
@@ -2993,6 +3005,16 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
            if (ASCII.isHexDigit(m)) {
                return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
            }
+        } else if (n == '{' && ASCII.isHexDigit(peek())) {
+            int ch = 0;
+            while (ASCII.isHexDigit(n = read())) {
+                ch = (ch << 4) + ASCII.toDigit(n);
+                if (ch > Character.MAX_CODE_POINT)
+                    throw error("Hexadecimal codepoint is too big");
+            }
+            if (n != '}')
+                throw error("Unclosed hexadecimal escape sequence");
+            return ch;
        }
        throw error("Illegal hexadecimal escape sequence");
    }

--- a/test/java/util/regex/RegExTest.java
+++ b/test/java/util/regex/RegExTest.java
@@ -32,7 +32,7 @@
 * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
 * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
- * 6350801 6676425 6878475 6919132 6931676 6948903
+ * 6350801 6676425 6878475 6919132 6931676 6948903 7014645
 */
 import java.util.regex.*;
@@ -136,6 +136,7 @@ public class RegExTest {
        namedGroupCaptureTest();
        nonBmpClassComplementTest();
        unicodePropertiesTest();
+        unicodeHexNotationTest();
        if (failure)
            throw new RuntimeException("Failure in the RE handling.");
        else
@@ -161,18 +162,19 @@ public class RegExTest {
    private static void check(Matcher m, String result, boolean expected) {
        m.find();
-        if (m.group().equals(result))
+        if (m.group().equals(result) != expected)
-            failCount += (expected) ? 0 : 1;
+            failCount++;
-        else
-            failCount += (expected) ? 1 : 0;
    }
    private static void check(Pattern p, String s, boolean expected) {
-        Matcher matcher = p.matcher(s);
+        if (p.matcher(s).find() != expected)
-        if (matcher.find())
+            failCount++;
-            failCount += (expected) ? 0 : 1;
+    }
-        else
-            failCount += (expected) ? 1 : 0;
+    private static void check(String p, String s, boolean expected) {
+        Matcher matcher = Pattern.compile(p).matcher(s);
+        if (matcher.find() != expected)
+            failCount++;
    }
    private static void check(String p, char c, boolean expected) {
@@ -3614,4 +3616,45 @@ public class RegExTest {
        }
        report("unicodeProperties");
    }
+    private static void unicodeHexNotationTest() throws Exception { // exercises the \x{h...h} hex code point construct (bug 7014645)
+        // negative: malformed or out-of-range \x{...} forms; checkExpectedFail is defined elsewhere (presumably asserts the pattern is rejected)
+        checkExpectedFail("\\x{-23}");
+        checkExpectedFail("\\x{110000}");
+        checkExpectedFail("\\x{}");
+        checkExpectedFail("\\x{AB[ef]");
+        // codepoint: U+1033C is the surrogate pair D800 DF3C in UTF-16; only the \x{1033c} form is expected to match it
+        check("^\\x{1033c}$",              "\uD800\uDF3C", true);
+        check("^\\xF0\\x90\\x8C\\xBC$",    "\uD800\uDF3C", false);
+        check("^\\x{D800}\\x{DF3c}+$",     "\uD800\uDF3C", false);
+        check("^\\xF0\\x90\\x8C\\xBC$",    "\uD800\uDF3C", false);
+        // in class: the same escapes inside [...]; expected to match only the reversed (unpaired) surrogates DF3C D800
+        check("^[\\x{D800}\\x{DF3c}]+$",   "\uD800\uDF3C", false);
+        check("^[\\xF0\\x90\\x8C\\xBC]+$", "\uD800\uDF3C", false);
+        check("^[\\x{D800}\\x{DF3C}]+$",   "\uD800\uDF3C", false);
+        check("^[\\x{DF3C}\\x{D800}]+$",   "\uD800\uDF3C", false);
+        check("^[\\x{D800}\\x{DF3C}]+$",   "\uDF3C\uD800", true);
+        check("^[\\x{DF3C}\\x{D800}]+$",   "\uDF3C\uD800", true);
+        for (int cp = 0; cp <= 0x10FFFF; cp++) { // every code point, U+0000 through U+10FFFF
+             String s = "A" + new String(Character.toChars(cp)) + "B";
+             String hexUTF16 = (cp <= 0xFFFF)? String.format("\\u%04x", cp) // one u-escape for a BMP char, else one per surrogate
+                                             : String.format("\\u%04x\\u%04x",
+                                               (int) Character.toChars(cp)[0],
+                                               (int) Character.toChars(cp)[1]);
+             String hexCodePoint = "\\x{" + Integer.toHexString(cp) + "}"; // the new notation, e.g. \x{1033c}
+             if (!Pattern.matches("A" + hexUTF16 + "B", s))
+                 failCount++;
+             if (!Pattern.matches("A[" + hexUTF16 + "]B", s))
+                 failCount++;
+             if (!Pattern.matches("A" + hexCodePoint + "B", s))
+                 failCount++;
+             if (!Pattern.matches("A[" + hexCodePoint + "]B", s))
+                 failCount++;
+         }
+         report("unicodeHexNotation");
+     }
 }