From 729dd870c12b4c094da5914f197707b6f0c89241 Mon Sep 17 00:00:00 2001 From: sherman Date: Thu, 3 Feb 2011 13:49:25 -0800 Subject: [PATCH] 7014645: Support perl style Unicode hex notation \x{...} Summary: Added the construct \x{...} for Unicode hex notation support Reviewed-by: alanb, okutsu --- .../classes/java/util/regex/Pattern.java | 22 +++++++ test/java/util/regex/RegExTest.java | 63 ++++++++++++++++--- 2 files changed, 75 insertions(+), 10 deletions(-) diff --git a/src/share/classes/java/util/regex/Pattern.java b/src/share/classes/java/util/regex/Pattern.java index 7c012e5de..e0494ab42 100644 --- a/src/share/classes/java/util/regex/Pattern.java +++ b/src/share/classes/java/util/regex/Pattern.java @@ -101,6 +101,11 @@ import java.util.Arrays; * The character with hexadecimal value 0xhh * \uhhhh * The character with hexadecimal value 0xhhhh + * \x{h...h} + * The character with hexadecimal value 0xh...h + * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} + *  <= 0xh...h <=  + * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT}) * \t * The tab character ('\u0009') * \n @@ -529,6 +534,13 @@ import java.util.Arrays; * while not equal, compile into the same pattern, which matches the character * with hexadecimal value 0x2014. * + *

A Unicode character can also be represented in a regular expression by + * using its hexadecimal code point value directly, as described in construct + * \x{...}; for example, the supplementary character U+2011F + * can be specified as \x{2011F} instead of two consecutive + * Unicode escape sequences of the surrogate pair + * \uD840\uDD1F. + * *

Unicode scripts, blocks and categories are written with the \p and * \P constructs as in Perl. \p{prop} matches if @@ -2993,6 +3005,16 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { if (ASCII.isHexDigit(m)) { return ASCII.toDigit(n) * 16 + ASCII.toDigit(m); } + } else if (n == '{' && ASCII.isHexDigit(peek())) { + int ch = 0; + while (ASCII.isHexDigit(n = read())) { + ch = (ch << 4) + ASCII.toDigit(n); + if (ch > Character.MAX_CODE_POINT) + throw error("Hexadecimal codepoint is too big"); + } + if (n != '}') + throw error("Unclosed hexadecimal escape sequence"); + return ch; } throw error("Illegal hexadecimal escape sequence"); } diff --git a/test/java/util/regex/RegExTest.java b/test/java/util/regex/RegExTest.java index 0e8470c4c..e323066c3 100644 --- a/test/java/util/regex/RegExTest.java +++ b/test/java/util/regex/RegExTest.java @@ -32,7 +32,7 @@ * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476 * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133 - * 6350801 6676425 6878475 6919132 6931676 6948903 + * 6350801 6676425 6878475 6919132 6931676 6948903 7014645 */ import java.util.regex.*; @@ -136,6 +136,7 @@ public class RegExTest { namedGroupCaptureTest(); nonBmpClassComplementTest(); unicodePropertiesTest(); + unicodeHexNotationTest(); if (failure) throw new RuntimeException("Failure in the RE handling."); else @@ -161,18 +162,19 @@ public class RegExTest { private static void check(Matcher m, String result, boolean expected) { m.find(); - if (m.group().equals(result)) - failCount += (expected) ? 0 : 1; - else - failCount += (expected) ? 1 : 0; + if (m.group().equals(result) != expected) + failCount++; } private static void check(Pattern p, String s, boolean expected) { - Matcher matcher = p.matcher(s); - if (matcher.find()) - failCount += (expected) ? 0 : 1; - else - failCount += (expected) ? 
1 : 0; + if (p.matcher(s).find() != expected) + failCount++; + } + + private static void check(String p, String s, boolean expected) { + Matcher matcher = Pattern.compile(p).matcher(s); + if (matcher.find() != expected) + failCount++; } private static void check(String p, char c, boolean expected) { @@ -3614,4 +3616,45 @@ public class RegExTest { } report("unicodeProperties"); } + + private static void unicodeHexNotationTest() throws Exception { + + // negative + checkExpectedFail("\\x{-23}"); + checkExpectedFail("\\x{110000}"); + checkExpectedFail("\\x{}"); + checkExpectedFail("\\x{AB[ef]"); + + // codepoint + check("^\\x{1033c}$", "\uD800\uDF3C", true); + check("^\\xF0\\x90\\x8C\\xBC$", "\uD800\uDF3C", false); + check("^\\x{D800}\\x{DF3c}+$", "\uD800\uDF3C", false); + check("^\\xF0\\x90\\x8C\\xBC$", "\uD800\uDF3C", false); + + // in class + check("^[\\x{D800}\\x{DF3c}]+$", "\uD800\uDF3C", false); + check("^[\\xF0\\x90\\x8C\\xBC]+$", "\uD800\uDF3C", false); + check("^[\\x{D800}\\x{DF3C}]+$", "\uD800\uDF3C", false); + check("^[\\x{DF3C}\\x{D800}]+$", "\uD800\uDF3C", false); + check("^[\\x{D800}\\x{DF3C}]+$", "\uDF3C\uD800", true); + check("^[\\x{DF3C}\\x{D800}]+$", "\uDF3C\uD800", true); + + for (int cp = 0; cp <= 0x10FFFF; cp++) { + String s = "A" + new String(Character.toChars(cp)) + "B"; + String hexUTF16 = (cp <= 0xFFFF)? String.format("\\u%04x", cp) + : String.format("\\u%04x\\u%04x", + (int) Character.toChars(cp)[0], + (int) Character.toChars(cp)[1]); + String hexCodePoint = "\\x{" + Integer.toHexString(cp) + "}"; + if (!Pattern.matches("A" + hexUTF16 + "B", s)) + failCount++; + if (!Pattern.matches("A[" + hexUTF16 + "]B", s)) + failCount++; + if (!Pattern.matches("A" + hexCodePoint + "B", s)) + failCount++; + if (!Pattern.matches("A[" + hexCodePoint + "]B", s)) + failCount++; + } + report("unicodeHexNotation"); + } } -- GitLab