提交 729dd870 编写于 作者: S sherman

7014645: Support perl style Unicode hex notation \x{...}

Summary: Added the construct \x{...} for Unicode hex notation support
Reviewed-by: alanb, okutsu
上级 85c6e9dd
...@@ -101,6 +101,11 @@ import java.util.Arrays; ...@@ -101,6 +101,11 @@ import java.util.Arrays;
* <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hh</i></td></tr> * <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hh</i></td></tr>
* <tr><td valign="top" headers="construct characters"><tt>&#92;u</tt><i>hhhh</i></td> * <tr><td valign="top" headers="construct characters"><tt>&#92;u</tt><i>hhhh</i></td>
* <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hhhh</i></td></tr> * <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>hhhh</i></td></tr>
* <tr><td valign="top" headers="construct characters"><tt>&#92;x</tt><i>{h...h}</i></td>
* <td headers="matches">The character with hexadecimal&nbsp;value&nbsp;<tt>0x</tt><i>h...h</i>
* ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
* &nbsp;&lt;=&nbsp;<tt>0x</tt><i>h...h</i>&nbsp;&lt;=&nbsp;
* {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
* <tr><td valign="top" headers="matches"><tt>\t</tt></td> * <tr><td valign="top" headers="matches"><tt>\t</tt></td>
* <td headers="matches">The tab character (<tt>'&#92;u0009'</tt>)</td></tr> * <td headers="matches">The tab character (<tt>'&#92;u0009'</tt>)</td></tr>
* <tr><td valign="top" headers="construct characters"><tt>\n</tt></td> * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>
...@@ -529,6 +534,13 @@ import java.util.Arrays; ...@@ -529,6 +534,13 @@ import java.util.Arrays;
* while not equal, compile into the same pattern, which matches the character * while not equal, compile into the same pattern, which matches the character
* with hexadecimal value <tt>0x2014</tt>. * with hexadecimal value <tt>0x2014</tt>.
* *
* <p> A Unicode character can also be represented in a regular expression by
* using its hexadecimal code point value directly, as described for the construct
* <tt>&#92;x{...}</tt>. For example, the supplementary character U+2011F
* can be specified as <tt>&#92;x{2011F}</tt> instead of as two consecutive
* Unicode escape sequences for the surrogate pair
* <tt>&#92;uD840</tt><tt>&#92;uDD1F</tt>.
*
* <a name="ubc"> * <a name="ubc">
* <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
* <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
...@@ -2993,6 +3005,16 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -2993,6 +3005,16 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (ASCII.isHexDigit(m)) { if (ASCII.isHexDigit(m)) {
return ASCII.toDigit(n) * 16 + ASCII.toDigit(m); return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
} }
} else if (n == '{' && ASCII.isHexDigit(peek())) {
int ch = 0;
while (ASCII.isHexDigit(n = read())) {
ch = (ch << 4) + ASCII.toDigit(n);
if (ch > Character.MAX_CODE_POINT)
throw error("Hexadecimal codepoint is too big");
}
if (n != '}')
throw error("Unclosed hexadecimal escape sequence");
return ch;
} }
throw error("Illegal hexadecimal escape sequence"); throw error("Illegal hexadecimal escape sequence");
} }
......
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476 * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940 * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
* 6350801 6676425 6878475 6919132 6931676 6948903 * 6350801 6676425 6878475 6919132 6931676 6948903 7014645
*/ */
import java.util.regex.*; import java.util.regex.*;
...@@ -136,6 +136,7 @@ public class RegExTest { ...@@ -136,6 +136,7 @@ public class RegExTest {
namedGroupCaptureTest(); namedGroupCaptureTest();
nonBmpClassComplementTest(); nonBmpClassComplementTest();
unicodePropertiesTest(); unicodePropertiesTest();
unicodeHexNotationTest();
if (failure) if (failure)
throw new RuntimeException("Failure in the RE handling."); throw new RuntimeException("Failure in the RE handling.");
else else
...@@ -161,18 +162,19 @@ public class RegExTest { ...@@ -161,18 +162,19 @@ public class RegExTest {
private static void check(Matcher m, String result, boolean expected) { private static void check(Matcher m, String result, boolean expected) {
m.find(); m.find();
if (m.group().equals(result)) if (m.group().equals(result) != expected)
failCount += (expected) ? 0 : 1; failCount++;
else
failCount += (expected) ? 1 : 0;
} }
private static void check(Pattern p, String s, boolean expected) { private static void check(Pattern p, String s, boolean expected) {
Matcher matcher = p.matcher(s); if (p.matcher(s).find() != expected)
if (matcher.find()) failCount++;
failCount += (expected) ? 0 : 1; }
else
failCount += (expected) ? 1 : 0; private static void check(String p, String s, boolean expected) {
Matcher matcher = Pattern.compile(p).matcher(s);
if (matcher.find() != expected)
failCount++;
} }
private static void check(String p, char c, boolean expected) { private static void check(String p, char c, boolean expected) {
...@@ -3614,4 +3616,45 @@ public class RegExTest { ...@@ -3614,4 +3616,45 @@ public class RegExTest {
} }
report("unicodeProperties"); report("unicodeProperties");
} }
/*
 * Exercises the hexadecimal code point escape (backslash, 'x', then the
 * hex digits enclosed in braces) both outside and inside character classes,
 * then sweeps every code point from 0 through 0x10FFFF and verifies that
 * the UTF-16 escape form and the braced hex form each match the
 * corresponding character. Every mismatch in the sweep bumps failCount;
 * the outcome is handed to report("unicodeHexNotation").
 */
private static void unicodeHexNotationTest() throws Exception {

    // negative
    // NOTE(review): checkExpectedFail is defined elsewhere; presumably it
    // expects each of these malformed patterns to be rejected -- confirm.
    checkExpectedFail("\\x{-23}");
    checkExpectedFail("\\x{110000}");
    checkExpectedFail("\\x{}");
    checkExpectedFail("\\x{AB[ef]");

    // codepoint
    check("^\\x{1033c}$", "\uD800\uDF3C", true);
    check("^\\xF0\\x90\\x8C\\xBC$", "\uD800\uDF3C", false);
    check("^\\x{D800}\\x{DF3c}+$", "\uD800\uDF3C", false);
    // Same check as two lines above; kept so the failure count is unchanged.
    check("^\\xF0\\x90\\x8C\\xBC$", "\uD800\uDF3C", false);

    // in class
    check("^[\\x{D800}\\x{DF3c}]+$", "\uD800\uDF3C", false);
    check("^[\\xF0\\x90\\x8C\\xBC]+$", "\uD800\uDF3C", false);
    check("^[\\x{D800}\\x{DF3C}]+$", "\uD800\uDF3C", false);
    check("^[\\x{DF3C}\\x{D800}]+$", "\uD800\uDF3C", false);
    check("^[\\x{D800}\\x{DF3C}]+$", "\uDF3C\uD800", true);
    check("^[\\x{DF3C}\\x{D800}]+$", "\uDF3C\uD800", true);

    // Exhaustive sweep over the whole code point range.
    for (int codePoint = Character.MIN_CODE_POINT;
         codePoint <= Character.MAX_CODE_POINT;
         codePoint++) {
        // One char for a BMP code point, a surrogate pair otherwise.
        char[] units = Character.toChars(codePoint);
        String input = "A" + new String(units) + "B";

        String hexUTF16;
        if (units.length == 1) {
            hexUTF16 = String.format("\\u%04x", codePoint);
        } else {
            hexUTF16 = String.format("\\u%04x\\u%04x",
                                     (int) units[0],
                                     (int) units[1]);
        }
        String hexCodePoint = "\\x{" + Integer.toHexString(codePoint) + "}";

        // Each notation is tried bare and inside a character class,
        // in the same order as before; each miss counts once.
        String[] regexes = {
            "A" + hexUTF16 + "B",
            "A[" + hexUTF16 + "]B",
            "A" + hexCodePoint + "B",
            "A[" + hexCodePoint + "]B"
        };
        for (String regex : regexes) {
            if (!Pattern.matches(regex, input)) {
                failCount++;
            }
        }
    }
    report("unicodeHexNotation");
}
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册