提交 8ff675ea 编写于 作者: S sherman

7039066: j.u.regex does not match TR18 RL1.4 Simple Word Boundaries and RL1.2 Properties

Summary: updated the regex Unicode property support
Reviewed-by: alanb
上级 bc3e9700
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package java.util.regex;
import java.util.HashMap;
import java.util.Locale;
/**
 * Code point predicates used by the regex engine for Unicode binary
 * properties and for the Unicode flavour of the POSIX character classes.
 *
 * <p>Each constant implements {@link #is(int)}; constants are looked up by
 * property name via {@link #forName(String)} or by POSIX class name via
 * {@link #forPOSIXName(String)}.
 */
enum UnicodeProp {

    /** Delegates to {@code Character.isAlphabetic}. */
    ALPHABETIC {
        @Override
        public boolean is(int ch) {
            return Character.isAlphabetic(ch);
        }
    },

    /** Delegates to {@code Character.isLetter}. */
    LETTER {
        @Override
        public boolean is(int ch) {
            return Character.isLetter(ch);
        }
    },

    /** Delegates to {@code Character.isIdeographic}. */
    IDEOGRAPHIC {
        @Override
        public boolean is(int ch) {
            return Character.isIdeographic(ch);
        }
    },

    /** Delegates to {@code Character.isLowerCase}. */
    LOWERCASE {
        @Override
        public boolean is(int ch) {
            return Character.isLowerCase(ch);
        }
    },

    /** Delegates to {@code Character.isUpperCase}. */
    UPPERCASE {
        @Override
        public boolean is(int ch) {
            return Character.isUpperCase(ch);
        }
    },

    /** Delegates to {@code Character.isTitleCase}. */
    TITLECASE {
        @Override
        public boolean is(int ch) {
            return Character.isTitleCase(ch);
        }
    },

    /**
     * \p{Whitespace}: any separator category (space, line, paragraph),
     * the range U+0009..U+000D, or U+0085.
     */
    WHITE_SPACE {
        @Override
        public boolean is(int ch) {
            return hasType(ch, SEPARATOR_TYPES)
                || (ch >= 0x9 && ch <= 0xd)
                || (ch == 0x85);
        }
    },

    /** \p{gc=Control}. */
    CONTROL {
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) == Character.CONTROL;
        }
    },

    /** \p{gc=Punctuation}: any of the seven punctuation categories. */
    PUNCTUATION {
        @Override
        public boolean is(int ch) {
            return hasType(ch, PUNCTUATION_TYPES);
        }
    },

    /**
     * \p{gc=Decimal_Number} plus the Hex_Digit ranges (ASCII and fullwidth
     * 0-9, A-F, a-f) listed explicitly below.
     */
    HEX_DIGIT {
        @Override
        public boolean is(int ch) {
            return DIGIT.is(ch)
                || (ch >= 0x0030 && ch <= 0x0039)
                || (ch >= 0x0041 && ch <= 0x0046)
                || (ch >= 0x0061 && ch <= 0x0066)
                || (ch >= 0xFF10 && ch <= 0xFF19)
                || (ch >= 0xFF21 && ch <= 0xFF26)
                || (ch >= 0xFF41 && ch <= 0xFF46);
        }
    },

    /** True for every code point whose category is not UNASSIGNED. */
    ASSIGNED {
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) != Character.UNASSIGNED;
        }
    },

    /**
     * Noncharacter_Code_Point: low 16 bits are FFFE or FFFF, or the code
     * point lies in U+FDD0..U+FDEF.
     */
    NONCHARACTER_CODE_POINT {
        @Override
        public boolean is(int ch) {
            return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
        }
    },

    /** Delegates to {@code Character.isDigit}. */
    DIGIT {
        @Override
        public boolean is(int ch) {
            return Character.isDigit(ch);
        }
    },

    /** Union of {@link #ALPHABETIC} and {@link #DIGIT}. */
    ALNUM {
        @Override
        public boolean is(int ch) {
            return ALPHABETIC.is(ch) || DIGIT.is(ch);
        }
    },

    /** Space_Separator category, or U+0009 (horizontal tab). */
    BLANK {
        @Override
        public boolean is(int ch) {
            return Character.getType(ch) == Character.SPACE_SEPARATOR
                || ch == 0x9; // \N{HT}
        }
    },

    /**
     * Everything whose category is none of: the three separator
     * categories, Control, Surrogate, Unassigned.
     */
    GRAPH {
        @Override
        public boolean is(int ch) {
            return !hasType(ch, NON_GRAPH_TYPES);
        }
    },

    /** ({@link #GRAPH} or {@link #BLANK}) minus {@link #CONTROL}. */
    PRINT {
        @Override
        public boolean is(int ch) {
            return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch);
        }
    },

    /**
     * {@link #ALPHABETIC}, or category Mark (non-spacing, enclosing,
     * combining spacing), Decimal_Number or Connector_Punctuation.
     */
    WORD {
        @Override
        public boolean is(int ch) {
            return ALPHABETIC.is(ch) || hasType(ch, WORD_TYPES);
        }
    };

    // Bit masks indexed by the value returned from Character.getType.
    private static final int SEPARATOR_TYPES =
          (1 << Character.SPACE_SEPARATOR)
        | (1 << Character.LINE_SEPARATOR)
        | (1 << Character.PARAGRAPH_SEPARATOR);

    private static final int PUNCTUATION_TYPES =
          (1 << Character.CONNECTOR_PUNCTUATION)
        | (1 << Character.DASH_PUNCTUATION)
        | (1 << Character.START_PUNCTUATION)
        | (1 << Character.END_PUNCTUATION)
        | (1 << Character.OTHER_PUNCTUATION)
        | (1 << Character.INITIAL_QUOTE_PUNCTUATION)
        | (1 << Character.FINAL_QUOTE_PUNCTUATION);

    private static final int NON_GRAPH_TYPES =
          SEPARATOR_TYPES
        | (1 << Character.CONTROL)
        | (1 << Character.SURROGATE)
        | (1 << Character.UNASSIGNED);

    private static final int WORD_TYPES =
          (1 << Character.NON_SPACING_MARK)
        | (1 << Character.ENCLOSING_MARK)
        | (1 << Character.COMBINING_SPACING_MARK)
        | (1 << Character.DECIMAL_DIGIT_NUMBER)
        | (1 << Character.CONNECTOR_PUNCTUATION);

    /** POSIX class name (upper case) to enum constant name. */
    private final static HashMap<String, String> posix = new HashMap<>();

    /** Alternative property spellings (upper case) to enum constant name. */
    private final static HashMap<String, String> aliases = new HashMap<>();

    static {
        posix.put("ALPHA", "ALPHABETIC");
        posix.put("LOWER", "LOWERCASE");
        posix.put("UPPER", "UPPERCASE");
        posix.put("SPACE", "WHITE_SPACE");
        posix.put("PUNCT", "PUNCTUATION");
        posix.put("XDIGIT", "HEX_DIGIT");
        posix.put("ALNUM", "ALNUM");
        posix.put("CNTRL", "CONTROL");
        posix.put("DIGIT", "DIGIT");
        posix.put("BLANK", "BLANK");
        posix.put("GRAPH", "GRAPH");
        posix.put("PRINT", "PRINT");

        aliases.put("WHITESPACE", "WHITE_SPACE");
        aliases.put("HEXDIGIT", "HEX_DIGIT");
        aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT");
    }

    /** Tests whether the general category of {@code ch} is one of the bits set in {@code mask}. */
    private static boolean hasType(int ch, int mask) {
        return ((mask >> Character.getType(ch)) & 1) != 0;
    }

    /**
     * Looks up a property by name, ignoring case and accepting the
     * spellings registered in the alias table.
     *
     * @param propName property name
     * @return the matching constant, or {@code null} when the name is unknown
     */
    public static UnicodeProp forName(String propName) {
        String key = propName.toUpperCase(Locale.ENGLISH);
        String canonical = aliases.get(key);
        try {
            return valueOf(canonical != null ? canonical : key);
        } catch (IllegalArgumentException ignored) {
            // unknown names are reported to the caller as null
            return null;
        }
    }

    /**
     * Looks up the property that backs a POSIX character class name,
     * ignoring case.
     *
     * @param propName POSIX class name such as {@code "Alpha"}
     * @return the matching constant, or {@code null} when the name is not
     *         a registered POSIX class
     */
    public static UnicodeProp forPOSIXName(String propName) {
        String mapped = posix.get(propName.toUpperCase(Locale.ENGLISH));
        return (mapped == null) ? null : valueOf(mapped);
    }

    /**
     * Tests whether the given code point has this property.
     *
     * @param ch code point to test
     * @return {@code true} if the code point has the property
     */
    public abstract boolean is(int ch);
}
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
 * Reference implementation of the ASCII-only POSIX character classes,
 * backed by a 128-entry table of type flags.
 *
 * <p>Each table entry combines the class flags below; for digits and
 * letters the low byte additionally carries the character's numeric value
 * (0-9 for digits, 10-35 for letters of either case).
 */
final class POSIX_ASCII {

    // Single-bit class flags.
    static final int UPPER = 1 << 8;
    static final int LOWER = 1 << 9;
    static final int DIGIT = 1 << 10;
    static final int SPACE = 1 << 11;
    static final int PUNCT = 1 << 12;
    static final int CNTRL = 1 << 13;
    static final int BLANK = 1 << 14;
    static final int HEX   = 1 << 15;
    static final int UNDER = 1 << 16;

    // Composite masks.
    static final int ASCII  = 0x0000FF00;
    static final int ALPHA  = UPPER | LOWER;
    static final int ALNUM  = ALPHA | DIGIT;
    static final int GRAPH  = PUNCT | ALNUM;
    static final int WORD   = ALNUM | UNDER;
    static final int XDIGIT = HEX;

    /** Type flags for code points 0x00..0x7F. */
    private static final int[] ctype = buildTable();

    /** Fills the table by classifying each ASCII code point in turn. */
    private static int[] buildTable() {
        int[] table = new int[0x80];
        for (int c = 0; c < table.length; c++) {
            table[c] = classify(c);
        }
        return table;
    }

    /** Computes the table entry for a single ASCII code point. */
    private static int classify(int c) {
        if (c == 0x09) {                         // HT
            return SPACE + CNTRL + BLANK;
        }
        if (c >= 0x0A && c <= 0x0D) {            // LF, VT, FF, CR
            return SPACE + CNTRL;
        }
        if (c < 0x20 || c == 0x7F) {             // remaining C0 controls and DEL
            return CNTRL;
        }
        if (c == 0x20) {
            return SPACE + BLANK;
        }
        if (c >= '0' && c <= '9') {
            return DIGIT + HEX + (c - '0');
        }
        if (c >= 'A' && c <= 'Z') {
            int entry = UPPER + 10 + (c - 'A');
            return (c <= 'F') ? entry + HEX : entry;
        }
        if (c >= 'a' && c <= 'z') {
            int entry = LOWER + 10 + (c - 'a');
            return (c <= 'f') ? entry + HEX : entry;
        }
        if (c == '_') {
            return PUNCT | UNDER;
        }
        return PUNCT;                            // every other printable ASCII character
    }

    /** Returns the table entry for {@code ch}, or 0 when it is outside 0x00..0x7F. */
    static int getType(int ch) {
        if (ch >= 0 && ch < 0x80) {
            return ctype[ch];
        }
        return 0;
    }

    /** Tests whether the entry for {@code ch} shares any bit with {@code type}. */
    static boolean isType(int ch, int type) {
        return (getType(ch) & type) != 0;
    }

    /** Tests whether {@code ch} lies in 0x00..0x7F. */
    static boolean isAscii(int ch) {
        return ch >= 0 && ch < 0x80;
    }

    static boolean isAlpha(int ch) {
        return isType(ch, ALPHA);
    }

    /** Tests whether {@code ch} is one of '0'..'9'. */
    static boolean isDigit(int ch) {
        return ch >= '0' && ch <= '9';
    }

    static boolean isAlnum(int ch) {
        return isType(ch, ALNUM);
    }

    static boolean isGraph(int ch) {
        return isType(ch, GRAPH);
    }

    /** Tests whether {@code ch} lies in 0x20..0x7E. */
    static boolean isPrint(int ch) {
        return ch >= 0x20 && ch <= 0x7E;
    }

    static boolean isPunct(int ch) {
        return isType(ch, PUNCT);
    }

    static boolean isSpace(int ch) {
        return isType(ch, SPACE);
    }

    static boolean isHexDigit(int ch) {
        return isType(ch, HEX);
    }

    static boolean isCntrl(int ch) {
        return isType(ch, CNTRL);
    }

    /** Tests whether {@code ch} is one of 'a'..'z'. */
    static boolean isLower(int ch) {
        return ch >= 'a' && ch <= 'z';
    }

    /** Tests whether {@code ch} is one of 'A'..'Z'. */
    static boolean isUpper(int ch) {
        return ch >= 'A' && ch <= 'Z';
    }

    static boolean isWord(int ch) {
        return isType(ch, WORD);
    }
}
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
import java.util.HashMap;
import java.util.Locale;
/**
 * Reference implementation of the Unicode flavour of the POSIX character
 * classes, written directly against {@link Character} so that regex
 * results can be compared against it.
 */
public final class POSIX_Unicode {

    /** Delegates to {@code Character.isAlphabetic}. */
    public static boolean isAlpha(int ch) {
        return Character.isAlphabetic(ch);
    }

    /** Delegates to {@code Character.isLowerCase}. */
    public static boolean isLower(int ch) {
        return Character.isLowerCase(ch);
    }

    /** Delegates to {@code Character.isUpperCase}. */
    public static boolean isUpper(int ch) {
        return Character.isUpperCase(ch);
    }

    /**
     * \p{Whitespace}: category Space/Line/Paragraph separator, the range
     * U+0009..U+000D, or U+0085.
     */
    public static boolean isSpace(int ch) {
        return ((((1 << Character.SPACE_SEPARATOR) |
                  (1 << Character.LINE_SEPARATOR) |
                  (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
                   != 0 ||
               (ch >= 0x9 && ch <= 0xd) ||
               (ch == 0x85);
    }

    /** \p{gc=Control}. */
    public static boolean isCntrl(int ch) {
        return Character.getType(ch) == Character.CONTROL;
    }

    /** \p{gc=Punctuation}: any of the seven punctuation categories. */
    public static boolean isPunct(int ch) {
        return ((((1 << Character.CONNECTOR_PUNCTUATION) |
                  (1 << Character.DASH_PUNCTUATION) |
                  (1 << Character.START_PUNCTUATION) |
                  (1 << Character.END_PUNCTUATION) |
                  (1 << Character.OTHER_PUNCTUATION) |
                  (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
                  (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
                   != 0;
    }

    /**
     * \p{gc=Decimal_Number} plus \p{Hex_Digit} (PropList.txt: Hex_Digit),
     * the latter spelled out as the ASCII and fullwidth 0-9, A-F, a-f ranges.
     */
    public static boolean isHexDigit(int ch) {
        return Character.isDigit(ch) ||
               (ch >= 0x0030 && ch <= 0x0039) ||
               (ch >= 0x0041 && ch <= 0x0046) ||
               (ch >= 0x0061 && ch <= 0x0066) ||
               (ch >= 0xFF10 && ch <= 0xFF19) ||
               (ch >= 0xFF21 && ch <= 0xFF26) ||
               (ch >= 0xFF41 && ch <= 0xFF46);
    }

    /** \p{gc=Decimal_Number}; delegates to {@code Character.isDigit}. */
    public static boolean isDigit(int ch) {
        return Character.isDigit(ch);
    }

    /** Union of \p{alpha} and \p{digit}. */
    public static boolean isAlnum(int ch) {
        return Character.isAlphabetic(ch) || Character.isDigit(ch);
    }

    /**
     * \p{Whitespace} minus the line-breaking characters: LF, VT, FF, CR,
     * NEL (0xa, 0xb, 0xc, 0xd, 0x85) and the Line_Separator and
     * Paragraph_Separator categories.
     */
    public static boolean isBlank(int ch) {
        int type = Character.getType(ch);
        // All conjuncts are short-circuit '&&'; the original mixed in a
        // single non-short-circuit '&', which gave the same result.
        return isSpace(ch) &&
               ch != 0xa && ch != 0xb && ch != 0xc && ch != 0xd && ch != 0x85 &&
               type != Character.LINE_SEPARATOR &&
               type != Character.PARAGRAPH_SEPARATOR;
    }

    /**
     * Complement of: \p{space}, \p{gc=Control}, \p{gc=Surrogate},
     * \p{gc=Unassigned}.
     */
    public static boolean isGraph(int ch) {
        int type = Character.getType(ch);
        return !(isSpace(ch) ||
                 Character.CONTROL == type ||
                 Character.SURROGATE == type ||
                 Character.UNASSIGNED == type);
    }

    /** (\p{graph} or \p{blank}) minus \p{cntrl}. */
    public static boolean isPrint(int ch) {
        return (isGraph(ch) || isBlank(ch)) && !isCntrl(ch);
    }

    /**
     * PropList.txt Noncharacter_Code_Point: low 16 bits are FFFE or FFFF,
     * or the code point lies in U+FDD0..U+FDEF.
     */
    public static boolean isNoncharacterCodePoint(int ch) {
        return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
    }

    /**
     * Union of \p{alpha}, \p{gc=Mark}, \p{digit} and
     * \p{gc=Connector_Punctuation}.
     */
    public static boolean isWord(int ch) {
        return isAlpha(ch) ||
               ((((1 << Character.NON_SPACING_MARK) |
                  (1 << Character.ENCLOSING_MARK) |
                  (1 << Character.COMBINING_SPACING_MARK) |
                  (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
                   != 0 ||
               isDigit(ch);
    }
}
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476 * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940 * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
* 6350801 6676425 6878475 6919132 6931676 6948903 7014645 * 6350801 6676425 6878475 6919132 6931676 6948903 7014645 7039066
*/ */
import java.util.regex.*; import java.util.regex.*;
...@@ -137,6 +137,7 @@ public class RegExTest { ...@@ -137,6 +137,7 @@ public class RegExTest {
nonBmpClassComplementTest(); nonBmpClassComplementTest();
unicodePropertiesTest(); unicodePropertiesTest();
unicodeHexNotationTest(); unicodeHexNotationTest();
unicodeClassesTest();
if (failure) if (failure)
throw new RuntimeException("Failure in the RE handling."); throw new RuntimeException("Failure in the RE handling.");
else else
...@@ -3656,5 +3657,146 @@ public class RegExTest { ...@@ -3656,5 +3657,146 @@ public class RegExTest {
failCount++; failCount++;
} }
report("unicodeHexNotation"); report("unicodeHexNotation");
} }
/**
 * Checks the predefined and POSIX character classes, the word boundary
 * and the binary property / javaMethod classes against reference
 * predicates, for every code point in 1..0x2FFFF.
 *
 * Three flavours are compared: the default (ASCII) classes against
 * POSIX_ASCII, and the classes compiled with UNICODE_CHARACTER_CLASS or
 * with the embedded (?U) flag against POSIX_Unicode / Character.
 * Each mismatching code point increments failCount once.
 */
private static void unicodeClassesTest() throws Exception {

    // Default (ASCII-only) classes.
    // ASCII is compiled but not consulted in the comparisons below.
    Matcher lower  = Pattern.compile("\\p{Lower}").matcher("");
    Matcher upper  = Pattern.compile("\\p{Upper}").matcher("");
    Matcher ASCII  = Pattern.compile("\\p{ASCII}").matcher("");
    Matcher alpha  = Pattern.compile("\\p{Alpha}").matcher("");
    Matcher digit  = Pattern.compile("\\p{Digit}").matcher("");
    Matcher alnum  = Pattern.compile("\\p{Alnum}").matcher("");
    Matcher punct  = Pattern.compile("\\p{Punct}").matcher("");
    Matcher graph  = Pattern.compile("\\p{Graph}").matcher("");
    Matcher print  = Pattern.compile("\\p{Print}").matcher("");
    Matcher blank  = Pattern.compile("\\p{Blank}").matcher("");
    Matcher cntrl  = Pattern.compile("\\p{Cntrl}").matcher("");
    Matcher xdigit = Pattern.compile("\\p{XDigit}").matcher("");
    Matcher space  = Pattern.compile("\\p{Space}").matcher("");
    Matcher bound  = Pattern.compile("\\b").matcher("");
    Matcher word   = Pattern.compile("\\w++").matcher("");

    // UNICODE_CHARACTER_CLASS
    // ASCIIU and boundU are compiled but not consulted in the comparisons below.
    Matcher lowerU  = Pattern.compile("\\p{Lower}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher upperU  = Pattern.compile("\\p{Upper}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher ASCIIU  = Pattern.compile("\\p{ASCII}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher alphaU  = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher digitU  = Pattern.compile("\\p{Digit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher alnumU  = Pattern.compile("\\p{Alnum}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher punctU  = Pattern.compile("\\p{Punct}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher graphU  = Pattern.compile("\\p{Graph}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher printU  = Pattern.compile("\\p{Print}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher blankU  = Pattern.compile("\\p{Blank}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher cntrlU  = Pattern.compile("\\p{Cntrl}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher xdigitU = Pattern.compile("\\p{XDigit}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher spaceU  = Pattern.compile("\\p{Space}", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher boundU  = Pattern.compile("\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher wordU   = Pattern.compile("\\w", Pattern.UNICODE_CHARACTER_CLASS).matcher("");

    // embedded flag (?U)
    // Compiled WITHOUT the UNICODE_CHARACTER_CLASS flag so that the
    // Unicode behaviour can only come from the embedded (?U) itself.
    Matcher lowerEU = Pattern.compile("(?U)\\p{Lower}").matcher("");
    Matcher graphEU = Pattern.compile("(?U)\\p{Graph}").matcher("");
    Matcher wordEU  = Pattern.compile("(?U)\\w").matcher("");

    Matcher bwb   = Pattern.compile("\\b\\w\\b").matcher("");
    Matcher bwbU  = Pattern.compile("\\b\\w++\\b", Pattern.UNICODE_CHARACTER_CLASS).matcher("");
    Matcher bwbEU = Pattern.compile("(?U)\\b\\w++\\b").matcher("");

    // properties
    Matcher lowerP   = Pattern.compile("\\p{IsLowerCase}").matcher("");
    Matcher upperP   = Pattern.compile("\\p{IsUpperCase}").matcher("");
    Matcher titleP   = Pattern.compile("\\p{IsTitleCase}").matcher("");
    Matcher letterP  = Pattern.compile("\\p{IsLetter}").matcher("");
    Matcher alphaP   = Pattern.compile("\\p{IsAlphabetic}").matcher("");
    Matcher ideogP   = Pattern.compile("\\p{IsIdeographic}").matcher("");
    Matcher cntrlP   = Pattern.compile("\\p{IsControl}").matcher("");
    Matcher spaceP   = Pattern.compile("\\p{IsWhiteSpace}").matcher("");
    Matcher definedP = Pattern.compile("\\p{IsAssigned}").matcher("");
    Matcher nonCCPP  = Pattern.compile("\\p{IsNoncharacterCodePoint}").matcher("");

    // javaMethod
    Matcher lowerJ = Pattern.compile("\\p{javaLowerCase}").matcher("");
    Matcher upperJ = Pattern.compile("\\p{javaUpperCase}").matcher("");
    Matcher alphaJ = Pattern.compile("\\p{javaAlphabetic}").matcher("");
    Matcher ideogJ = Pattern.compile("\\p{javaIdeographic}").matcher("");

    for (int cp = 1; cp < 0x30000; cp++) {
        String str = new String(Character.toChars(cp));
        int type = Character.getType(cp);
        if (// lower
            POSIX_ASCII.isLower(cp)   != lower.reset(str).matches()  ||
            Character.isLowerCase(cp) != lowerU.reset(str).matches() ||
            Character.isLowerCase(cp) != lowerP.reset(str).matches() ||
            Character.isLowerCase(cp) != lowerEU.reset(str).matches()||
            Character.isLowerCase(cp) != lowerJ.reset(str).matches()||
            // upper
            POSIX_ASCII.isUpper(cp)   != upper.reset(str).matches()  ||
            POSIX_Unicode.isUpper(cp) != upperU.reset(str).matches() ||
            Character.isUpperCase(cp) != upperP.reset(str).matches() ||
            Character.isUpperCase(cp) != upperJ.reset(str).matches() ||
            // alpha
            POSIX_ASCII.isAlpha(cp)   != alpha.reset(str).matches()  ||
            POSIX_Unicode.isAlpha(cp) != alphaU.reset(str).matches() ||
            Character.isAlphabetic(cp)!= alphaP.reset(str).matches() ||
            Character.isAlphabetic(cp)!= alphaJ.reset(str).matches() ||
            // digit
            POSIX_ASCII.isDigit(cp)   != digit.reset(str).matches()  ||
            Character.isDigit(cp)     != digitU.reset(str).matches() ||
            // alnum
            POSIX_ASCII.isAlnum(cp)   != alnum.reset(str).matches()  ||
            POSIX_Unicode.isAlnum(cp) != alnumU.reset(str).matches() ||
            // punct
            POSIX_ASCII.isPunct(cp)   != punct.reset(str).matches()  ||
            POSIX_Unicode.isPunct(cp) != punctU.reset(str).matches() ||
            // graph
            POSIX_ASCII.isGraph(cp)   != graph.reset(str).matches()  ||
            POSIX_Unicode.isGraph(cp) != graphU.reset(str).matches() ||
            POSIX_Unicode.isGraph(cp) != graphEU.reset(str).matches()||
            // blank
            POSIX_ASCII.isType(cp, POSIX_ASCII.BLANK)
                                      != blank.reset(str).matches()  ||
            POSIX_Unicode.isBlank(cp) != blankU.reset(str).matches() ||
            // print
            POSIX_ASCII.isPrint(cp)   != print.reset(str).matches()  ||
            POSIX_Unicode.isPrint(cp) != printU.reset(str).matches() ||
            // cntrl
            POSIX_ASCII.isCntrl(cp)   != cntrl.reset(str).matches()  ||
            POSIX_Unicode.isCntrl(cp) != cntrlU.reset(str).matches() ||
            (Character.CONTROL == type) != cntrlP.reset(str).matches() ||
            // hexdigit
            POSIX_ASCII.isHexDigit(cp)   != xdigit.reset(str).matches()  ||
            POSIX_Unicode.isHexDigit(cp) != xdigitU.reset(str).matches() ||
            // space
            POSIX_ASCII.isSpace(cp)   != space.reset(str).matches()  ||
            POSIX_Unicode.isSpace(cp) != spaceU.reset(str).matches() ||
            POSIX_Unicode.isSpace(cp) != spaceP.reset(str).matches() ||
            // word
            POSIX_ASCII.isWord(cp)   != word.reset(str).matches()  ||
            POSIX_Unicode.isWord(cp) != wordU.reset(str).matches() ||
            POSIX_Unicode.isWord(cp) != wordEU.reset(str).matches()||
            // bwordb
            POSIX_ASCII.isWord(cp)   != bwb.reset(str).matches() ||
            POSIX_Unicode.isWord(cp) != bwbU.reset(str).matches() ||
            // properties
            Character.isTitleCase(cp) != titleP.reset(str).matches() ||
            Character.isLetter(cp)    != letterP.reset(str).matches()||
            Character.isIdeographic(cp) != ideogP.reset(str).matches() ||
            Character.isIdeographic(cp) != ideogJ.reset(str).matches() ||
            // IsAssigned must be the exact opposite of "type is UNASSIGNED"
            (Character.UNASSIGNED == type) == definedP.reset(str).matches() ||
            POSIX_Unicode.isNoncharacterCodePoint(cp) != nonCCPP.reset(str).matches())
            failCount++;
    }

    // bounds/word align
    twoFindIndexes(" \u0180sherman\u0400 ", bound, 1, 10);
    if (!bwbU.reset("\u0180sherman\u0400").matches())
        failCount++;
    twoFindIndexes(" \u0180sh\u0345erman\u0400 ", bound, 1, 11);
    if (!bwbU.reset("\u0180sh\u0345erman\u0400").matches())
        failCount++;
    twoFindIndexes(" \u0724\u0739\u0724 ", bound, 1, 4);
    if (!bwbU.reset("\u0724\u0739\u0724").matches())
        failCount++;
    if (!bwbEU.reset("\u0724\u0739\u0724").matches())
        failCount++;
    report("unicodePredefinedClasses");
}
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册