提交 3f6679ba 编写于 作者: S sherman

6945564: Unicode script support in Character class

6948903: Make Unicode scripts available for use in regular expressions
Summary: added Unicode script support
Reviewed-by: martin
上级 7a89f869
...@@ -34,6 +34,7 @@ JAVA_JAVA_java = \ ...@@ -34,6 +34,7 @@ JAVA_JAVA_java = \
java/lang/Thread.java \ java/lang/Thread.java \
java/lang/Character.java \ java/lang/Character.java \
java/lang/CharacterData.java \ java/lang/CharacterData.java \
java/lang/CharacterName.java \
sun/misc/ASCIICaseInsensitiveComparator.java \ sun/misc/ASCIICaseInsensitiveComparator.java \
sun/misc/VM.java \ sun/misc/VM.java \
sun/misc/Signal.java \ sun/misc/Signal.java \
......
...@@ -384,6 +384,27 @@ clean:: ...@@ -384,6 +384,27 @@ clean::
$(RM) $(GENSRCDIR)/java/lang/CharacterDataUndefined.java $(RM) $(GENSRCDIR)/java/lang/CharacterDataUndefined.java
$(RM) $(GENSRCDIR)/java/lang/CharacterDataPrivateUse.java $(RM) $(GENSRCDIR)/java/lang/CharacterDataPrivateUse.java
#
# Rules to generate classes/java/lang/uniName.dat
#
# uniName.dat is produced by running the build tool
# build.tools.generatecharacter.CharacterName over UnicodeData.txt.
# Output location of the generated data file.
UNINAME = $(CLASSBINDIR)/java/lang/uniName.dat
# NOTE(review): GENERATEUNINAME_JARFILE is defined here, but the rule below
# references GENERATECHARACTER_JARFILE (presumably defined elsewhere in this
# Makefile) - confirm which name is intended.
GENERATEUNINAME_JARFILE = $(BUILDTOOLJARDIR)/generatecharacter.jar
# Make the data file part of the normal build.
build: $(UNINAME)
# Regenerate whenever UnicodeData.txt or the generator jar changes.
$(UNINAME): $(UNICODEDATA)/UnicodeData.txt \
$(GENERATECHARACTER_JARFILE)
@$(prep-target)
$(BOOT_JAVA_CMD) -classpath $(GENERATECHARACTER_JARFILE) \
build.tools.generatecharacter.CharacterName \
$(UNICODEDATA)/UnicodeData.txt $(UNINAME)
# Remove the generated data file.
clean::
$(RM) $(UNINAME)
# #
# End of rules to create $(GENSRCDIR)/java/lang/CharacterDataXX.java # End of rules to create $(GENSRCDIR)/java/lang/CharacterDataXX.java
# #
......
此差异已折叠。
package build.tools.generatecharacter;
import java.io.*;
import java.nio.*;
import java.util.*;
import java.util.zip.*;
/**
 * Build-time tool that packs the character names found in UnicodeData.txt
 * into a compressed data file.
 *
 * <p>Usage: {@code java CharacterName UnicodeData.txt uniName.dat}
 *
 * <p>The output is a deflated stream with this layout:
 * <pre>
 *   int    total    length of the code point table plus the name pool
 *   int    cpLen    length of the code point table
 *   byte[] table    one record per named code point (see below)
 *   byte[] names    all names concatenated, ASCII encoded
 * </pre>
 * A table record is a single byte holding the name length when the code
 * point directly follows the previously written one. Otherwise it is a zero
 * byte (segment start flag) followed by a big-endian int whose top byte is
 * the name length and whose low 24 bits are the code point.
 */
public class CharacterName {

    /**
     * Reads the UnicodeData.txt file named by {@code args[0]} and writes the
     * packed name table to the file named by {@code args[1]}.
     *
     * @param args input file name followed by output file name
     */
    public static void main(String[] args) {
        FileReader reader = null;
        DataOutputStream dos = null;
        try {
            if (args.length != 2) {
                System.err.println("Usage: java CharacterName UnicodeData.txt uniName.dat");
                System.exit(1);
            }

            reader = new FileReader(args[0]);
            BufferedReader bfr = new BufferedReader(reader);
            String line = null;

            StringBuilder namePool = new StringBuilder();
            byte[] cpPoolBytes = new byte[0x100000];
            ByteBuffer cpBB = ByteBuffer.wrap(cpPoolBytes);
            int lastCp = 0;

            while ((line = bfr.readLine()) != null) {
                if (line.startsWith("#"))
                    continue;
                UnicodeSpec spec = UnicodeSpec.parse(line);
                if (spec != null) {
                    int cp = spec.getCodePoint();
                    String name = spec.getName();
                    if (name.equals("<control>") && spec.getOldName() != null) {
                        // Control characters are listed under their old
                        // name; those without one get no entry at all.
                        if (spec.getOldName().length() != 0)
                            name = spec.getOldName();
                        else
                            continue;
                    } else if (name.startsWith("<")) {
                        /*
                          Range markers carry no real name and are skipped:
                          3400    <CJK Ideograph Extension A, First>
                          4db5    <CJK Ideograph Extension A, Last>
                          4e00    <CJK Ideograph, First>
                          9fc3    <CJK Ideograph, Last>
                          ac00    <Hangul Syllable, First>
                          d7a3    <Hangul Syllable, Last>
                          d800    <Non Private Use High Surrogate, First>
                          db7f    <Non Private Use High Surrogate, Last>
                          db80    <Private Use High Surrogate, First>
                          dbff    <Private Use High Surrogate, Last>
                          dc00    <Low Surrogate, First>
                          dfff    <Low Surrogate, Last>
                          e000    <Private Use, First>
                          f8ff    <Private Use, Last>
                          20000   <CJK Ideograph Extension B, First>
                          2a6d6   <CJK Ideograph Extension B, Last>
                          f0000   <Plane 15 Private Use, First>
                          ffffd   <Plane 15 Private Use, Last>
                        */
                        continue;
                    }

                    if (cp == lastCp + 1) {
                        // continues the current run: the length is enough
                        cpBB.put((byte)name.length());
                    } else {
                        cpBB.put((byte)0);  // segment start flag
                        cpBB.putInt((name.length() << 24) | (cp & 0xffffff));
                    }
                    namePool.append(name);
                    lastCp = cp;
                }
            }

            byte[] namePoolBytes = namePool.toString().getBytes("ASCII");
            int cpLen = cpBB.position();
            int total = cpLen + namePoolBytes.length;

            dos = new DataOutputStream(
                      new DeflaterOutputStream(
                          new FileOutputStream(args[1])));
            dos.writeInt(total);  // total
            dos.writeInt(cpLen);  // nameOff
            dos.write(cpPoolBytes, 0, cpLen);
            dos.write(namePoolBytes);
            dos.close();
            dos = null;           // closed cleanly, nothing left for finally
        } catch (Throwable e) {
            // NOTE(review): the failure is only reported; the exit status
            // stays 0, so a calling build will not notice it.
            System.out.println("Unexpected exception:");
            e.printStackTrace();
        } finally {
            // Still set only when writing failed part-way: release the file.
            if (dos != null) {
                try {
                    dos.close();
                } catch (Throwable ee) { ee.printStackTrace(); }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (Throwable ee) { ee.printStackTrace(); }
            }
        }
    }
}
import java.util.regex.*;
import java.util.*;
import java.io.*;
/**
 * Build-time helper that turns a Unicode script data file into the source
 * text needed for j.l.C.UnicodeScript.
 *
 * <p>The single argument names the data file. Every line of the form
 * {@code start[..end] ; ScriptName # comment} contributes a code point
 * range; a range that directly continues the previous one with the same
 * script name is merged into it. The tool prints to standard output:
 * <ol>
 * <li>one line per listed code point with its upper-cased script name;
 * <li>a {@code Scripts} class holding the {@code UnicodeScript} enum plus the
 *     {@code scriptStarts} / {@code scripts} lookup tables.
 * </ol>
 */
public class CharacterScript {

    // generate the code needed for j.l.C.UnicodeScript

    /** Test output hook; the print call is commented out, so it does nothing. */
    static void fortest(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /** Writes one formatted piece of the generated output to standard output. */
    static void print(String fmt, Object... o) {
        System.out.printf(fmt, o);
    }

    /** Diagnostic hook; the print call is commented out, so it does nothing. */
    static void debug(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /**
     * Appends the range {@code start..end} to {@code ranges}, assigning the
     * script name the next free id on its first appearance. Each stored
     * element is {@code {start, end, scriptId}}.
     */
    private static void addRange(List<int[]> ranges,
                                 Map<String,Integer> scriptMap,
                                 int start, int end, String name) {
        Integer id = scriptMap.get(name);
        if (id == null) {
            id = Integer.valueOf(scriptMap.size());
            scriptMap.put(name, id);
        }
        ranges.add(new int[] { start, end, id.intValue() });
    }

    /**
     * Runs the generator.
     *
     * @param args exactly one element: the path of the script data file
     */
    public static void main(String args[]){
        BufferedReader sbfr = null;
        try {
            if (args.length != 1) {
                System.out.println("java CharacterScript script.txt out");
                System.exit(1);
            }

            int i, j;
            sbfr = new BufferedReader(new FileReader(args[0]));
            HashMap<String,Integer> scriptMap = new HashMap<String,Integer>();
            String line = null;

            Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*").matcher("");

            int prevS = -1;
            int prevE = -1;
            String prevN = null;
            // Grows as needed, so the number of ranges is not capped.
            List<int[]> ranges = new ArrayList<int[]>();

            while ((line = sbfr.readLine()) != null) {
                if (line.length() <= 1 || line.charAt(0) == '#') {
                    continue;
                }
                m.reset(line);
                if (m.matches()) {
                    int start = Integer.parseInt(m.group(1), 16);
                    int end = (m.group(2)==null)?start
                              :Integer.parseInt(m.group(2), 16);
                    String name = m.group(3);
                    if (name.equals(prevN) && start == prevE + 1) {
                        // directly continues the pending range: extend it
                        prevE = end;
                    } else {
                        if (prevS != -1) {
                            addRange(ranges, scriptMap, prevS, prevE, prevN);
                        }
                        debug("%x-%x\t%s%n", prevS, prevE, prevN);
                        prevS = start; prevE = end; prevN = name;
                    }
                } else {
                    debug("Warning: Unrecognized line <%s>%n", line);
                }
            }

            if (prevS == -1) {
                // Nothing matched; fail with a message instead of running
                // the generator on an empty table.
                throw new IllegalStateException(
                    "No script ranges found in " + args[0]);
            }

            //last one.
            addRange(ranges, scriptMap, prevS, prevE, prevN);

            int[][] scripts = ranges.toArray(new int[ranges.size()][]);
            int scriptSize = scripts.length;

            debug("%x-%x\t%s%n", prevS, prevE, prevN);
            debug("-----------------%n");
            debug("Total scripts=%s%n", scriptMap.size());
            debug("-----------------%n%n");

            // script id -> script name as spelled in the data file
            String[] names = new String[scriptMap.size()];
            for (String name: scriptMap.keySet()) {
                names[scriptMap.get(name).intValue()] = name;
            }

            // one line per listed code point
            for (j = 0; j < scriptSize; j++) {
                String name = names[scripts[j][2]].toUpperCase(Locale.ENGLISH);
                for (int cp = scripts[j][0]; cp <= scripts[j][1]; cp++) {
                    System.out.printf("%05X %s%n", cp, name);
                }
            }

            // order the ranges by their first code point
            Arrays.sort(scripts, 0, scriptSize,
                        new Comparator<int[]>() {
                            public int compare(int[] a1, int[] a2) {
                                return (a1[0] < a2[0]) ? -1
                                     : ((a1[0] == a2[0]) ? 0 : 1);
                            }
                        });

            // Consolidation: there are lots of "reserved" code points
            // embedded in those otherwise "sequential" blocks.
            // To make the lookup table smaller, we combine those
            // separated segments with the assumption that the lookup
            // implementation checks
            //    Character.getType() !=  Character.UNASSIGNED
            // first (return UNKNOWN for unassigned)

            ArrayList<int[]> list = new ArrayList<int[]>();
            list.add(scripts[0]);

            int[] last = scripts[0];
            for (i = 1; i < scriptSize; i++) {
                if (scripts[i][0] != (last[1] + 1)) {

                    boolean isNotUnassigned = false;
                    for (int cp = last[1] + 1; cp < scripts[i][0]; cp++) {
                        if (Character.getType(cp) != Character.UNASSIGNED) {
                            isNotUnassigned = true;
                            debug("Warning: [%x] is ASSIGNED but in NON script%n", cp);
                            break;
                        }
                    }
                    if (isNotUnassigned) {
                        // surrogates only?
                        int[] a = new int[3];
                        a[0] = last[1] + 1;
                        a[1] = scripts[i][0] - 1;
                        a[2] = -1;  // unknown
                        list.add(a);
                    } else {
                        if (last[2] == scripts[i][2]) {
                            //combine
                            last[1] = scripts[i][1];
                            continue;
                        } else {
                            // expand last
                            last[1] = scripts[i][0] - 1;
                        }
                    }
                }
                list.add(scripts[i]);
                last = scripts[i];
            }

            for (i = 0; i < list.size(); i++) {
                int[] a = list.get(i);
                String name = "UNKNOWN";
                if (a[2] != -1)
                    name = names[a[2]].toUpperCase(Locale.US);
                debug("0x%05x, 0x%05x  %s%n", a[0], a[1], name);
            }
            debug("--->total=%d%n", list.size());

            //////////////////OUTPUT//////////////////////////////////
            print("public class Scripts {%n%n");
            print(" public static enum UnicodeScript {%n");
            for (i = 0; i < names.length; i++) {
                print(" /**%n * Unicode script \"%s\".%n */%n", names[i]);
                print(" %s,%n%n", names[i].toUpperCase(Locale.US));
            }
            print(" /**%n * Unicode script \"Unknown\".%n */%n UNKNOWN;%n%n");

            // lookup table
            print(" private static final int[] scriptStarts = {%n");
            for (int[] a : list) {
                String name = "UNKNOWN";
                if (a[2] != -1)
                    name = names[a[2]].toUpperCase(Locale.US);
                if (a[0] < 0x10000)
                    print(" 0x%04X, // %04X..%04X; %s%n",
                          a[0], a[0], a[1], name);
                else
                    print(" 0x%05X, // %05X..%05X; %s%n",
                          a[0], a[0], a[1], name);
            }
            // everything after the last listed range is UNKNOWN
            last = list.get(list.size() -1);
            if (last[1] != Character.MAX_CODE_POINT)
                print(" 0x%05X // %05X..%06X; %s%n",
                      last[1] + 1, last[1] + 1, Character.MAX_CODE_POINT,
                      "UNKNOWN");
            print("%n };%n%n");

            print(" private static final UnicodeScript[] scripts = {%n");
            for (int[] a : list) {
                String name = "UNKNOWN";
                if (a[2] != -1)
                    name = names[a[2]].toUpperCase(Locale.US);
                print(" %s,%n", name);
            }

            if (last[1] != Character.MAX_CODE_POINT)
                print(" UNKNOWN%n");
            print(" };%n");
            print(" }%n");
            print("}%n");

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (sbfr != null) {
                try {
                    sbfr.close();
                } catch (IOException ignored) {
                    // the input has been consumed or the failure above
                    // has already been reported
                }
            }
        }
    }
}
...@@ -35,6 +35,8 @@ import java.io.BufferedWriter; ...@@ -35,6 +35,8 @@ import java.io.BufferedWriter;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.File; import java.io.File;
import build.tools.generatecharacter.CharacterName;
/** /**
* This program generates the source code for the class java.lang.Character. * This program generates the source code for the class java.lang.Character.
* It also generates native C code that can perform the same operations. * It also generates native C code that can perform the same operations.
......
/*
* Copyright 2010 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Sun designates this
* particular file as subject to the "Classpath" exception as provided
* by Sun in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
package java.lang;
import java.io.DataInputStream;
import java.io.InputStream;
import java.lang.ref.SoftReference;
import java.util.Arrays;
import java.util.zip.InflaterInputStream;
import java.security.AccessController;
import java.security.PrivilegedAction;
/**
 * Lazily loaded table of character names, read from the compressed
 * {@code uniName.dat} resource that sits next to this class.
 *
 * <p>The resource holds two ints ({@code total}, {@code cpEnd}), then
 * {@code cpEnd} bytes of code point records, then {@code total - cpEnd}
 * bytes of concatenated names. The name pool is held through a
 * {@link SoftReference} and is reloaded when it has been cleared.
 */
class CharacterName {

    /** Concatenated name bytes; softly held so the memory can be reclaimed. */
    private static SoftReference<byte[]> refStrPool;

    /**
     * Two-level index: {@code lookup[cp >> 8][cp & 0xff]} holds
     * {@code (offset into the name pool << 8) | name length}. A missing
     * second-level array or an entry of 0 means "no name".
     */
    private static int[][] lookup;

    /**
     * Loads the resource, rebuilds {@link #lookup} and returns the name
     * pool. Returns the existing pool when it is still reachable.
     *
     * @throws InternalError if the resource cannot be read; the original
     *         exception is attached as the cause
     */
    private static synchronized byte[] initNamePool() {
        byte[] strPool = null;
        if (refStrPool != null && (strPool = refStrPool.get()) != null)
            return strPool;
        DataInputStream dis = null;
        try {
            dis = new DataInputStream(new InflaterInputStream(
                AccessController.doPrivileged(new PrivilegedAction<InputStream>()
                {
                    public InputStream run() {
                        return getClass().getResourceAsStream("uniName.dat");
                    }
                })));

            lookup = new int[(Character.MAX_CODE_POINT + 1) >> 8][];
            int total = dis.readInt();
            int cpEnd = dis.readInt();

            byte ba[] = new byte[cpEnd];
            dis.readFully(ba);

            int nameOff = 0;
            int cpOff = 0;
            int cp = 0;
            do {
                int len = ba[cpOff++] & 0xff;
                if (len == 0) {
                    // segment start: the real length and an explicit
                    // 24-bit code point follow
                    len = ba[cpOff++] & 0xff;
                    // always big-endian
                    cp = ((ba[cpOff++] & 0xff) << 16) |
                         ((ba[cpOff++] & 0xff) <<  8) |
                         ((ba[cpOff++] & 0xff));
                } else {
                    // continues the run: next consecutive code point
                    cp++;
                }
                int hi = cp >> 8;
                if (lookup[hi] == null) {
                    lookup[hi] = new int[0x100];
                }
                lookup[hi][cp&0xff] = (nameOff << 8) | len;
                nameOff += len;
            } while (cpOff < cpEnd);
            strPool = new byte[total - cpEnd];
            dis.readFully(strPool);
            refStrPool = new SoftReference<byte[]>(strPool);
        } catch (Exception x) {
            // Attach the cause so the real reason (missing or corrupt
            // resource, ...) is not reduced to a possibly null message.
            InternalError failure = new InternalError(x.getMessage());
            failure.initCause(x);
            throw failure;
        } finally {
            try {
                if (dis != null)
                    dis.close();
            } catch (Exception ignored) {
                // Nothing useful to do: either the table is fully loaded
                // or the load failure is already being thrown.
            }
        }
        return strPool;
    }

    /**
     * Returns the name recorded for a code point.
     *
     * @param cp the code point to look up
     * @return the name, or {@code null} if the table has no entry for
     *         {@code cp}
     */
    public static String get(int cp) {
        byte[] strPool = null;
        if (refStrPool == null || (strPool = refStrPool.get()) == null)
            strPool = initNamePool();
        int off = 0;
        if (lookup[cp>>8] == null ||
            (off = lookup[cp>>8][cp&0xff]) == 0)
            return null;
        // (bytes, hibyte, offset, count): each byte becomes one char
        return new String(strPool, 0, off >>> 8, off & 0xff);  // ASCII
    }
}
...@@ -29,6 +29,7 @@ import java.security.AccessController; ...@@ -29,6 +29,7 @@ import java.security.AccessController;
import java.security.PrivilegedAction; import java.security.PrivilegedAction;
import java.text.CharacterIterator; import java.text.CharacterIterator;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
...@@ -200,8 +201,9 @@ import java.util.Arrays; ...@@ -200,8 +201,9 @@ import java.util.Arrays;
* <td>Equivalent to java.lang.Character.isMirrored()</td></tr> * <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
* *
* <tr><th>&nbsp;</th></tr> * <tr><th>&nbsp;</th></tr>
* <tr align="left"><th colspan="2" id="unicode">Classes for Unicode blocks and categories</th></tr> * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks and categories</th></tr>
* * * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td>
* <td headers="matches">A Latin&nbsp;script character (simple <a href="#ubc">script</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td> * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
* <td headers="matches">A character in the Greek&nbsp;block (simple <a href="#ubc">block</a>)</td></tr> * <td headers="matches">A character in the Greek&nbsp;block (simple <a href="#ubc">block</a>)</td></tr>
* <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td> * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
...@@ -527,25 +529,40 @@ import java.util.Arrays; ...@@ -527,25 +529,40 @@ import java.util.Arrays;
* while not equal, compile into the same pattern, which matches the character * while not equal, compile into the same pattern, which matches the character
* with hexadecimal value <tt>0x2014</tt>. * with hexadecimal value <tt>0x2014</tt>.
* *
* <a name="ubc"> <p>Unicode blocks and categories are written with the * <a name="ubc">
* <tt>\p</tt> and <tt>\P</tt> constructs as in * <p>Unicode scripts, blocks and categories are written with the <tt>\p</tt> and
* Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the * <tt>\P</tt> constructs as in Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
* property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt> does not match if * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
* the input has that property. Blocks are specified with the prefix * does not match if the input has that property.
* <tt>In</tt>, as in <tt>InMongolian</tt>. Categories may be specified with * <p>
* the optional prefix <tt>Is</tt>: Both <tt>\p{L}</tt> and <tt>\p{IsL}</tt> * Scripts are specified either with the prefix {@code Is}, as in
* denote the category of Unicode letters. Blocks and categories can be used * {@code IsHiragana}, or by using the {@code script} keyword (or its short
* both inside and outside of a character class. * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
* * <p>
* Blocks are specified with the prefix {@code In}, as in
* {@code InMongolian}, or by using the keyword {@code block} (or its short
* form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
* <p>
* Categories may be specified with the optional prefix {@code Is}:
* Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
* letters. Same as scripts and blocks, categories can also be specified
* by using the keyword {@code general_category} (or its short form
* {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.
* <p>
* Scripts, blocks and categories can be used both inside and outside of a
* character class.
* <p> The supported categories are those of * <p> The supported categories are those of
* <a href="http://www.unicode.org/unicode/standard/standard.html"> * <a href="http://www.unicode.org/unicode/standard/standard.html">
* <i>The Unicode Standard</i></a> in the version specified by the * <i>The Unicode Standard</i></a> in the version specified by the
* {@link java.lang.Character Character} class. The category names are those * {@link java.lang.Character Character} class. The category names are those
* defined in the Standard, both normative and informative. * defined in the Standard, both normative and informative.
* The script names supported by <code>Pattern</code> are the valid script names
* accepted and defined by
* {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
* The block names supported by <code>Pattern</code> are the valid block names * The block names supported by <code>Pattern</code> are the valid block names
* accepted and defined by * accepted and defined by
* {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
* * <p>
* <a name="jcc"> <p>Categories that behave like the java.lang.Character * <a name="jcc"> <p>Categories that behave like the java.lang.Character
* boolean is<i>methodname</i> methods (except for the deprecated ones) are * boolean is<i>methodname</i> methods (except for the deprecated ones) are
* available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
...@@ -2488,12 +2505,34 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -2488,12 +2505,34 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
name = new String(temp, i, j-i-1); name = new String(temp, i, j-i-1);
} }
if (name.startsWith("In")) { int i = name.indexOf('=');
node = unicodeBlockPropertyFor(name.substring(2)); if (i != -1) {
// property construct \p{name=value}
String value = name.substring(i + 1);
name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
if ("sc".equals(name) || "script".equals(name)) {
node = unicodeScriptPropertyFor(value);
} else if ("blk".equals(name) || "block".equals(name)) {
node = unicodeBlockPropertyFor(value);
} else if ("gc".equals(name) || "general_category".equals(name)) {
node = charPropertyNodeFor(value);
} else {
throw error("Unknown Unicode property {name=<" + name + ">, "
+ "value=<" + value + ">}");
}
} else { } else {
if (name.startsWith("Is")) if (name.startsWith("In")) {
// \p{inBlockName}
node = unicodeBlockPropertyFor(name.substring(2));
} else if (name.startsWith("Is")) {
// \p{isGeneralCategory} and \p{isScriptName}
name = name.substring(2); name = name.substring(2);
node = charPropertyNodeFor(name); node = CharPropertyNames.charPropertyFor(name);
if (node == null)
node = unicodeScriptPropertyFor(name);
} else {
node = charPropertyNodeFor(name);
}
} }
if (maybeComplement) { if (maybeComplement) {
if (node instanceof Category || node instanceof Block) if (node instanceof Category || node instanceof Block)
...@@ -2503,6 +2542,21 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -2503,6 +2542,21 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
return node; return node;
} }
/**
 * Returns a CharProperty that matches every character belonging to the
 * named UnicodeScript. A name that UnicodeScript.forName rejects is
 * reported as a pattern error.
 */
private CharProperty unicodeScriptPropertyFor(String name) {
    try {
        return new Script(Character.UnicodeScript.forName(name));
    } catch (IllegalArgumentException iae) {
        throw error("Unknown character script name {" + name + "}");
    }
}
/** /**
* Returns a CharProperty matching all characters in a UnicodeBlock. * Returns a CharProperty matching all characters in a UnicodeBlock.
*/ */
...@@ -3566,6 +3620,19 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -3566,6 +3620,19 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} }
} }
/**
 * Node class matching any character whose Unicode script, as reported by
 * Character.UnicodeScript.of, is the one given at construction.
 */
static final class Script extends CharProperty {
    /** The script a character must belong to in order to match. */
    final Character.UnicodeScript script;

    Script(Character.UnicodeScript wanted) {
        script = wanted;
    }

    boolean isSatisfiedBy(int ch) {
        return Character.UnicodeScript.of(ch) == script;
    }
}
/** /**
* Node class that matches a Unicode category. * Node class that matches a Unicode category.
*/ */
......
/**
* @test
* @bug 6945564
* @summary Check j.l.Character.UnicodeScript against the ranges listed in the Unicode script data file
* @ignore don't run until #6903266 is integrated
*/
import java.io.*;
import java.lang.reflect.*;
import java.util.*;
import java.util.regex.*;
import java.lang.Character.UnicodeScript;
/**
 * Verifies Character.UnicodeScript against a script data file.
 *
 * <p>The single argument names the data file. Every line of the form
 * {@code start[..end] ; ScriptName # comment} declares a code point range
 * of that script. The check runs in two directions: every listed code point
 * must map to its script, and every code point that maps to a script other
 * than UNKNOWN must lie in one of that script's listed ranges.
 */
public class CheckScript {

    /**
     * Runs the check.
     *
     * @param args exactly one element: the path of the script data file
     * @throws RuntimeException on the first mismatch found
     * @throws Exception if the data file cannot be read
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.out.println("java CheckScript script.txt");
            System.exit(1);
        }

        Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*").matcher("");
        // lower-cased script name -> flat list of start, end, start, end, ...
        HashMap<String,ArrayList<Integer>> scripts = new HashMap<>();
        try (BufferedReader sbfr = new BufferedReader(new FileReader(args[0]))) {
            String line = null;
            while ((line = sbfr.readLine()) != null) {
                if (line.length() <= 1 || line.charAt(0) == '#') {
                    continue;
                }
                m.reset(line);
                if (m.matches()) {
                    int start = Integer.parseInt(m.group(1), 16);
                    int end = (m.group(2)==null)?start
                              :Integer.parseInt(m.group(2), 16);
                    String name = m.group(3).toLowerCase(Locale.ENGLISH);
                    ArrayList<Integer> ranges = scripts.get(name);
                    if (ranges == null) {
                        ranges = new ArrayList<Integer>();
                        scripts.put(name, ranges);
                    }
                    ranges.add(start);
                    ranges.add(end);
                }
            }
        }

        // Unbox every range list once instead of once per code point.
        HashMap<String,int[]> rangeTable = new HashMap<>();
        for (Map.Entry<String,ArrayList<Integer>> entry : scripts.entrySet()) {
            ArrayList<Integer> boxed = entry.getValue();
            int[] flat = new int[boxed.size()];
            for (int k = 0; k < flat.length; k++) {
                flat[k] = boxed.get(k);
            }
            rangeTable.put(entry.getKey(), flat);
        }

        // check all defined ranges
        for (String name : scripts.keySet()) {
            System.out.println("Checking " + name + "...");
            int[] ranges = rangeTable.get(name);
            Character.UnicodeScript expected =
                Character.UnicodeScript.forName(name);

            for (int off = 0; off < ranges.length; off += 2) {
                int start = ranges[off];
                int end = ranges[off + 1];
                for (int cp = start; cp <= end; cp++) {
                    Character.UnicodeScript script =
                        Character.UnicodeScript.of(cp);
                    if (script != expected) {
                        throw new RuntimeException(
                            "UnicodeScript failed: cp=" +
                            Integer.toHexString(cp) +
                            ", of(cp)=<" + script + "> but <" +
                            expected + "> is expected");
                    }
                }
            }
        }

        // check all codepoints, U+10FFFF included
        for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) {
            Character.UnicodeScript script = Character.UnicodeScript.of(cp);
            if (script == Character.UnicodeScript.UNKNOWN) {
                // UNKNOWN is only acceptable for code points that are
                // unassigned, surrogates or private use.
                int type = Character.getType(cp);
                if (type != Character.UNASSIGNED &&
                    type != Character.SURROGATE &&
                    type != Character.PRIVATE_USE)
                    throw new RuntimeException(
                        "UnicodeScript failed: cp=" +
                        Integer.toHexString(cp) +
                        ", of(cp)=<" + script +
                        "> but the code point is assigned (type=" + type +
                        "), so a known script is expected");
            } else {
                // A script that never appears in the data file has no
                // ranges at all, which is a mismatch like any other.
                int[] ranges =
                    rangeTable.get(script.name().toLowerCase(Locale.ENGLISH));
                boolean found = false;
                if (ranges != null) {
                    for (int off = 0; off < ranges.length; off += 2) {
                        if (cp >= ranges[off] && cp <= ranges[off + 1]) {
                            found = true;
                            break;
                        }
                    }
                }
                if (!found) {
                    throw new RuntimeException(
                        "UnicodeScript failed: cp=" +
                        Integer.toHexString(cp) +
                        ", of(cp)=<" + script +
                        "> but NOT in ranges of this script");
                }
            }
        }
    }
}
此差异已折叠。
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476 * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940 * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
* 6350801 6676425 6878475 6919132 6931676 * 6350801 6676425 6878475 6919132 6931676 6948903
*/ */
import java.util.regex.*; import java.util.regex.*;
...@@ -135,7 +135,7 @@ public class RegExTest { ...@@ -135,7 +135,7 @@ public class RegExTest {
surrogatesInClassTest(); surrogatesInClassTest();
namedGroupCaptureTest(); namedGroupCaptureTest();
nonBmpClassComplementTest(); nonBmpClassComplementTest();
unicodePropertiesTest();
if (failure) if (failure)
throw new RuntimeException("Failure in the RE handling."); throw new RuntimeException("Failure in the RE handling.");
else else
...@@ -3515,7 +3515,7 @@ public class RegExTest { ...@@ -3515,7 +3515,7 @@ public class RegExTest {
report("NamedGroupCapture"); report("NamedGroupCapture");
} }
// This is for bug 6919132 // This is for bug 6969132
private static void nonBmpClassComplementTest() throws Exception { private static void nonBmpClassComplementTest() throws Exception {
Pattern p = Pattern.compile("\\P{Lu}"); Pattern p = Pattern.compile("\\P{Lu}");
Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1)); Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
...@@ -3539,4 +3539,79 @@ public class RegExTest { ...@@ -3539,4 +3539,79 @@ public class RegExTest {
report("NonBmpClassComplement"); report("NonBmpClassComplement");
} }
/**
 * Exercises the \p{...} forms for general categories, scripts and blocks,
 * then walks the code points and checks that each one matches the property
 * of its own script and block and does not match an unrelated one.
 */
private static void unicodePropertiesTest() throws Exception {
    // different forms: { pattern, input }; the group counts as one failure
    String[][] forms = {
        { "\\p{IsLu}", "A" },
        { "\\p{Lu}", "A" },
        { "\\p{gc=Lu}", "A" },
        { "\\p{general_category=Lu}", "A" },
        { "\\p{IsLatin}", "B" },
        { "\\p{sc=Latin}", "B" },
        { "\\p{script=Latin}", "B" },
        { "\\p{InBasicLatin}", "c" },
        { "\\p{blk=BasicLatin}", "c" },
        { "\\p{block=BasicLatin}", "c" },
    };
    for (String[] form : forms) {
        if (!Pattern.compile(form[0]).matcher(form[1]).matches()) {
            failCount++;
            break;
        }
    }

    Matcher common = Pattern.compile("\\p{script=Common}").matcher("");
    Matcher unknown = Pattern.compile("\\p{IsUnknown}").matcher("");
    Matcher prevScriptMatcher = common;
    Character.UnicodeScript prevScript = Character.UnicodeScript.of(0);

    Matcher latin = Pattern.compile("\\p{block=basic_latin}").matcher("");
    Matcher greek = Pattern.compile("\\p{InGreek}").matcher("");
    Matcher prevBlockMatcher = latin;
    Character.UnicodeBlock prevBlock = Character.UnicodeBlock.of(0);

    for (int cp = 1; cp < Character.MAX_CODE_POINT; cp++) {
        if (cp >= 0x30000 && (cp & 0x70) == 0){
            continue;  // only pick couple code points, they are the same
        }

        String str = String.valueOf(Character.toChars(cp));

        // Unicode Script: reuse the previous matcher while the script stays
        // the same, otherwise compile one for the new script
        Character.UnicodeScript script = Character.UnicodeScript.of(cp);
        Matcher own;
        if (script != prevScript) {
            own = Pattern.compile("\\p{Is" + script.name() + "}").matcher(str);
        } else {
            own = prevScriptMatcher;
            own.reset(str);
        }
        if (!own.matches()) {
            failCount++;
        }
        // a script the code point does not belong to must not match
        Matcher foreign;
        if (script == Character.UnicodeScript.COMMON) {
            foreign = unknown;
        } else {
            foreign = common;
        }
        foreign.reset(str);
        if (foreign.matches()) {
            failCount++;
        }
        prevScriptMatcher = own;
        prevScript = script;

        // Unicode Block: skipped for code points that are in no block
        Character.UnicodeBlock block = Character.UnicodeBlock.of(cp);
        if (block != null) {
            if (block != prevBlock) {
                own = Pattern.compile("\\p{block=" + block.toString() + "}").matcher(str);
            } else {
                own = prevBlockMatcher;
                own.reset(str);
            }
            if (!own.matches()) {
                failCount++;
            }
            if (block == Character.UnicodeBlock.BASIC_LATIN) {
                foreign = greek;
            } else {
                foreign = latin;
            }
            foreign.reset(str);
            if (foreign.matches()) {
                failCount++;
            }
            prevBlockMatcher = own;
            prevBlock = block;
        }
    }
    report("unicodeProperties");
}
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册