提交 9f7d8b76 编写于 作者: S sherman

6919132: Regex \P{Lu} selects half of a surrogate pair

Summary: Use StartS for complemented category/block classes
Reviewed-by: martin, okutsu
上级 82a84d63
...@@ -860,6 +860,14 @@ public final class Pattern ...@@ -860,6 +860,14 @@ public final class Pattern
*/ */
private transient int patternLength; private transient int patternLength;
/**
* Whether the Start node might possibly match supplementary characters.
* It is set to true during compilation if
* (1) there is a supplementary character in the pattern, or
* (2) there is a complement node of a Category or Block.
*/
private transient boolean hasSupplementary;
/** /**
* Compiles the given regular expression into a pattern. </p> * Compiles the given regular expression into a pattern. </p>
* *
...@@ -1481,7 +1489,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -1481,7 +1489,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
// Use double zero to terminate pattern // Use double zero to terminate pattern
temp = new int[patternLength + 2]; temp = new int[patternLength + 2];
boolean hasSupplementary = false; hasSupplementary = false;
int c, count = 0; int c, count = 0;
// Convert all chars into code points // Convert all chars into code points
for (int x = 0; x < patternLength; x += Character.charCount(c)) { for (int x = 0; x < patternLength; x += Character.charCount(c)) {
...@@ -1787,7 +1795,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -1787,7 +1795,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
* character or unpaired surrogate. * character or unpaired surrogate.
*/ */
private static final boolean isSupplementary(int ch) { private static final boolean isSupplementary(int ch) {
return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || isSurrogate(ch); return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
Character.isSurrogate((char)ch);
} }
/** /**
...@@ -1885,7 +1894,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -1885,7 +1894,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} else { } else {
oneLetter = false; oneLetter = false;
} }
node = family(oneLetter).maybeComplement(comp); node = family(oneLetter, comp);
} else { } else {
unread(); unread();
node = atom(); node = atom();
...@@ -2001,7 +2010,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -2001,7 +2010,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
unread(); unread();
else else
oneLetter = false; oneLetter = false;
return family(oneLetter).maybeComplement(comp); return family(oneLetter, comp);
} }
} }
unread(); unread();
...@@ -2404,7 +2413,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -2404,7 +2413,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
unread(); unread();
else else
oneLetter = false; oneLetter = false;
return family(oneLetter).maybeComplement(comp); return family(oneLetter, comp);
} else { // ordinary escape } else { // ordinary escape
unread(); unread();
ch = escape(true, true); ch = escape(true, true);
...@@ -2450,9 +2459,12 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -2450,9 +2459,12 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
/** /**
* Parses a Unicode character family and returns its representative node. * Parses a Unicode character family and returns its representative node.
*/ */
private CharProperty family(boolean singleLetter) { private CharProperty family(boolean singleLetter,
boolean maybeComplement)
{
next(); next();
String name; String name;
CharProperty node;
if (singleLetter) { if (singleLetter) {
int c = temp[cursor]; int c = temp[cursor];
...@@ -2477,12 +2489,18 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -2477,12 +2489,18 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} }
if (name.startsWith("In")) { if (name.startsWith("In")) {
return unicodeBlockPropertyFor(name.substring(2)); node = unicodeBlockPropertyFor(name.substring(2));
} else { } else {
if (name.startsWith("Is")) if (name.startsWith("Is"))
name = name.substring(2); name = name.substring(2);
return charPropertyNodeFor(name); node = charPropertyNodeFor(name);
}
if (maybeComplement) {
if (node instanceof Category || node instanceof Block)
hasSupplementary = true;
node = node.complement();
} }
return node;
} }
/** /**
...@@ -2495,9 +2513,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -2495,9 +2513,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} catch (IllegalArgumentException iae) { } catch (IllegalArgumentException iae) {
throw error("Unknown character block name {" + name + "}"); throw error("Unknown character block name {" + name + "}");
} }
return new CharProperty() { return new Block(block);
boolean isSatisfiedBy(int ch) {
return block == Character.UnicodeBlock.of(ch);}};
} }
/** /**
...@@ -2968,13 +2984,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -2968,13 +2984,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
// Utility methods for code point support // Utility methods for code point support
// //
/**
* Tests a surrogate value.
*/
private static final boolean isSurrogate(int c) {
return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE;
}
private static final int countChars(CharSequence seq, int index, private static final int countChars(CharSequence seq, int index,
int lengthInCodePoints) { int lengthInCodePoints) {
// optimization // optimization
...@@ -3174,20 +3183,17 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -3174,20 +3183,17 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
matcher.hitEnd = true; matcher.hitEnd = true;
return false; return false;
} }
boolean ret = false;
int guard = matcher.to - minLength; int guard = matcher.to - minLength;
for (; i <= guard; i++) { for (; i <= guard; i++) {
if (ret = next.match(matcher, i, seq)) if (next.match(matcher, i, seq)) {
break; matcher.first = i;
if (i == guard) matcher.groups[0] = matcher.first;
matcher.hitEnd = true; matcher.groups[1] = matcher.last;
} return true;
if (ret) { }
matcher.first = i;
matcher.groups[0] = matcher.first;
matcher.groups[1] = matcher.last;
} }
return ret; matcher.hitEnd = true;
return false;
} }
boolean study(TreeInfo info) { boolean study(TreeInfo info) {
next.study(info); next.study(info);
...@@ -3209,27 +3215,28 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -3209,27 +3215,28 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
matcher.hitEnd = true; matcher.hitEnd = true;
return false; return false;
} }
boolean ret = false;
int guard = matcher.to - minLength; int guard = matcher.to - minLength;
while (i <= guard) { while (i <= guard) {
if ((ret = next.match(matcher, i, seq)) || i == guard) //if ((ret = next.match(matcher, i, seq)) || i == guard)
if (next.match(matcher, i, seq)) {
matcher.first = i;
matcher.groups[0] = matcher.first;
matcher.groups[1] = matcher.last;
return true;
}
if (i == guard)
break; break;
// Optimization to move to the next character. This is // Optimization to move to the next character. This is
// faster than countChars(seq, i, 1). // faster than countChars(seq, i, 1).
if (Character.isHighSurrogate(seq.charAt(i++))) { if (Character.isHighSurrogate(seq.charAt(i++))) {
if (i < seq.length() && Character.isLowSurrogate(seq.charAt(i))) { if (i < seq.length() &&
Character.isLowSurrogate(seq.charAt(i))) {
i++; i++;
} }
} }
if (i == guard)
matcher.hitEnd = true;
}
if (ret) {
matcher.first = i;
matcher.groups[0] = matcher.first;
matcher.groups[1] = matcher.last;
} }
return ret; matcher.hitEnd = true;
return false;
} }
} }
...@@ -3461,9 +3468,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -3461,9 +3468,6 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
boolean isSatisfiedBy(int ch) { boolean isSatisfiedBy(int ch) {
return ! CharProperty.this.isSatisfiedBy(ch);}}; return ! CharProperty.this.isSatisfiedBy(ch);}};
} }
CharProperty maybeComplement(boolean complement) {
return complement ? complement() : this;
}
boolean match(Matcher matcher, int i, CharSequence seq) { boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) { if (i < matcher.to) {
int ch = Character.codePointAt(seq, i); int ch = Character.codePointAt(seq, i);
...@@ -3548,6 +3552,20 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { ...@@ -3548,6 +3552,20 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
} }
} }
/**
 * Character-property node that is satisfied by the code points belonging
 * to one particular Unicode block.
 */
static final class Block extends CharProperty {
    /** The Unicode block a code point must belong to in order to satisfy this node. */
    final Character.UnicodeBlock block;

    /**
     * Creates a node bound to the given Unicode block.
     *
     * @param block the block that candidate code points are compared against
     */
    Block(Character.UnicodeBlock block) {
        this.block = block;
    }

    /**
     * Looks up the Unicode block of {@code ch} and compares it, by identity,
     * with the block this node was created for.
     *
     * @param ch the code point to test
     * @return {@code true} if the looked-up block is this node's block
     */
    boolean isSatisfiedBy(int ch) {
        Character.UnicodeBlock actual = Character.UnicodeBlock.of(ch);
        return actual == block;
    }
}
/** /**
* Node class that matches a Unicode category. * Node class that matches a Unicode category.
*/ */
......
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
* 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476 * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940 * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
* 6350801 6676425 6878475 * 6350801 6676425 6878475 6919132
*/ */
import java.util.regex.*; import java.util.regex.*;
...@@ -134,6 +134,7 @@ public class RegExTest { ...@@ -134,6 +134,7 @@ public class RegExTest {
toMatchResultTest(); toMatchResultTest();
surrogatesInClassTest(); surrogatesInClassTest();
namedGroupCaptureTest(); namedGroupCaptureTest();
nonBmpClassComplementTest();
if (failure) if (failure)
throw new RuntimeException("Failure in the RE handling."); throw new RuntimeException("Failure in the RE handling.");
...@@ -365,7 +366,6 @@ public class RegExTest { ...@@ -365,7 +366,6 @@ public class RegExTest {
m.find(); m.find();
if (!m.hitEnd()) if (!m.hitEnd())
failCount++; failCount++;
report("hitEnd from a Slice"); report("hitEnd from a Slice");
} }
...@@ -3514,4 +3514,29 @@ public class RegExTest { ...@@ -3514,4 +3514,29 @@ public class RegExTest {
null); null);
report("NamedGroupCapture"); report("NamedGroupCapture");
} }
// Regression test for bug 6919132: a complemented category or block class
// must not report a match that starts at the second char of a surrogate pair.
private static void nonBmpClassComplementTest() throws Exception {
    // A single supplementary code point; as a String it occupies two chars.
    final String input = new String(new int[] {0x1d400}, 0, 1);

    // Complemented category: a match starting at index 1 would begin
    // inside the surrogate pair.
    Matcher matcher = Pattern.compile("\\P{Lu}").matcher(input);
    if (matcher.find()) {
        if (matcher.start() == 1) {
            failCount++;
        }
    }

    // Complemented category on a fresh matcher: no match is expected at
    // all, and the unsuccessful search must report hitEnd.
    matcher = Pattern.compile("\\P{Lu}").matcher(input);
    if (matcher.find()) {
        failCount++;
    }
    if (!matcher.hitEnd()) {
        failCount++;
    }

    // Complemented block: same check as the first scenario.
    matcher = Pattern.compile("\\P{InMathematicalAlphanumericSymbols}").matcher(input);
    if (matcher.find()) {
        if (matcher.start() == 1) {
            failCount++;
        }
    }

    report("NonBmpClassComplement");
}
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册