6919132: Regex \P{Lu} selects half of a surrogate pair

Summary: To use StartS for complement category/block class Reviewed-by: martin, okutsu

6919132: Regex \P{Lu} selects half of a surrogate pair
Summary: To use StartS for complement category/block class Reviewed-by: martin, okutsu
9f7d8b76 · sherman · 82a84d63 · 9f7d8b76 · 9f7d8b76
隐藏空白更改
内联并排

Showing with 88 additions and 45 deletions

src/share/classes/java/util/regex/Pattern.java src/share/classes/java/util/regex/Pattern.java +61 -43

test/java/util/regex/RegExTest.java test/java/util/regex/RegExTest.java +27 -2

未找到文件。
--- a/src/share/classes/java/util/regex/Pattern.java
+++ b/src/share/classes/java/util/regex/Pattern.java
@@ -860,6 +860,14 @@ public final class Pattern
     */
    private transient int patternLength;

+    /**
+     * If the Start node might possibly match supplementary characters.
+     * It is set to true during compiling if
+     * (1) There is supplementary char in pattern, or
+     * (2) There is complement node of Category or Block
+     */
+    private transient boolean hasSupplementary;
+
    /**
     * Compiles the given regular expression into a pattern.  </p>
     *
@@ -1481,7 +1489,7 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
        // Use double zero to terminate pattern
        temp = new int[patternLength + 2];

-        boolean hasSupplementary = false;
+        hasSupplementary = false;
        int c, count = 0;
        // Convert all chars into code points
        for (int x = 0; x < patternLength; x += Character.charCount(c)) {
@@ -1787,7 +1795,8 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
     * character or unpaired surrogate.
     */
    private static final boolean isSupplementary(int ch) {
-        return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || isSurrogate(ch);
+        return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
+               Character.isSurrogate((char)ch);
    }

    /**
@@ -1885,7 +1894,7 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
                    } else {
                        oneLetter = false;
                    }
-                    node = family(oneLetter).maybeComplement(comp);
+                    node = family(oneLetter, comp);
                } else {
                    unread();
                    node = atom();
@@ -2001,7 +2010,7 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
                            unread();
                        else
                            oneLetter = false;
-                        return family(oneLetter).maybeComplement(comp);
+                        return family(oneLetter, comp);
                    }
                }
                unread();
@@ -2404,7 +2413,7 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
                    unread();
                else
                    oneLetter = false;
-                return family(oneLetter).maybeComplement(comp);
+                return family(oneLetter, comp);
            } else { // ordinary escape
                unread();
                ch = escape(true, true);
@@ -2450,9 +2459,12 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
    /**
     * Parses a Unicode character family and returns its representative node.
     */
-    private CharProperty family(boolean singleLetter) {
+    private CharProperty family(boolean singleLetter,
+                                boolean maybeComplement)
+    {
        next();
        String name;
+        CharProperty node;

        if (singleLetter) {
            int c = temp[cursor];
@@ -2477,12 +2489,18 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
        }

        if (name.startsWith("In")) {
-            return unicodeBlockPropertyFor(name.substring(2));
+            node = unicodeBlockPropertyFor(name.substring(2));
        } else {
            if (name.startsWith("Is"))
                name = name.substring(2);
-            return charPropertyNodeFor(name);
+            node = charPropertyNodeFor(name);
+        }
+        if (maybeComplement) {
+            if (node instanceof Category || node instanceof Block)
+                hasSupplementary = true;
+            node = node.complement();
        }
+        return node;
    }

    /**
@@ -2495,9 +2513,7 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
        } catch (IllegalArgumentException iae) {
            throw error("Unknown character block name {" + name + "}");
        }
-        return new CharProperty() {
-                boolean isSatisfiedBy(int ch) {
-                    return block == Character.UnicodeBlock.of(ch);}};
+        return new Block(block);
    }

    /**
@@ -2968,13 +2984,6 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
    // Utility methods for code point support
    //

-    /**
-     * Tests a surrogate value.
-     */
-    private static final boolean isSurrogate(int c) {
-        return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE;
-    }
-
    private static final int countChars(CharSequence seq, int index,
                                        int lengthInCodePoints) {
        // optimization
@@ -3174,20 +3183,17 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
                matcher.hitEnd = true;
                return false;
            }
-            boolean ret = false;
            int guard = matcher.to - minLength;
            for (; i <= guard; i++) {
-                if (ret = next.match(matcher, i, seq))
-                    break;
-                if (i == guard)
-                    matcher.hitEnd = true;
-            }
-            if (ret) {
-                matcher.first = i;
-                matcher.groups[0] = matcher.first;
-                matcher.groups[1] = matcher.last;
+                if (next.match(matcher, i, seq)) {
+                    matcher.first = i;
+                    matcher.groups[0] = matcher.first;
+                    matcher.groups[1] = matcher.last;
+                    return true;
+                }
            }
-            return ret;
+            matcher.hitEnd = true;
+            return false;
        }
        boolean study(TreeInfo info) {
            next.study(info);
@@ -3209,27 +3215,28 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
                matcher.hitEnd = true;
                return false;
            }
-            boolean ret = false;
            int guard = matcher.to - minLength;
            while (i <= guard) {
-                if ((ret = next.match(matcher, i, seq)) || i == guard)
+                //if ((ret = next.match(matcher, i, seq)) || i == guard)
+                if (next.match(matcher, i, seq)) {
+                    matcher.first = i;
+                    matcher.groups[0] = matcher.first;
+                    matcher.groups[1] = matcher.last;
+                    return true;
+                }
+                if (i == guard)
                    break;
                // Optimization to move to the next character. This is
                // faster than countChars(seq, i, 1).
                if (Character.isHighSurrogate(seq.charAt(i++))) {
-                    if (i < seq.length() && Character.isLowSurrogate(seq.charAt(i))) {
+                    if (i < seq.length() &&
+                        Character.isLowSurrogate(seq.charAt(i))) {
                        i++;
                    }
                }
-                if (i == guard)
-                    matcher.hitEnd = true;
-            }
-            if (ret) {
-                matcher.first = i;
-                matcher.groups[0] = matcher.first;
-                matcher.groups[1] = matcher.last;
            }
-            return ret;
+            matcher.hitEnd = true;
+            return false;
        }
    }

@@ -3461,9 +3468,6 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
                    boolean isSatisfiedBy(int ch) {
                        return ! CharProperty.this.isSatisfiedBy(ch);}};
        }
-        CharProperty maybeComplement(boolean complement) {
-            return complement ? complement() : this;
-        }
        boolean match(Matcher matcher, int i, CharSequence seq) {
            if (i < matcher.to) {
                int ch = Character.codePointAt(seq, i);
@@ -3548,6 +3552,20 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
        }
    }

+
+    /**
+     * Node class that matches a Unicode block.
+     */
+    static final class Block extends CharProperty {
+        final Character.UnicodeBlock block;
+        Block(Character.UnicodeBlock block) {
+            this.block = block;
+        }
+        boolean isSatisfiedBy(int ch) {
+            return block == Character.UnicodeBlock.of(ch);
+        }
+    }
+
    /**
     * Node class that matches a Unicode category.
     */

--- a/test/java/util/regex/RegExTest.java
+++ b/test/java/util/regex/RegExTest.java
@@ -32,7 +32,7 @@
 * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
 * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
 * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
- * 6350801 6676425 6878475
+ * 6350801 6676425 6878475 6919132
 */

 import java.util.regex.*;
@@ -134,6 +134,7 @@ public class RegExTest {
        toMatchResultTest();
        surrogatesInClassTest();
        namedGroupCaptureTest();
+        nonBmpClassComplementTest();

        if (failure)
            throw new RuntimeException("Failure in the RE handling.");
@@ -365,7 +366,6 @@ public class RegExTest {
        m.find();
        if (!m.hitEnd())
            failCount++;
-
        report("hitEnd from a Slice");
    }

@@ -3514,4 +3514,29 @@ public class RegExTest {
                          null);
        report("NamedGroupCapture");
    }
+
+    // This is for bug 6919132
+    private static void nonBmpClassComplementTest() throws Exception {
+        Pattern p = Pattern.compile("\\P{Lu}");
+        Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
+        if (m.find() && m.start() == 1)
+            failCount++;
+
+        // from a unicode category
+        p = Pattern.compile("\\P{Lu}");
+        m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
+        if (m.find())
+            failCount++;
+        if (!m.hitEnd())
+            failCount++;
+
+        // block
+        p = Pattern.compile("\\P{InMathematicalAlphanumericSymbols}");
+        m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
+        if (m.find() && m.start() == 1)
+            failCount++;
+
+        report("NonBmpClassComplement");
+    }
+
 }