提交 317942f2 编写于 作者: S sherman

8027645: Pattern.split() with positive lookahead

6559590: Pattern.compile(".*").split("") returns incorrect result
Summary: updated spec/impl for these two corner cases
Reviewed-by: alanb, psandoz
上级 b2c2d597
......@@ -2235,7 +2235,13 @@ public final class String
* expression or is terminated by the end of the string. The substrings in
* the array are in the order in which they occur in this string. If the
* expression does not match any part of the input then the resulting array
* has just one element, namely this string.
* has just one element, namely this string. A zero-length input sequence
* always results in a zero-length array.
*
* <p> When there is a positive-width match at the beginning of this
* string then an empty leading substring is included at the beginning
* of the resulting array. A zero-width match at the beginning, however,
* never produces such an empty leading substring.
*
* <p> The {@code limit} parameter controls the number of times the
* pattern is applied and therefore affects the length of the resulting
......@@ -2325,6 +2331,8 @@ public final class String
(ch < Character.MIN_HIGH_SURROGATE ||
ch > Character.MAX_LOW_SURROGATE))
{
if (value.length == 0)
return new String[0];
int off = 0;
int next = 0;
boolean limited = limit > 0;
......
......@@ -1142,9 +1142,15 @@ public final class Pattern
* input sequence that is terminated by another subsequence that matches
* this pattern or is terminated by the end of the input sequence. The
* substrings in the array are in the order in which they occur in the
* input. If this pattern does not match any subsequence of the input then
* input. If this pattern does not match any subsequence of the input then
* the resulting array has just one element, namely the input sequence in
* string form.
* string form. A zero-length input sequence always results in a
* zero-length array.
*
* <p> When there is a positive-width match at the beginning of the input
* sequence then an empty leading substring is included at the beginning
* of the resulting array. A zero-width match at the beginning, however,
* never produces such an empty leading substring.
*
* <p> The <tt>limit</tt> parameter controls the number of times the
* pattern is applied and therefore affects the length of the resulting
......@@ -1185,7 +1191,6 @@ public final class Pattern
* <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
* </table></blockquote>
*
*
* @param input
* The character sequence to be split
*
......@@ -1196,6 +1201,8 @@ public final class Pattern
* around matches of this pattern
*/
public String[] split(CharSequence input, int limit) {
if (input.length() == 0)
return new String[0];
int index = 0;
boolean matchLimited = limit > 0;
ArrayList<String> matchList = new ArrayList<>();
......@@ -1204,6 +1211,11 @@ public final class Pattern
// Add segments before each match found
while(m.find()) {
if (!matchLimited || matchList.size() < limit - 1) {
if (index == 0 && index == m.start() && m.start() == m.end()) {
// no empty leading substring included for zero-width match
// at the beginning of the input char sequence.
continue;
}
String match = input.subSequence(index, m.start()).toString();
matchList.add(match);
index = m.end();
......@@ -5762,6 +5774,13 @@ NEXT: while (i <= last) {
* the resulting stream has just one element, namely the input sequence in
* string form.
*
* <p> A zero-length input sequence always results in an empty stream.
*
* <p> When there is a positive-width match at the beginning of the input
* sequence then an empty leading substring is included at the beginning
* of the stream. A zero-width match at the beginning, however, never
* produces such an empty leading substring.
*
* <p> If the input sequence is mutable, it must remain constant during the
* execution of the terminal stream operation. Otherwise, the result of the
* terminal stream operation is undefined.
......@@ -5817,7 +5836,8 @@ NEXT: while (i <= last) {
current = matcher.end();
if (!nextElement.isEmpty()) {
return true;
} else {
} else if (current > 0) { // no empty leading substring for zero-width
// match at the beginning of the input
emptyElementCount++;
}
}
......
......@@ -23,7 +23,7 @@
/**
* @test
* @bug 6840246
* @bug 6840246 6559590
* @summary test String.split()
*/
import java.util.Arrays;
......@@ -78,12 +78,11 @@ public class Split {
throw new RuntimeException("String.split failure 7");
}
// Check the case for limit == 0, source = "";
// split() now returns 0-length for empty source "" see #6559590
source = "";
String[] result = source.split("e", 0);
if (result.length != 1)
if (result.length != 0)
throw new RuntimeException("String.split failure 8");
if (!result[0].equals(source))
throw new RuntimeException("String.split failure 9");
// check fastpath of String.split()
source = "0123456789abcdefgABCDEFG";
......
......@@ -33,7 +33,8 @@
* 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
* 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
* 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
* 7067045 7014640 7189363 8007395 8013252 8013254 8012646 8023647
* 7067045 7014640 7189363 8007395 8013252 8013254 8012646 8023647 6559590
* 8027645
*/
import java.util.regex.*;
......@@ -148,6 +149,7 @@ public class RegExTest {
groupCurlyNotFoundSuppTest();
groupCurlyBackoffTest();
patternAsPredicate();
if (failure) {
throw new
RuntimeException("RegExTest failed, 1st failure: " +
......@@ -1776,13 +1778,68 @@ public class RegExTest {
failCount++;
}
// Check the case for limit == 0, source = "";
// split() now returns 0-length for empty source "" see #6559590
source = "";
result = source.split("e", 0);
if (result.length != 1)
failCount++;
if (!result[0].equals(source))
failCount++;
if (result.length != 0)
failCount++;
// Check both split() and splitAsStream(), especially for zero-length
// input and zero-length match cases
String[][] input = new String[][] {
{ " ", "Abc Efg Hij" }, // normal non-zero-match
{ " ", " Abc Efg Hij" }, // leading empty str for non-zero-match
{ " ", "Abc Efg Hij" }, // non-zero-match in the middle
{ "(?=\\p{Lu})", "AbcEfgHij" }, // no leading empty str for zero-match
{ "(?=\\p{Lu})", "AbcEfg" },
{ "(?=\\p{Lu})", "Abc" },
{ " ", "" }, // zero-length input
{ ".*", "" },
// some tests from PatternStreamTest.java
{ "4", "awgqwefg1fefw4vssv1vvv1" },
{ "\u00a3a", "afbfq\u00a3abgwgb\u00a3awngnwggw\u00a3a\u00a3ahjrnhneerh" },
{ "1", "awgqwefg1fefw4vssv1vvv1" },
{ "1", "a\u4ebafg1fefw\u4eba4\u9f9cvssv\u9f9c1v\u672c\u672cvv" },
{ "\u56da", "1\u56da23\u56da456\u56da7890" },
{ "\u56da", "1\u56da23\u9f9c\u672c\u672c\u56da456\u56da\u9f9c\u672c7890" },
{ "\u56da", "" },
{ "[ \t,:.]","This is,testing: with\tdifferent separators." }, //multiple septs
{ "o", "boo:and:foo" },
{ "o", "booooo:and:fooooo" },
{ "o", "fooooo:" },
};
String[][] expected = new String[][] {
{ "Abc", "Efg", "Hij" },
{ "", "Abc", "Efg", "Hij" },
{ "Abc", "", "Efg", "Hij" },
{ "Abc", "Efg", "Hij" },
{ "Abc", "Efg" },
{ "Abc" },
{},
{},
{ "awgqwefg1fefw", "vssv1vvv1" },
{ "afbfq", "bgwgb", "wngnwggw", "", "hjrnhneerh" },
{ "awgqwefg", "fefw4vssv", "vvv" },
{ "a\u4ebafg", "fefw\u4eba4\u9f9cvssv\u9f9c", "v\u672c\u672cvv" },
{ "1", "23", "456", "7890" },
{ "1", "23\u9f9c\u672c\u672c", "456", "\u9f9c\u672c7890" },
{},
{ "This", "is", "testing", "", "with", "different", "separators" },
{ "b", "", ":and:f" },
{ "b", "", "", "", "", ":and:f" },
{ "f", "", "", "", "", ":" },
};
for (int i = 0; i < input.length; i++) {
pattern = Pattern.compile(input[i][0]);
if (!Arrays.equals(pattern.split(input[i][1]), expected[i]))
failCount++;
if (!Arrays.equals(pattern.splitAsStream(input[i][1]).toArray(),
expected[i]))
failCount++;
}
report("Split");
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册