提交 5029f458 编写于 作者: H hannesw

8011714: Regexp decimal escape handling still not correct

Reviewed-by: lagergren, attila
上级 24978733
......@@ -108,15 +108,11 @@ final class RegExpScanner extends Scanner {
final int pos = iterator.next();
final int num = iterator.next();
if (num > caps.size()) {
// Non-existing reference should never match, if smaller than 8 convert to octal escape
// to be compatible with other engines.
if (num < 8) {
String escape = "\\x0" + num;
sb.insert(pos, escape);
} else {
neverMatches = true;
break;
}
// Non-existing backreference. If the number begins with a valid octal escape, convert that prefix
// to a Unicode escape and append the remaining digits as a literal character sequence.
final StringBuilder buffer = new StringBuilder();
octalOrLiteral(Integer.toString(num), buffer);
sb.insert(pos, buffer);
}
}
......@@ -632,7 +628,7 @@ final class RegExpScanner extends Scanner {
// form "\\ca".match([string with ascii 1 at char0]). Translating
// them to unicode does it though.
sb.setLength(sb.length() - 1);
unicode(c - 'A' + 1);
unicode(c - 'A' + 1, sb);
skip(1);
return true;
}
......@@ -673,7 +669,7 @@ final class RegExpScanner extends Scanner {
final int startIn = position;
final int startOut = sb.length();
if (ch0 == '0' && !isDecimalDigit(ch1)) {
if (ch0 == '0' && !isOctalDigit(ch1)) {
skip(1);
// DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicode value 0000).
sb.append("\u0000");
......@@ -681,50 +677,56 @@ final class RegExpScanner extends Scanner {
}
if (isDecimalDigit(ch0)) {
final int num = ch0 - '0';
// Single digit escape, treat as backreference.
if (!isDecimalDigit(ch1)) {
if (num <= caps.size() && caps.get(num - 1).getNegativeLookaheadLevel() > 0) {
if (ch0 == '0') {
// We know this is an octal escape.
if (inCharClass) {
// Convert octal escape to unicode escape if inside character class.
int octalValue = 0;
while (isOctalDigit(ch0)) {
octalValue = octalValue * 8 + ch0 - '0';
skip(1);
}
unicode(octalValue, sb);
} else {
// Copy decimal escape as-is
decimalDigits();
}
} else {
// This should be a backreference, but could also be an octal escape or even a literal string.
int decimalValue = 0;
while (isDecimalDigit(ch0)) {
decimalValue = decimalValue * 10 + ch0 - '0';
skip(1);
}
if (inCharClass) {
// No backreferences in character classes. Encode as unicode escape or literal char sequence
sb.setLength(sb.length() - 1);
octalOrLiteral(Integer.toString(decimalValue), sb);
} else if (decimalValue <= caps.size() && caps.get(decimalValue - 1).getNegativeLookaheadLevel() > 0) {
// Captures that live inside a negative lookahead are dead after the
// lookahead and will be undefined if referenced from outside.
if (caps.get(num - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
if (caps.get(decimalValue - 1).getNegativeLookaheadLevel() > negativeLookaheadLevel) {
sb.setLength(sb.length() - 1);
} else {
sb.append(ch0);
sb.append(decimalValue);
}
skip(1);
return true;
} else if (num > caps.size()) {
// Forward reference to a capture group. Forward references are always undefined so we
// can omit it from the output buffer. Additionally, if the capture group does not exist
// the whole regexp becomes invalid, so register the reference for later processing.
} else if (decimalValue > caps.size()) {
// Forward reference to a capture group. Forward references are always undefined so we can omit
// it from the output buffer. However, if the target capture does not exist, we need to rewrite
// the reference as hex escape or literal string, so register the reference for later processing.
sb.setLength(sb.length() - 1);
forwardReferences.add(num);
forwardReferences.add(decimalValue);
forwardReferences.add(sb.length());
skip(1);
return true;
}
}
if (inCharClass) {
// Convert octal escape to unicode escape if inside character class.
StringBuilder digit = new StringBuilder(4);
while (isDecimalDigit(ch0)) {
digit.append(ch0);
skip(1);
}
int value = Integer.parseInt(digit.toString(), 8); //throws exception that leads to SyntaxError if not octal
if (value > 0xff) {
throw new NumberFormatException(digit.toString());
} else {
// Append as backreference
sb.append(decimalValue);
}
unicode(value);
} else {
// Copy decimal escape as-is
decimalDigits();
}
return true;
}
......@@ -965,13 +967,41 @@ final class RegExpScanner extends Scanner {
return true;
}
// Appends the body of a unicode escape to the target buffer: the letter 'u' followed by the
// hexadecimal form of value, left-padded with '0' to four digits. No backslash is written here.
private void unicode(final int value) {
private void unicode(final int value, final StringBuilder buffer) {
final String hex = Integer.toHexString(value);
sb.append('u');
buffer.append('u');
// Pad with leading zeros so that the escape has at least four hex digits.
for (int i = 0; i < 4 - hex.length(); i++) {
sb.append('0');
buffer.append('0');
}
buffer.append(hex);
}
// Convert what would have been a backreference into a unicode escape, or a number literal, or both.
// Leading octal digits of numberLiteral are folded into octalValue. If that value is non-zero it is
// appended to buffer as a backslash plus unicode escape, followed by the unconsumed characters of
// numberLiteral as literals. Otherwise numberLiteral is appended to buffer unchanged.
private void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
final int length = numberLiteral.length();
int octalValue = 0;
int pos = 0;
// Maximum value for octal escape is 0377 (255) so we stop the loop at 32
// (a value below 32, times 8, plus one more octal digit is at most 255).
while (pos < length && octalValue < 0x20) {
final char ch = numberLiteral.charAt(pos);
if (isOctalDigit(ch)) {
octalValue = octalValue * 8 + ch - '0';
} else {
// A character that is not an octal digit ends the octal prefix.
break;
}
pos++;
}
if (octalValue > 0) {
// Emit the octal prefix as a unicode escape, then the remaining characters as literals.
buffer.append('\\');
unicode(octalValue, buffer);
buffer.append(numberLiteral.substring(pos));
} else {
// No usable octal prefix: keep the whole number as a literal character sequence.
buffer.append(numberLiteral);
}
// NOTE(review): `hex` is not declared in this method; the next line looks like a leftover from the
// removed body of unicode(int) in this diff view -- confirm before relying on it.
sb.append(hex);
}
// Tells whether ch is one of the ASCII octal digits '0' through '7'.
private static boolean isOctalDigit(final char ch) {
    // Anything below '0' can be rejected right away.
    if (ch < '0') {
        return false;
    }
    // Otherwise the character qualifies only if it does not exceed '7'.
    return ch <= '7';
}
private static boolean isDecimalDigit(final char ch) {
......
/*
* Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* JDK-8011714: Regexp decimal escape handling still not correct
*
* @test
* @run
*/
// \0 should be interpreted as <NUL> character here; the 8 that follows it is not an octal
// digit and is therefore a separate literal character.
print(/\08/.test("\x008"));
print(/[\08]/.test("8"));
print(/[\08]/.test("\x00"));
// Can't be converted to octal thus encoded as literal char sequence:
// \8 matches the character 8 and does not match a backslash.
print(/\8/.exec("\\8"));
print(/[\8]/.exec("\\"));
print(/[\8]/.exec("8"));
// 0471 is too high for an octal escape (the maximum is 0377), so only \047 (') is taken as
// the escape and the trailing 1 is a literal character. The character class cases below
// expect the same split: the class matches both "1" and "'".
print(/\471/.exec("\x271"));
print(/[\471]/.exec("1"));
print(/[\471]/.exec("\x27"));
// 0366 is a valid octal escape (246 decimal, 0xf6)
print(/\366/.test("\xf6"));
print(/[\366]/.test("\xf6"));
print(/[\366]/.test("\xf6"));
// more tests for conversion of invalid backreferences to octal escapes or literals.
// Four capture groups exist in each pattern: \4 is a real backreference, while \47, \48 and
// \84 refer to groups that do not exist and fall back to an octal escape and/or literal digits.
print(/(a)(b)(c)(d)\4/.exec("abcdd"));
print(/(a)(b)(c)(d)\4x/.exec("abcddx"));
// \47 is the octal escape for ' and not backreference 4 followed by 7, so "abcdd7" does not match.
print(/(a)(b)(c)(d)\47/.exec("abcdd7"));
print(/(a)(b)(c)(d)\47/.exec("abcd\x27"));
print(/(a)(b)(c)(d)\47xyz/.exec("abcd\x27xyz"));
print(/(a)(b)(c)(d)[\47]/.exec("abcd\x27"));
print(/(a)(b)(c)(d)[\47]xyz/.exec("abcd\x27xyz"));
// \48 is the octal escape \04 followed by a literal 8.
print(/(a)(b)(c)(d)\48/.exec("abcd\x048"));
print(/(a)(b)(c)(d)\48xyz/.exec("abcd\x048xyz"));
print(/(a)(b)(c)(d)[\48]/.exec("abcd\x04"));
print(/(a)(b)(c)(d)[\48]xyz/.exec("abcd\x04xyz"));
// \84 has no octal prefix at all, so both digits are literal characters.
print(/(a)(b)(c)(d)\84/.exec("abcd84"));
print(/(a)(b)(c)(d)\84xyz/.exec("abcd84xyz"));
print(/(a)(b)(c)(d)[\84]/.exec("abcd8"));
print(/(a)(b)(c)(d)[\84]xyz/.exec("abcd8xyz"));
true
true
true
8
null
8
'1
1
'
true
true
true
abcdd,a,b,c,d
abcddx,a,b,c,d
null
abcd',a,b,c,d
abcd'xyz,a,b,c,d
abcd',a,b,c,d
abcd'xyz,a,b,c,d
abcd8,a,b,c,d
abcd8xyz,a,b,c,d
abcd,a,b,c,d
abcdxyz,a,b,c,d
abcd84,a,b,c,d
abcd84xyz,a,b,c,d
abcd8,a,b,c,d
abcd8xyz,a,b,c,d
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册