提交 d0df35a2 编写于 作者: S sherman

7040220: java/char_encoding: Optimize UTF-8 charset for String.getBytes()/new String(byte[])

Summary: implement sun.nio.cs.ArrayEncoder/ArrayDecoder in UTF-8
Reviewed-by: alanb
上级 116c9f8c
......@@ -222,13 +222,13 @@ class StringCoding {
off = 0;
}
}
cd.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
if (cd instanceof ArrayDecoder) {
int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
return safeTrim(ca, clen, cs, isTrusted);
} else {
cd.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
CharBuffer cb = CharBuffer.wrap(ca);
try {
......@@ -356,13 +356,13 @@ class StringCoding {
off = 0;
}
}
ce.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
if (ce instanceof ArrayEncoder) {
int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba);
return safeTrim(ba, blen, cs, isTrusted);
} else {
ce.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca, off, len);
try {
......
......@@ -34,6 +34,8 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import sun.nio.cs.ArrayDecoder;
import sun.nio.cs.ArrayEncoder;
/**
* Utility class for zipfile name and comment decoding and encoding
......@@ -47,6 +49,15 @@ final class ZipCoder {
char[] ca = new char[len];
if (len == 0)
return new String(ca);
// UTF-8 only for now. Other ArrayDecoder implementations only handle
// CodingErrorAction.REPLACE mode. ZipCoder uses
// REPORT mode.
if (isUTF8 && cd instanceof ArrayDecoder) {
int clen = ((ArrayDecoder)cd).decode(ba, 0, length, ca);
if (clen == -1) // malformed
throw new IllegalArgumentException("MALFORMED");
return new String(ca, 0, clen);
}
ByteBuffer bb = ByteBuffer.wrap(ba, 0, length);
CharBuffer cb = CharBuffer.wrap(ca);
CoderResult cr = cd.decode(bb, cb, true);
......@@ -69,6 +80,14 @@ final class ZipCoder {
byte[] ba = new byte[len];
if (len == 0)
return ba;
// UTF-8 only for now. Other ArrayEncoder implementations only handle
// CodingErrorAction.REPLACE mode.
if (isUTF8 && ce instanceof ArrayEncoder) {
int blen = ((ArrayEncoder)ce).encode(ca, 0, ca.length, ba);
if (blen == -1) // malformed
throw new IllegalArgumentException("MALFORMED");
return Arrays.copyOf(ba, blen);
}
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca);
CoderResult cr = ce.encode(cb, bb, true);
......@@ -85,7 +104,7 @@ final class ZipCoder {
// assume invoked only if "this" is not utf8
byte[] getBytesUTF8(String s) {
if (isutf8)
if (isUTF8)
return getBytes(s);
if (utf8 == null)
utf8 = new ZipCoder(StandardCharset.UTF_8);
......@@ -94,7 +113,7 @@ final class ZipCoder {
String toStringUTF8(byte[] ba, int len) {
if (isutf8)
if (isUTF8)
return toString(ba, len);
if (utf8 == null)
utf8 = new ZipCoder(StandardCharset.UTF_8);
......@@ -102,18 +121,18 @@ final class ZipCoder {
}
boolean isUTF8() {
return isutf8;
return isUTF8;
}
private Charset cs;
private CharsetDecoder dec;
private CharsetEncoder enc;
private boolean isutf8;
private boolean isUTF8;
private ZipCoder utf8;
private ZipCoder(Charset cs) {
this.cs = cs;
this.isutf8 = cs.name().equals(StandardCharset.UTF_8.name());
this.isUTF8 = cs.name().equals(StandardCharset.UTF_8.name());
}
static ZipCoder get(Charset charset) {
......
......@@ -32,6 +32,7 @@ import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
/* Legal UTF-8 Byte Sequences
*
......@@ -77,7 +78,8 @@ class UTF_8 extends Unicode
dst.position(dp - dst.arrayOffset());
}
private static class Decoder extends CharsetDecoder {
private static class Decoder extends CharsetDecoder
implements ArrayDecoder {
private Decoder(Charset cs) {
super(cs, 1.0f, 1.0f);
}
......@@ -353,9 +355,132 @@ class UTF_8 extends Unicode
else
return decodeBufferLoop(src, dst);
}
// Lazily provides a ByteBuffer over "ba" positioned at "sp".
// A non-null "bb" is reused as-is (only its position is moved); otherwise
// the whole array is wrapped, so the new buffer's limit is ba.length.
private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp)
{
    ByteBuffer view = (bb != null) ? bb : ByteBuffer.wrap(ba);
    view.position(sp);
    return view;
}
// Array-based UTF-8 decode: converts the bytes sa[sp .. sp+len) into
// chars stored in "da" starting at index 0, and returns the number of
// chars written.
//
// Returns -1 if there is malformed byte(s) and the "action" for
// malformed input is not REPLACE. In REPLACE mode each malformed
// sequence produces replacement().charAt(0) (only the first char of
// the replacement string is used).
//
// The isMalformed2/3/4 and malformedN helpers are defined elsewhere in
// this class; here they are only relied on to flag a bad sequence and to
// report (via CoderResult.length()) how many bytes to skip.
public int decode(byte[] sa, int sp, int len, char[] da) {
// exclusive end index of the input region
final int sl = sp + len;
// next write index into da
int dp = 0;
// upper bound for the ASCII fast path: never run past the input
// length or the destination capacity
int dlASCII = Math.min(len, da.length);
ByteBuffer bb = null; // only necessary if malformed
// ASCII only optimized loop
// (a non-negative byte is a 7-bit value and maps 1:1 to a char)
while (dp < dlASCII && sa[sp] >= 0)
da[dp++] = (char) sa[sp++];
// General loop: b1 is a sign-extended byte, so the arithmetic shifts
// below compare its leading bits against the 110/1110/11110 prefixes.
while (sp < sl) {
int b1 = sa[sp++];
if (b1 >= 0) {
// 1 byte, 7 bits: 0xxxxxxx
da[dp++] = (char) b1;
} else if ((b1 >> 5) == -2) {
// 2 bytes, 11 bits: 110xxxxx 10xxxxxx
if (sp < sl) {
int b2 = sa[sp++];
if (isMalformed2(b1, b2)) {
if (malformedInputAction() != CodingErrorAction.REPLACE)
return -1;
da[dp++] = replacement().charAt(0);
// step back so b2 is re-examined as the start of the next sequence
sp--; // malformedN(bb, 2) always returns 1
} else {
// XOR with the sign-extended marker bytes cancels the 110/10
// prefix bits (and the sign extension of b1/b2)
da[dp++] = (char) (((b1 << 6) ^ b2)^
(((byte) 0xC0 << 6) ^
((byte) 0x80 << 0)));
}
continue;
}
// lead byte was the last input byte: truncated sequence
if (malformedInputAction() != CodingErrorAction.REPLACE)
return -1;
da[dp++] = replacement().charAt(0);
return dp;
} else if ((b1 >> 4) == -2) {
// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
if (sp + 1 < sl) {
int b2 = sa[sp++];
int b3 = sa[sp++];
if (isMalformed3(b1, b2, b3)) {
if (malformedInputAction() != CodingErrorAction.REPLACE)
return -1;
da[dp++] = replacement().charAt(0);
// rewind to b1, then skip as many bytes as malformedN reports
sp -=3;
bb = getByteBuffer(bb, sa, sp);
sp += malformedN(bb, 3).length();
} else {
// same marker-cancelling XOR as the 2-byte case, for 1110/10/10
da[dp++] = (char)((b1 << 12) ^
(b2 << 6) ^
(b3 ^
(((byte) 0xE0 << 12) ^
((byte) 0x80 << 6) ^
((byte) 0x80 << 0))));
}
continue;
}
// fewer than two bytes follow the lead byte: truncated sequence
if (malformedInputAction() != CodingErrorAction.REPLACE)
return -1;
da[dp++] = replacement().charAt(0);
return dp;
} else if ((b1 >> 3) == -2) {
// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
if (sp + 2 < sl) {
int b2 = sa[sp++];
int b3 = sa[sp++];
int b4 = sa[sp++];
// candidate code point; validated below before it is used
int uc = ((b1 << 18) ^
(b2 << 12) ^
(b3 << 6) ^
(b4 ^
(((byte) 0xF0 << 18) ^
((byte) 0x80 << 12) ^
((byte) 0x80 << 6) ^
((byte) 0x80 << 0))));
if (isMalformed4(b2, b3, b4) ||
// shortest form check
!Character.isSupplementaryCodePoint(uc)) {
if (malformedInputAction() != CodingErrorAction.REPLACE)
return -1;
da[dp++] = replacement().charAt(0);
// rewind to b1, then skip as many bytes as malformedN reports
sp -= 4;
bb = getByteBuffer(bb, sa, sp);
sp += malformedN(bb, 4).length();
} else {
// a supplementary code point is stored as a surrogate pair (two chars)
da[dp++] = Character.highSurrogate(uc);
da[dp++] = Character.lowSurrogate(uc);
}
continue;
}
// fewer than three bytes follow the lead byte: truncated sequence
if (malformedInputAction() != CodingErrorAction.REPLACE)
return -1;
da[dp++] = replacement().charAt(0);
return dp;
} else {
// any other lead byte value is not a valid start of a sequence
if (malformedInputAction() != CodingErrorAction.REPLACE)
return -1;
da[dp++] = replacement().charAt(0);
// rewind to b1 and let malformedN decide how many bytes to skip
sp--;
bb = getByteBuffer(bb, sa, sp);
CoderResult cr = malformedN(bb, 1);
if (!cr.isError()) {
// leading byte for 5 or 6-byte, but don't have enough
// bytes in buffer to check. Consumed rest as malformed.
return dp;
}
sp += cr.length();
}
}
return dp;
}
}
private static class Encoder extends CharsetEncoder {
private static class Encoder extends CharsetEncoder
implements ArrayEncoder {
private Encoder(Charset cs) {
super(cs, 1.1f, 3.0f);
......@@ -495,5 +620,50 @@ class UTF_8 extends Unicode
else
return encodeBufferLoop(src, dst);
}
// Array-based UTF-8 encode: converts the chars sa[sp .. sp+len) into
// bytes stored in "da" starting at index 0 and returns the number of
// bytes written.
// Returns -1 if there is malformed char(s) and the
// "action" for malformed input is not REPLACE.
public int encode(char[] sa, int sp, int len, byte[] da) {
    final int end = sp + len;
    final int asciiLimit = Math.min(len, da.length);
    int out = 0;

    // Fast path: copy the leading run of ASCII chars one-to-one.
    for (; out < asciiLimit && sa[sp] < '\u0080'; out++, sp++) {
        da[out] = (byte) sa[sp];
    }

    while (sp < end) {
        final char ch = sa[sp++];

        if (ch < 0x80) {
            // 1 byte, at most seven bits
            da[out++] = (byte) ch;
            continue;
        }
        if (ch < 0x800) {
            // 2 bytes, 11 bits
            da[out++] = (byte) (0xc0 | (ch >> 6));
            da[out++] = (byte) (0x80 | (ch & 0x3f));
            continue;
        }
        if (!Character.isSurrogate(ch)) {
            // 3 bytes, 16 bits
            da[out++] = (byte) (0xe0 | ((ch >> 12)));
            da[out++] = (byte) (0x80 | ((ch >> 6) & 0x3f));
            da[out++] = (byte) (0x80 | (ch & 0x3f));
            continue;
        }

        // Surrogate: let the (lazily created) parser combine it with
        // the following char; a negative result means no valid pair.
        if (sgp == null) {
            sgp = new Surrogate.Parser();
        }
        final int codePoint = sgp.parse(ch, sa, sp - 1, end);
        if (codePoint >= 0) {
            // 4 bytes, 21 bits
            da[out++] = (byte) (0xf0 | ((codePoint >> 18)));
            da[out++] = (byte) (0x80 | ((codePoint >> 12) & 0x3f));
            da[out++] = (byte) (0x80 | ((codePoint >> 6) & 0x3f));
            da[out++] = (byte) (0x80 | (codePoint & 0x3f));
            sp++; // the pair consumed two chars
            continue;
        }
        if (malformedInputAction() != CodingErrorAction.REPLACE) {
            return -1;
        }
        // only the first replacement byte is emitted
        da[out++] = replacement()[0];
    }
    return out;
}
}
}
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
import java.util.*;
import java.nio.*;
import java.nio.charset.*;
/**
 * Micro-benchmark for UTF-8 conversion through {@code new String(byte[], ...)}
 * and {@code String.getBytes(...)}, using both the charset-name and the
 * {@link Charset} overloads.
 *
 * <p>For each {@code nb} in 1..4 a 2048-char sample is filled with random
 * code points taken from {@code [starts[nb-1], starts[nb-1] + 0x7f)}, i.e.
 * code points whose UTF-8 form is {@code nb} bytes long. Prefixes of the
 * encoded sample (12, 24, 48, ... bytes) are then timed.
 *
 * <p>The iteration count is read from the {@code iterations} system property
 * (default 100000). Timing and job filtering are delegated to
 * {@code StrCodingBenchmark}, which is defined in another file.
 */
public class StrCodingBenchmarkUTF8 {

    public static void main(String[] args) throws Throwable {
        final int itrs = Integer.getInteger("iterations", 100000);
        final int size = 2048;
        final Random rnd = new Random();
        final int maxchar = 0x7f;
        final Charset cs = Charset.forName("UTF-8");
        final String csn = cs.name();

        // first code point of the 1-, 2-, 3- and 4-byte UTF-8 ranges
        int[] starts = new int[] { 0, 0x80, 0x800, 0x10000};
        for (int nb = 1; nb <= 4; nb++) {

            // Fill the sample; stop 3 chars early so a surrogate pair
            // always fits (the unfilled tail stays '\0').
            char[] cc = new char[size];
            int i = 0;
            while (i < size - 3) {
                i += Character.toChars(starts[nb - 1] + rnd.nextInt(maxchar), cc, i);
            }

            final String string = new String(cc);
            final byte[] bytes = string.getBytes(cs);

            System.out.printf("%n--------%s[nb=%d]---------%n", csn, nb);
            int sz = 12;
            while (sz < size) {
                System.out.printf("   [len=%d]%n", sz);
                final byte[] bs = Arrays.copyOf(bytes, sz);
                final String str = new String(bs, csn);
                StrCodingBenchmark.Job[] jobs = {
                    new StrCodingBenchmark.Job("String decode: csn") {
                        public void work() throws Throwable {
                            for (int i = 0; i < itrs; i++)
                                new String(bs, csn);
                        }},

                    new StrCodingBenchmark.Job("String decode: cs") {
                        public void work() throws Throwable {
                            for (int i = 0; i < itrs; i++)
                                new String(bs, cs);
                        }},

                    new StrCodingBenchmark.Job("String encode: csn") {
                        public void work() throws Throwable {
                            for (int i = 0; i < itrs; i++)
                                str.getBytes(csn);
                        }},

                    new StrCodingBenchmark.Job("String encode: cs") {
                        public void work() throws Throwable {
                            for (int i = 0; i < itrs; i++)
                                str.getBytes(cs);
                        }},
                };
                StrCodingBenchmark.time(StrCodingBenchmark.filter(null, jobs));
                sz <<= 1;
            }
        }
    }
}
......@@ -24,7 +24,7 @@
*/
/* @test
@bug 6636323 6636319
@bug 6636323 6636319 7040220
@summary Test if StringCoding and NIO result have the same de/encoding result
* @run main/othervm/timeout=2000 TestStringCoding
*/
......@@ -111,6 +111,8 @@ public class TestStringCoding {
//encode unmappable surrogates
if (enc instanceof sun.nio.cs.ArrayEncoder &&
cs.contains(Charset.forName("ASCII"))) {
if (cs.name().equals("UTF-8")) // utf8 handles surrogates
return;
enc.replaceWith(new byte[] { (byte)'A'});
sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder)enc;
......
/*
* Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/* @test
@bug 7040220
@summary Test if StringCoding and NIO result have the same de/encoding result for UTF-8
* @run main/othervm/timeout=2000 TestStringCodingUTF8
*/
import java.util.*;
import java.nio.*;
import java.nio.charset.*;
/**
 * Checks that the String-level UTF-8 conversions ({@code String.getBytes}
 * and {@code new String(byte[], ...)}) produce the same results as a
 * {@link CharsetEncoder}/{@link CharsetDecoder} driven over wrapped buffers
 * with REPLACE error actions.
 */
public class TestStringCodingUTF8 {

    /** Runs the whole suite twice: first plain, then with a security manager installed. */
    public static void main(String[] args) throws Throwable {
        test();
        // security manager on
        System.setSecurityManager(new PermissiveSecurityManger());
        test();
    }

    /** Drives the comparisons over fixed, shuffled and random inputs. */
    static void test() throws Throwable {
        final Charset utf8 = Charset.forName("UTF-8");

        // every char value 0x0000..0xFFFF, in ascending order
        char[] everyChar = new char[0x10000];
        for (int c = 0; c < everyChar.length; c++) {
            everyChar[c] = (char) c;
        }
        test(utf8, everyChar, 0, everyChar.length);

        // code points 0..0x1FFFF in random order
        List<Integer> codePoints = new ArrayList<>(0x20000);
        for (int cp = 0; cp < 0x20000; cp++) {
            codePoints.add(cp);
        }
        Collections.shuffle(codePoints);

        // 0x10000 one-char values plus 0x10000 surrogate pairs
        char[] shuffled = new char[0x30000];
        int filled = 0;
        for (int cp : codePoints) {
            filled += Character.toChars(cp, shuffled, filled);
        }
        assert (filled == shuffled.length);
        test(utf8, shuffled, 0, shuffled.length);

        // random "off" and "len" windows on the shuffled data
        final Random rnd = new Random();
        final int maxlen = 1000;
        final int rounds = 5000;
        for (int round = 0; round < rounds; round++) {
            int off = rnd.nextInt(shuffled.length - maxlen);
            int len = rnd.nextInt(maxlen);
            test(utf8, shuffled, off, len);
        }

        // random bytes of random length, to hit malformed-input edge cases
        for (int round = 0; round < rounds; round++) {
            byte[] noise = new byte[rnd.nextInt(maxlen)];
            rnd.nextBytes(noise);
            String expected = new String(decode(utf8, noise, 0, noise.length));

            // new String(csn)
            verify(new String(noise, utf8.name()).equals(expected),
                   "new String(csn) failed");
            // new String(cs)
            verify(new String(noise, utf8).equals(expected),
                   "new String(cs) failed");
        }
        System.out.println("done!");
    }

    /**
     * Compares the String conversions of {@code ca[off .. off+len)} against
     * the reference {@link #encode} / {@link #decode} results.
     *
     * @throws RuntimeException on the first mismatch
     */
    static void test(Charset cs, char[] ca, int off, int len) throws Throwable {
        final String text = new String(ca, off, len);
        final byte[] refBytes = encode(cs, ca, off, len);

        // getBytes(csn)
        verify(Arrays.equals(refBytes, text.getBytes(cs.name())),
               "getBytes(csn) failed");
        // getBytes(cs)
        verify(Arrays.equals(refBytes, text.getBytes(cs)),
               "getBytes(cs) failed");

        final String refText = new String(decode(cs, refBytes, 0, refBytes.length));

        // new String(csn)
        verify(new String(refBytes, cs.name()).equals(refText),
               "new String(csn) failed");
        // new String(cs)
        verify(new String(refBytes, cs).equals(refText),
               "new String(cs) failed");
    }

    // Fails the test with the given message unless the condition holds.
    private static void verify(boolean ok, String failure) {
        if (!ok) {
            throw new RuntimeException(failure);
        }
    }

    // Reference decoder, modelled on StringCoding.decode(): a fresh
    // CharsetDecoder in REPLACE mode run over wrapped buffers.
    static char[] decode(Charset cs, byte[] ba, int off, int len) {
        CharsetDecoder decoder = cs.newDecoder();
        char[] out = new char[(int)(len * decoder.maxCharsPerByte())];
        if (len == 0) {
            return out;
        }
        decoder.onMalformedInput(CodingErrorAction.REPLACE);
        decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        decoder.reset();

        ByteBuffer source = ByteBuffer.wrap(ba, off, len);
        CharBuffer sink = CharBuffer.wrap(out);
        try {
            CoderResult result = decoder.decode(source, sink, true);
            if (result.isUnderflow()) {
                result = decoder.flush(sink);
            }
            // anything other than underflow at either step is an error
            if (!result.isUnderflow()) {
                result.throwException();
            }
        } catch (CharacterCodingException x) {
            throw new Error(x);
        }
        return Arrays.copyOf(out, sink.position());
    }

    // Reference encoder, modelled on StringCoding.encode(): a fresh
    // CharsetEncoder in REPLACE mode run over wrapped buffers.
    static byte[] encode(Charset cs, char[] ca, int off, int len) {
        CharsetEncoder encoder = cs.newEncoder();
        byte[] out = new byte[(int)(len * encoder.maxBytesPerChar())];
        if (len == 0) {
            return out;
        }
        encoder.onMalformedInput(CodingErrorAction.REPLACE);
        encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        encoder.reset();

        ByteBuffer sink = ByteBuffer.wrap(out);
        CharBuffer source = CharBuffer.wrap(ca, off, len);
        try {
            CoderResult result = encoder.encode(source, sink, true);
            if (result.isUnderflow()) {
                result = encoder.flush(sink);
            }
            // anything other than underflow at either step is an error
            if (!result.isUnderflow()) {
                result.throwException();
            }
        } catch (CharacterCodingException x) {
            throw new Error(x);
        }
        return Arrays.copyOf(out, sink.position());
    }

    // Security manager that grants every permission request.
    static class PermissiveSecurityManger extends SecurityManager {
        @Override
        public void checkPermission(java.security.Permission p) {
        }
    }
}
......@@ -23,7 +23,7 @@
/*
* @test
* @bug 4486841
* @bug 4486841 7040220
* @summary Test UTF-8 charset
*/
......@@ -70,6 +70,32 @@ public class TestUTF8 {
return dec.decode(bbf, cbf, true);
}
// Reference decoder, a copy of the StringCoding.decode() logic: decodes
// ba[off .. off+len) with a fresh CharsetDecoder in REPLACE mode and
// returns exactly the chars produced.
static char[] decode(Charset cs, byte[] ba, int off, int len) {
    CharsetDecoder decoder = cs.newDecoder();
    char[] out = new char[(int)(len * decoder.maxCharsPerByte())];
    if (len == 0) {
        return out;
    }
    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    decoder.reset();

    ByteBuffer source = ByteBuffer.wrap(ba, off, len);
    CharBuffer sink = CharBuffer.wrap(out);
    try {
        CoderResult result = decoder.decode(source, sink, true);
        if (result.isUnderflow()) {
            result = decoder.flush(sink);
        }
        // anything other than underflow at either step is an error
        if (!result.isUnderflow()) {
            result.throwException();
        }
    } catch (CharacterCodingException x) {
        throw new Error(x);
    }
    return Arrays.copyOf(out, sink.position());
}
static byte[] encode(char[] cc, String csn, boolean testDirect)
throws Exception {
ByteBuffer bbf;
......@@ -142,7 +168,14 @@ public class TestUTF8 {
bb = encode(cc, csn, true);
ccO = decode(bb, csn, true);
if (!Arrays.equals(cc, ccO)) {
System.out.printf(" (direct) failed");
System.out.print(" (direct) failed");
}
// String.getBytes()/toCharArray() goes to ArrayDe/Encoder path
if (!Arrays.equals(bb, new String(cc).getBytes(csn))) {
System.out.printf(" String.getBytes() failed");
}
if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
System.out.printf(" String.toCharArray() failed");
}
System.out.println();
}
......@@ -168,6 +201,12 @@ public class TestUTF8 {
if (!Arrays.equals(cc, ccO)) {
System.out.printf(" decoding(direct) failed%n");
}
// new String(bb, csn).getBytes(csn) will not return
// the 6 bytes surrogates as in bb, so only test
// toCharArray() here.
if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
System.out.printf(" String.toCharArray() failed");
}
}
static void compare(String csn1, String csn2) throws Exception {
......@@ -274,6 +313,7 @@ public class TestUTF8 {
static void checkMalformed(String csn) throws Exception {
boolean failed = false;
System.out.printf(" Check malformed <%s>...%n", csn);
Charset cs = Charset.forName(csn);
for (boolean direct: new boolean[] {false, true}) {
for (byte[] bins : malformed) {
int mlen = bins[0];
......@@ -285,10 +325,15 @@ public class TestUTF8 {
ashex += Integer.toBinaryString((int)bin[i] & 0xff);
}
if (!cr.isMalformed()) {
System.out.printf(" FAIL(direct=%b): [%s] not malformed.\n", direct, ashex);
System.out.printf(" FAIL(direct=%b): [%s] not malformed.%n", direct, ashex);
failed = true;
} else if (cr.length() != mlen) {
System.out.printf(" FAIL(direct=%b): [%s] malformed[len=%d].\n", direct, ashex, cr.length());
System.out.printf(" FAIL(direct=%b): [%s] malformed[len=%d].%n", direct, ashex, cr.length());
failed = true;
}
if (!Arrays.equals(decode(cs, bin, 0, bin.length),
new String(bin, csn).toCharArray())) {
System.out.printf(" FAIL(new String(bb, %s)) failed%n", csn);
failed = true;
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册