提交 852c6940 编写于 作者: S sherman

7183053: Optimize DoubleByte charset for String.getBytes()/new String(byte[])

Summary: DoubleByte implements the sun.nio.cs.ArrayDecoder/ArrayEncoder interfaces
Reviewed-by: alanb
上级 e57ac292
......@@ -33,6 +33,8 @@ import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.util.Arrays;
import sun.nio.cs.Surrogate;
import sun.nio.cs.ArrayDecoder;
import sun.nio.cs.ArrayEncoder;
import static sun.nio.cs.CharsetMapping.*;
/*
......@@ -107,7 +109,7 @@ public class DoubleByte {
}
public static class Decoder extends CharsetDecoder
implements DelegatableDecoder
implements DelegatableDecoder, ArrayDecoder
{
final char[][] b2c;
......@@ -209,6 +211,29 @@ public class DoubleByte {
return decodeBufferLoop(src, dst);
}
/**
 * Array-based decode: converts {@code len} bytes of {@code src}, starting
 * at index {@code sp}, into chars written to {@code dst} from index 0.
 *
 * A byte that has no single-byte mapping is treated as the lead byte of a
 * two-byte sequence. Anything that still cannot be mapped (missing trail
 * byte, trail byte outside [b2Min, b2Max], or no table entry) produces the
 * first char of the decoder's replacement string. This method performs no
 * capacity check on {@code dst}.
 *
 * @return the number of chars written to {@code dst}
 */
public int decode(byte[] src, int sp, int len, char[] dst) {
    final int end = sp + len;
    final char substitute = replacement().charAt(0);
    int out = 0;
    for (int in = sp; in < end; ) {
        final int lead = src[in++] & 0xff;
        char decoded = b2cSB[lead];
        if (decoded == UNMAPPABLE_DECODING) {
            // Not a single-byte char: try to pair it with the next byte.
            if (in < end) {
                final int trail = src[in++] & 0xff;
                if (b2Min <= trail && trail <= b2Max) {
                    decoded = b2c[lead][trail - b2Min];
                }
            }
            // Truncated, out-of-range or unmapped pairs all end up here.
            if (decoded == UNMAPPABLE_DECODING) {
                decoded = substitute;
            }
        }
        dst[out++] = decoded;
    }
    return out;
}
/**
 * Resets this decoder by delegating to the superclass implementation;
 * no additional state is cleared here.
 */
public void implReset() {
super.implReset();
}
......@@ -228,6 +253,7 @@ public class DoubleByte {
return UNMAPPABLE_DECODING;
return b2c[b1][b2 - b2Min];
}
}
// IBM_EBCDIC_DBCS
......@@ -367,6 +393,46 @@ public class DoubleByte {
src.position(mark);
}
}
/**
 * Array-based decode for the shift-state (SO/SI) variant: converts
 * {@code len} bytes of {@code src}, starting at {@code sp}, into chars
 * written to {@code dst} from index 0.
 *
 * Decoding always starts in the SBCS state. SO switches SBCS to DBCS and
 * SI switches DBCS to SBCS; a shift byte seen in the wrong state emits the
 * replacement char instead. In DBCS state every char consumes two bytes.
 * Any undecodable input yields the first char of the replacement string.
 * This method performs no capacity check on {@code dst}.
 *
 * @return the number of chars written to {@code dst}
 */
public int decode(byte[] src, int sp, int len, char[] dst) {
    final int end = sp + len;
    int out = 0;
    currentState = SBCS;
    final char substitute = replacement().charAt(0);
    int in = sp;
    while (in < end) {
        final int lead = src[in++] & 0xff;

        if (lead == SO) {
            // Shift out: only legal while in single-byte mode.
            if (currentState == SBCS) {
                currentState = DBCS;
            } else {
                dst[out++] = substitute;
            }
            continue;
        }
        if (lead == SI) {
            // Shift in: only legal while in double-byte mode.
            if (currentState == DBCS) {
                currentState = SBCS;
            } else {
                dst[out++] = substitute;
            }
            continue;
        }

        char decoded = UNMAPPABLE_DECODING;
        if (currentState == SBCS) {
            decoded = b2cSB[lead];
        } else if (in != end) {
            final int trail = src[in++] & 0xff;
            if (trail >= b2Min && trail <= b2Max) {
                decoded = b2c[lead][trail - b2Min];
            }
        }
        // A truncated pair, an out-of-range trail byte or a missing table
        // entry all leave 'decoded' unmapped.
        if (decoded == UNMAPPABLE_DECODING) {
            decoded = substitute;
        }
        dst[out++] = decoded;
    }
    return out;
}
}
// EBCDIC_DBCS_ONLY
......@@ -405,9 +471,37 @@ public class DoubleByte {
return CoderResult.malformedForLength(1);
return CoderResult.unmappableForLength(2);
}
/**
 * Array-based decode: converts {@code len} bytes of {@code src}, starting
 * at {@code sp}, into chars written to {@code dst} from index 0.
 *
 * A byte without a single-byte mapping is paired with the following byte.
 * When the pair cannot be decoded, the replacement char is emitted; if the
 * lead byte was SS2 or SS3 the trail byte is pushed back so that it is
 * examined again as the start of the next char. This method performs no
 * capacity check on {@code dst}.
 *
 * @return the number of chars written to {@code dst}
 */
public int decode(byte[] src, int sp, int len, char[] dst) {
    final int end = sp + len;
    final char substitute = replacement().charAt(0);
    int out = 0;
    int in = sp;
    while (in < end) {
        final int lead = src[in++] & 0xff;
        char decoded = b2cSB[lead];
        if (decoded == UNMAPPABLE_DECODING) {
            if (in >= end) {
                // Lead byte at the very end of the input.
                decoded = substitute;
            } else {
                final int trail = src[in++] & 0xff;
                if (trail >= b2Min && trail <= b2Max) {
                    decoded = b2c[lead][trail - b2Min];
                }
                if (decoded == UNMAPPABLE_DECODING) {
                    if (lead == SS2 || lead == SS3) {
                        // Only the single-shift byte is consumed.
                        in--;
                    }
                    decoded = substitute;
                }
            }
        }
        dst[out++] = decoded;
    }
    return out;
}
}
public static class Encoder extends CharsetEncoder {
public static class Encoder extends CharsetEncoder
implements ArrayEncoder
{
final int MAX_SINGLEBYTE = 0xff;
private final char[] c2b;
private final char[] c2bIndex;
......@@ -516,6 +610,35 @@ public class DoubleByte {
return encodeBufferLoop(src, dst);
}
/**
 * Array-based encode: converts {@code len} chars of {@code src}, starting
 * at {@code sp}, into bytes written to {@code dst} from index 0.
 *
 * Unmappable chars are replaced with the first one or two bytes of the
 * encoder's replacement; a well-formed surrogate pair is replaced as a
 * single unit. This method performs no capacity check on {@code dst}, so
 * the caller must supply a large enough array.
 *
 * @return the number of bytes written to {@code dst}
 */
public int encode(char[] src, int sp, int len, byte[] dst) {
    int dp = 0;
    int sl = sp + len;
    while (sp < sl) {
        char c = src[sp++];
        int bb = encodeChar(c);
        if (bb == UNMAPPABLE_ENCODING) {
            // Swallow the low half of a surrogate pair so that the pair
            // produces one replacement rather than two.
            if (Character.isHighSurrogate(c) && sp < sl &&
                Character.isLowSurrogate(src[sp])) {
                sp++;
            }
            byte[] repl = replacement();
            dst[dp++] = repl[0];
            if (repl.length > 1)
                dst[dp++] = repl[1];
            continue;
        }
        if (bb > MAX_SINGLEBYTE) {    // DoubleByte
            dst[dp++] = (byte)(bb >> 8);
            dst[dp++] = (byte)bb;
        } else {                      // SingleByte
            dst[dp++] = (byte)bb;
        }
    }
    return dp;
}
/**
 * Looks up the encoded value of {@code ch} in the two-level c2b table:
 * the high byte of the char selects an offset from {@code c2bIndex}, and
 * the low byte is added to that offset to index {@code c2b}.
 *
 * @return the table entry for {@code ch}
 */
public int encodeChar(char ch) {
    final int pageOffset = c2bIndex[ch >> 8];
    final int lowByte = ch & 0xff;
    return c2b[pageOffset + lowByte];
}
......@@ -604,7 +727,6 @@ public class DoubleByte {
}
}
// EBCDIC_DBCS_ONLY
public static class Encoder_EBCDIC_DBCSONLY extends Encoder {
Encoder_EBCDIC_DBCSONLY(Charset cs, byte[] repl,
char[] c2b, char[] c2bIndex) {
......@@ -619,7 +741,6 @@ public class DoubleByte {
}
}
// for IBM_EBCDIC_DBCS
public static class Encoder_EBCDIC extends Encoder {
static final int SBCS = 0;
static final int DBCS = 1;
......@@ -741,6 +862,47 @@ public class DoubleByte {
src.position(mark);
}
}
/**
 * Array-based encode for the shift-state (SO/SI) variant: converts
 * {@code len} chars of {@code src}, starting at {@code sp}, into bytes
 * written to {@code dst} from index 0.
 *
 * An SO byte is emitted before the first double-byte char of a run and an
 * SI byte before returning to single-byte output; if the output ends in
 * DBCS state a closing SI is appended. Unmappable chars (a well-formed
 * surrogate pair counts as one) are replaced with the first one or two
 * bytes of the replacement, written without any shift handling. This
 * method performs no capacity check on {@code dst}.
 *
 * @return the number of bytes written to {@code dst}
 */
public int encode(char[] src, int sp, int len, byte[] dst) {
    final int end = sp + len;
    int out = 0;
    for (int in = sp; in < end; ) {
        final char ch = src[in++];
        final int code = encodeChar(ch);

        if (code == UNMAPPABLE_ENCODING) {
            final boolean isPair = Character.isHighSurrogate(ch)
                && in < end
                && Character.isLowSurrogate(src[in]);
            if (isPair) {
                in++;
            }
            final byte[] substitute = replacement();
            dst[out++] = substitute[0];
            if (substitute.length > 1) {
                dst[out++] = substitute[1];
            }
        } else if (code > MAX_SINGLEBYTE) {
            // Double-byte char: make sure we are shifted out first.
            if (currentState == SBCS) {
                currentState = DBCS;
                dst[out++] = SO;
            }
            dst[out++] = (byte)(code >> 8);
            dst[out++] = (byte)code;
        } else {
            // Single-byte char: make sure we are shifted in first.
            if (currentState == DBCS) {
                currentState = SBCS;
                dst[out++] = SI;
            }
            dst[out++] = (byte)code;
        }
    }

    // Never leave the output in double-byte mode.
    if (currentState == DBCS) {
        currentState = SBCS;
        dst[out++] = SI;
    }
    return out;
}
}
// EUC_SIMPLE
......
......@@ -175,6 +175,40 @@ public class HKSCS {
}
}
/**
 * Array-based decode: converts {@code len} bytes of {@code src}, starting
 * at {@code sp}, into chars written to {@code dst} from index 0.
 *
 * Each byte is first tried as a single-byte char. Otherwise it is paired
 * with the next byte and looked up, in order, via decodeDouble,
 * decodeDoubleEx (whose result is offset by 0x20000 and written as a
 * surrogate pair) and decodeBig5. Undecodable input yields the first char
 * of the replacement string. This method performs no capacity check on
 * {@code dst}.
 *
 * @return the number of chars written to {@code dst}
 */
public int decode(byte[] src, int sp, int len, char[] dst) {
    final int end = sp + len;
    final char substitute = replacement().charAt(0);
    int out = 0;
    int in = sp;
    while (in < end) {
        final int lead = src[in++] & 0xff;
        char decoded = decodeSingle(lead);
        if (decoded != UNMAPPABLE_DECODING) {
            dst[out++] = decoded;
            continue;
        }
        if (in == end) {
            // Lead byte with nothing after it.
            dst[out++] = substitute;
            continue;
        }
        final int trail = src[in++] & 0xff;
        if (trail < b2Min || trail > b2Max) {
            dst[out++] = substitute;
            continue;
        }
        decoded = decodeDouble(lead, trail);
        if (decoded == UNMAPPABLE_DECODING) {
            decoded = decodeDoubleEx(lead, trail);      // supplementary table
            if (decoded != UNMAPPABLE_DECODING) {
                // supplementary character in the U+2xxxx area
                final int codePoint = 0x20000 + decoded;
                dst[out++] = Surrogate.high(codePoint);
                dst[out++] = Surrogate.low(codePoint);
                continue;
            }
            decoded = decodeBig5(lead, trail);          // big5 fallback
            if (decoded == UNMAPPABLE_DECODING) {
                decoded = substitute;
            }
        }
        dst[out++] = decoded;
    }
    return out;
}
public CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
if (src.hasArray() && dst.hasArray())
return decodeArrayLoop(src, dst);
......@@ -322,6 +356,36 @@ public class HKSCS {
return encodeBufferLoop(src, dst);
}
/**
 * Array-based encode: converts {@code len} chars of {@code src}, starting
 * at {@code sp}, into bytes written to {@code dst} from index 0.
 *
 * A char with no direct mapping is, when it forms a well-formed surrogate
 * pair with the following char, encoded through encodeSupp. Anything that
 * still cannot be encoded is replaced with the first one or two bytes of
 * the encoder's replacement (one replacement per surrogate pair). This
 * method performs no capacity check on {@code dst}.
 *
 * @return the number of bytes written to {@code dst}
 */
public int encode(char[] src, int sp, int len, byte[] dst) {
    int dp = 0;
    int sl = sp + len;
    while (sp < sl) {
        char c = src[sp++];
        int bb = encodeChar(c);
        if (bb == UNMAPPABLE_ENCODING) {
            if (Character.isHighSurrogate(c) && sp < sl &&
                Character.isLowSurrogate(src[sp])) {
                // Consume the low surrogate exactly once. Advancing sp a
                // second time would silently drop the char that follows
                // the supplementary character.
                bb = encodeSupp(Character.toCodePoint(c, src[sp]));
                sp++;
            }
            if (bb == UNMAPPABLE_ENCODING) {
                byte[] repl = replacement();
                dst[dp++] = repl[0];
                if (repl.length > 1)
                    dst[dp++] = repl[1];
                continue;
            }
        }
        if (bb > MAX_SINGLEBYTE) {    // DoubleByte
            dst[dp++] = (byte)(bb >> 8);
            dst[dp++] = (byte)bb;
        } else {                      // SingleByte
            dst[dp++] = (byte)bb;
        }
    }
    return dp;
}
static char[] C2B_UNMAPPABLE = new char[0x100];
static {
Arrays.fill(C2B_UNMAPPABLE, (char)UNMAPPABLE_ENCODING);
......
......@@ -75,7 +75,7 @@ public class StrCodingBenchmark {
return nanoss;
}
public static void time(Job ... jobs) throws Throwable {
public static long[] time(Job ... jobs) throws Throwable {
long[] warmup = time0(jobs); // Warm up run
long[] nanoss = time0(jobs); // Real timing run
......@@ -110,6 +110,7 @@ public class StrCodingBenchmark {
// Print out absolute and relative times, calibrated against first job
for (int i = 0; i < jobs.length; i++)
System.out.printf(format, jobs[i].name(), milliss[i], ratios[i]);
return milliss;
}
public static Job[] filter(Pattern filter, Job[] jobs) {
......
/*
* Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
import java.util.*;
import java.nio.*;
import java.nio.charset.*;
import java.util.concurrent.*;
import java.util.regex.Pattern;
/**
 * Benchmark of String.getBytes() / new String(byte[]) for a list of
 * double-byte charsets, selecting the charset both by name and by
 * Charset object, over input lengths 8, 16, ... 2048.
 */
public class StrCodingBenchmarkDB extends StrCodingBenchmark {

    /** Maximum number of encodable BMP chars sampled per charset. */
    private static final int MAX_SAMPLE_CHARS = 0x4000;

    public static void main(String[] args) throws Throwable {
        final int itrs = Integer.getInteger("iterations", 100000);
        // NOTE(review): size, subsize, maxchar, filter and rnd are set up
        // below but are not used anywhere in this class.
        final int size       = Integer.getInteger("size", 2048);
        final int subsize    = Integer.getInteger("subsize", 128);
        final int maxchar    = Integer.getInteger("maxchar", 128);
        final String regex = System.getProperty("filter");
        final Pattern filter = (regex == null) ? null : Pattern.compile(regex);
        final boolean useSecurityManager = Boolean.getBoolean("SecurityManager");
        if (useSecurityManager)
            System.setSecurityManager(new PermissiveSecurityManger());
        final Random rnd = new Random();

        String[] csns = new String[] {
            "Big5",
            "Johab",
            "EUC_CN",
            "EUC_KR",
            "MS932",
            "MS936",
            "MS949",
            "MS950",
            "GBK",
            "Big5_HKSCS",
            "Big5_HKSCS_2001",
            "Big5_Solaris",
            "MS950_HKSCS",
            "MS950_HKSCS_XP",
            "IBM1364",
            "IBM1381",
            "IBM1383",
            "IBM930",
            "IBM933",
            "IBM935",
            "IBM937",
            "IBM939",
            "IBM942",
            "IBM943",
            "IBM948",
            "IBM949",
            "IBM950",
            "IBM970",
        };

        List<long[]> sum = new ArrayList<>();

        for (final String csn : csns) {
            final Charset cs = Charset.forName(csn);

            // Collect up to MAX_SAMPLE_CHARS encodable BMP chars. The bound
            // must be the sample limit, not cps.size(): the list starts
            // empty, so comparing against its size would never enter the
            // loop and the benchmark would only ever code NUL padding.
            List<Integer> cps = new ArrayList<>(MAX_SAMPLE_CHARS);
            CharsetEncoder enc = cs.newEncoder();
            int cp = 0;
            while (cp < 0x10000 && cps.size() < MAX_SAMPLE_CHARS) {
                if (enc.canEncode((char)cp)) {
                    cps.add(cp);
                }
                cp++;
            }
            Collections.shuffle(cps);
            char[] ca = new char[cps.size()];
            for (int i = 0; i < cps.size(); i++)
                ca[i] = (char)(int)cps.get(i);

            System.out.printf("%n--------%s---------%n", csn);
            for (int sz = 8; sz <= 2048; sz *= 2) {
                System.out.printf(" [len=%d]%n", sz);

                // Arrays.copyOf pads with '\0' if fewer than sz chars exist.
                final char[] chars = Arrays.copyOf(ca, sz);
                final String str = new String(chars);
                final byte[] bs = str.getBytes(cs);

                Job[] jobs = {
                    new Job("String decode: csn") {
                    public void work() throws Throwable {
                        for (int i = 0; i < itrs; i++)
                            new String(bs, csn);
                    }},

                    new Job("String decode: cs") {
                    public void work() throws Throwable {
                        for (int i = 0; i < itrs; i++)
                            new String(bs, cs);
                    }},

                    new Job("String encode: csn") {
                    public void work() throws Throwable {
                        for (int i = 0; i < itrs; i++)
                            str.getBytes(csn);
                    }},

                    new Job("String encode: cs") {
                    public void work() throws Throwable {
                        for (int i = 0; i < itrs; i++)
                            str.getBytes(cs);
                    }},
                };
                sum.add(time(jobs));
            }
        }
    }
}
......@@ -24,7 +24,7 @@
*/
/* @test
@bug 6636323 6636319 7040220 7096080
@bug 6636323 6636319 7040220 7096080 7183053
@summary Test if StringCoding and NIO result have the same de/encoding result
* @run main/othervm/timeout=2000 TestStringCoding
*/
......@@ -70,11 +70,62 @@ public class TestStringCoding {
}
test(cs, Arrays.copyOf(bmpCA, clen), Arrays.copyOf(sbBA, blen));
}
testMixed(cs);
System.out.println("done!");
}
}
}
/**
 * Checks that String-based coding and NIO coder-based coding agree for a
 * shuffled string made of every BMP char the charset's encoder reports as
 * encodable.
 *
 * The encode direction compares getBytes(name) and getBytes(Charset) with
 * CharsetEncoder output; the decode direction compares new String(..., name)
 * and new String(..., Charset) with CharsetDecoder output. Both coders are
 * configured to REPLACE malformed and unmappable input.
 *
 * @throws RuntimeException if any of the four comparisons differs
 */
static void testMixed(Charset cs) throws Throwable {
    CharsetDecoder dec = cs.newDecoder()
        .onMalformedInput(CodingErrorAction.REPLACE)
        .onUnmappableCharacter(CodingErrorAction.REPLACE);
    CharsetEncoder enc = cs.newEncoder()
        .onMalformedInput(CodingErrorAction.REPLACE)
        .onUnmappableCharacter(CodingErrorAction.REPLACE);

    // Gather every encodable BMP char, then randomize the order.
    List<Integer> encodable = new ArrayList<>(0x10000);
    for (int cp = 0; cp < 0x10000; cp++) {
        if (enc.canEncode((char)cp)) {
            encodable.add(cp);
        }
    }
    Collections.shuffle(encodable);

    char[] bmpCA = new char[encodable.size()];
    int idx = 0;
    for (Integer value : encodable) {
        bmpCA[idx++] = (char)value.intValue();
    }
    String bmpStr = new String(bmpCA);

    // getBytes(csn) versus the NIO encoder
    byte[] bmpBA = bmpStr.getBytes(cs.name());
    ByteBuffer encoded = enc.reset().encode(CharBuffer.wrap(bmpCA));
    byte[] baNIO = new byte[encoded.limit()];
    encoded.get(baNIO, 0, baNIO.length);
    if (!Arrays.equals(bmpBA, baNIO)) {
        throw new RuntimeException("getBytes(csn) failed -> " + cs.name());
    }

    // getBytes(cs) versus the NIO encoder
    bmpBA = bmpStr.getBytes(cs);
    if (!Arrays.equals(bmpBA, baNIO)) {
        throw new RuntimeException("getBytes(cs) failed -> " + cs.name());
    }

    // new String(csn) versus the NIO decoder
    String strSC = new String(bmpBA, cs.name());
    String strNIO = dec.reset().decode(ByteBuffer.wrap(bmpBA)).toString();
    if (!strNIO.equals(strSC)) {
        throw new RuntimeException("new String(csn) failed -> " + cs.name());
    }

    // new String(cs) versus the NIO decoder
    strSC = new String(bmpBA, cs);
    if (!strNIO.equals(strSC)) {
        throw new RuntimeException("new String(cs) failed -> " + cs.name());
    }
}
static void test(Charset cs, char[] bmpCA, byte[] sbBA) throws Throwable {
String bmpStr = new String(bmpCA);
CharsetDecoder dec = cs.newDecoder()
......@@ -100,6 +151,7 @@ public class TestStringCoding {
//new String(csn);
String strSC = new String(sbBA, cs.name());
String strNIO = dec.reset().decode(ByteBuffer.wrap(sbBA)).toString();
if(!strNIO.equals(strSC))
throw new RuntimeException("new String(csn) failed -> " + cs.name());
......@@ -112,7 +164,7 @@ public class TestStringCoding {
if (enc instanceof sun.nio.cs.ArrayEncoder &&
cs.contains(Charset.forName("ASCII"))) {
if (cs.name().equals("UTF-8") || // utf8 handles surrogates
cs.name().equals("CESU-8")) // utf8 handles surrogates
cs.name().equals("CESU-8")) // utf8 handles surrogates
return;
enc.replaceWith(new byte[] { (byte)'A'});
sun.nio.cs.ArrayEncoder cae = (sun.nio.cs.ArrayEncoder)enc;
......@@ -137,12 +189,16 @@ public class TestStringCoding {
cs.name())))
throw new RuntimeException("encode3(surrogates) failed -> "
+ cs.name());
/* sun.nio.cs.ArrayDecoder/ArrayEncoder work on the assumption that the
invoker (StringCoding) allocates a large enough output buffer; the UTF-8
and double-byte coders do not check the output buffer limit.
ba = new byte[str.length() - 1];
n = cae.encode(str.toCharArray(), 0, str.length(), ba);
if (n != 7 || !"abABABc".equals(new String(ba, 0, n,
cs.name())))
if (n != 7 || !"abABABc".equals(new String(ba, 0, n, cs.name()))) {
throw new RuntimeException("encode4(surrogates) failed -> "
+ cs.name());
}
*/
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册