提交 cfc21928 编写于 作者: M martin

6934265: Add public method Character.isBmpCodePoint

Summary: Move isBmpCodePoint from sun.nio.cs.Surrogate to Character
Reviewed-by: sherman
Contributed-by: Ulf Zibis <ulf.zibis@gmx.de>
上级 8685a9ca
......@@ -721,19 +721,18 @@ abstract class AbstractStringBuilder implements Appendable, CharSequence {
* {@code codePoint} isn't a valid Unicode code point
*/
public AbstractStringBuilder appendCodePoint(int codePoint) {
if (!Character.isValidCodePoint(codePoint)) {
throw new IllegalArgumentException();
}
int n = 1;
if (codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT) {
n++;
}
ensureCapacityInternal(count + n);
if (n == 1) {
value[count++] = (char) codePoint;
} else {
final int count = this.count;
if (Character.isBmpCodePoint(codePoint)) {
ensureCapacityInternal(count + 1);
value[count] = (char) codePoint;
this.count = count + 1;
} else if (Character.isValidCodePoint(codePoint)) {
ensureCapacityInternal(count + 2);
Character.toSurrogates(codePoint, value, count);
count += n;
this.count = count + 2;
} else {
throw new IllegalArgumentException();
}
return this;
}
......
......@@ -67,17 +67,16 @@ import java.util.Locale;
* definition</i></a> of the U+<i>n</i> notation in the Unicode
* standard.)
*
* <p>The set of characters from U+0000 to U+FFFF is sometimes
* referred to as the <em>Basic Multilingual Plane (BMP)</em>. <a
* name="supplementary">Characters</a> whose code points are greater
* <p><a name="BMP">The set of characters from U+0000 to U+FFFF</a> is
* sometimes referred to as the <em>Basic Multilingual Plane (BMP)</em>.
* <a name="supplementary">Characters</a> whose code points are greater
* than U+FFFF are called <em>supplementary character</em>s. The Java
* 2 platform uses the UTF-16 representation in <code>char</code>
* arrays and in the <code>String</code> and <code>StringBuffer</code>
* classes. In this representation, supplementary characters are
* represented as a pair of <code>char</code> values, the first from
* the <em>high-surrogates</em> range, (&#92;uD800-&#92;uDBFF), the
* second from the <em>low-surrogates</em> range
* (&#92;uDC00-&#92;uDFFF).
* platform uses the UTF-16 representation in <code>char</code> arrays and
* in the <code>String</code> and <code>StringBuffer</code> classes. In
* this representation, supplementary characters are represented as a pair
* of <code>char</code> values, the first from the <em>high-surrogates</em>
* range, (&#92;uD800-&#92;uDBFF), the second from the
* <em>low-surrogates</em> range (&#92;uDC00-&#92;uDFFF).
*
* <p>A <code>char</code> value, therefore, represents Basic
* Multilingual Plane (BMP) code points, including the surrogate
......@@ -3922,6 +3921,25 @@ class Character extends Object implements java.io.Serializable, Comparable<Chara
return plane < ((MAX_CODE_POINT + 1) >>> 16);
}
/**
 * Tells whether the given character (Unicode code point) lies within the
 * <a href="#BMP">Basic Multilingual Plane (BMP)</a>, that is, whether it
 * can be represented using a single {@code char}.
 *
 * @param  codePoint the character (Unicode code point) to be tested
 * @return {@code true} if the specified code point is between
 *         {@link #MIN_VALUE} and {@link #MAX_VALUE} inclusive;
 *         {@code false} otherwise.
 * @since  1.7
 */
public static boolean isBmpCodePoint(int codePoint) {
    // Same result as: codePoint >= MIN_VALUE && codePoint <= MAX_VALUE.
    // The logical shift (>>>) brings the sign bit down into the tested
    // bits, so negative inputs give a non-zero plane and are rejected.
    // Logical shift is used consistently to facilitate additional
    // runtime optimizations.
    final int plane = codePoint >>> 16;
    return plane == 0;
}
/**
* Determines whether the specified character (Unicode code point)
* is in the <a href="#supplementary">supplementary character</a> range.
......@@ -4319,15 +4337,15 @@ class Character extends Object implements java.io.Serializable, Comparable<Chara
* @since 1.5
*/
public static int toChars(int codePoint, char[] dst, int dstIndex) {
if (codePoint < 0 || codePoint > MAX_CODE_POINT) {
throw new IllegalArgumentException();
}
if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) {
if (isBmpCodePoint(codePoint)) {
dst[dstIndex] = (char) codePoint;
return 1;
} else if (isValidCodePoint(codePoint)) {
toSurrogates(codePoint, dst, dstIndex);
return 2;
} else {
throw new IllegalArgumentException();
}
toSurrogates(codePoint, dst, dstIndex);
return 2;
}
/**
......@@ -4347,15 +4365,15 @@ class Character extends Object implements java.io.Serializable, Comparable<Chara
* @since 1.5
*/
public static char[] toChars(int codePoint) {
if (codePoint < 0 || codePoint > MAX_CODE_POINT) {
if (isBmpCodePoint(codePoint)) {
return new char[] { (char) codePoint };
} else if (isValidCodePoint(codePoint)) {
char[] result = new char[2];
toSurrogates(codePoint, result, 0);
return result;
} else {
throw new IllegalArgumentException();
}
if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) {
return new char[] { (char) codePoint };
}
char[] result = new char[2];
toSurrogates(codePoint, result, 0);
return result;
}
static void toSurrogates(int codePoint, char[] dst, int index) {
......@@ -6259,8 +6277,7 @@ class Character extends Object implements java.io.Serializable, Comparable<Chara
*/
static char[] toUpperCaseCharArray(int codePoint) {
// As of Unicode 4.0, 1:M uppercasings only happen in the BMP.
assert isValidCodePoint(codePoint) &&
!isSupplementaryCodePoint(codePoint);
assert isBmpCodePoint(codePoint);
return CharacterData.of(codePoint).toUpperCaseCharArray(codePoint);
}
......
......@@ -99,6 +99,8 @@ import java.util.regex.PatternSyntaxException;
*
* @author Lee Boynton
* @author Arthur van Hoff
* @author Martin Buchholz
* @author Ulf Zibis
* @see java.lang.Object#toString()
* @see java.lang.StringBuffer
* @see java.lang.StringBuilder
......@@ -273,32 +275,32 @@ public final class String
throw new StringIndexOutOfBoundsException(offset + count);
}
final int end = offset + count;
// Pass 1: Compute precise size of char[]
int n = 0;
for (int i = offset; i < offset + count; i++) {
int n = count;
for (int i = offset; i < end; i++) {
int c = codePoints[i];
if (c >= Character.MIN_CODE_POINT &&
c < Character.MIN_SUPPLEMENTARY_CODE_POINT)
n += 1;
else if (Character.isSupplementaryCodePoint(c))
n += 2;
if (Character.isBmpCodePoint(c))
continue;
else if (Character.isValidCodePoint(c))
n++;
else throw new IllegalArgumentException(Integer.toString(c));
}
// Pass 2: Allocate and fill in char[]
char[] v = new char[n];
for (int i = offset, j = 0; i < offset + count; i++) {
final char[] v = new char[n];
for (int i = offset, j = 0; i < end; i++, j++) {
int c = codePoints[i];
if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
v[j++] = (char) c;
} else {
Character.toSurrogates(c, v, j);
j += 2;
}
if (Character.isBmpCodePoint(c))
v[j] = (char) c;
else
Character.toSurrogates(c, v, j++);
}
this.value = v;
this.count = v.length;
this.count = n;
this.offset = 0;
}
......
......@@ -24,7 +24,6 @@
*/
package sun.io;
import sun.nio.cs.Surrogate;
import sun.nio.cs.ext.DoubleByte;
import static sun.nio.cs.CharsetMapping.*;
......
......@@ -24,7 +24,6 @@
*/
package sun.io;
import sun.nio.cs.Surrogate;
import sun.nio.cs.ext.DoubleByte;
import static sun.nio.cs.CharsetMapping.*;
......
/*
* Copyright (c) 2000, 2001, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -34,8 +34,9 @@ import java.nio.charset.UnmappableCharacterException;
* Utility class for dealing with surrogates.
*
* @author Mark Reinhold
* @author Martin Buchholz
* @author Ulf Zibis
*/
public class Surrogate {
private Surrogate() { }
......@@ -74,17 +75,10 @@ public class Surrogate {
return (MIN <= c) && (c <= MAX);
}
/**
* Tells whether or not the given UCS-4 character is in the Basic
* Multilingual Plane, and can be represented using a single char.
*/
public static boolean isBMPCodePoint(int uc) {
return uc >> 16 == 0;
}
/**
* Tells whether or not the given UCS-4 character must be represented as a
* surrogate pair in UTF-16.
* Use of {@link Character#isSupplementaryCodePoint} is generally preferred.
*/
public static boolean neededFor(int uc) {
return Character.isSupplementaryCodePoint(uc);
......@@ -110,6 +104,7 @@ public class Surrogate {
/**
* Converts the given surrogate pair into a 32-bit UCS-4 character.
* Use of {@link Character#toCodePoint} is generally preferred.
*/
public static int toUCS4(char c, char d) {
assert Character.isHighSurrogate(c) && Character.isLowSurrogate(d);
......@@ -290,8 +285,9 @@ public class Surrogate {
* error() will return a descriptive result object
*/
public int generate(int uc, int len, CharBuffer dst) {
if (Surrogate.isBMPCodePoint(uc)) {
if (Surrogate.is(uc)) {
if (Character.isBmpCodePoint(uc)) {
char c = (char) uc;
if (Character.isSurrogate(c)) {
error = CoderResult.malformedForLength(len);
return -1;
}
......@@ -299,10 +295,10 @@ public class Surrogate {
error = CoderResult.OVERFLOW;
return -1;
}
dst.put((char)uc);
dst.put(c);
error = null;
return 1;
} else if (Character.isSupplementaryCodePoint(uc)) {
} else if (Character.isValidCodePoint(uc)) {
if (dst.remaining() < 2) {
error = CoderResult.OVERFLOW;
return -1;
......@@ -334,8 +330,9 @@ public class Surrogate {
* error() will return a descriptive result object
*/
public int generate(int uc, int len, char[] da, int dp, int dl) {
if (Surrogate.isBMPCodePoint(uc)) {
if (Surrogate.is(uc)) {
if (Character.isBmpCodePoint(uc)) {
char c = (char) uc;
if (Character.isSurrogate(c)) {
error = CoderResult.malformedForLength(len);
return -1;
}
......@@ -343,10 +340,10 @@ public class Surrogate {
error = CoderResult.OVERFLOW;
return -1;
}
da[dp] = (char)uc;
da[dp] = c;
error = null;
return 1;
} else if (Character.isSupplementaryCodePoint(uc)) {
} else if (Character.isValidCodePoint(uc)) {
if (dl - dp < 2) {
error = CoderResult.OVERFLOW;
return -1;
......
......@@ -86,22 +86,21 @@ class UTF_32Coder {
src.position(mark);
}
}
while (src.remaining() > 3) {
while (src.remaining() >= 4) {
cp = getCP(src);
if (cp < 0 || cp > Surrogate.UCS4_MAX) {
return CoderResult.malformedForLength(4);
}
if (cp < Surrogate.UCS4_MIN) {
if (Character.isBmpCodePoint(cp)) {
if (!dst.hasRemaining())
return CoderResult.OVERFLOW;
mark += 4;
dst.put((char)cp);
} else {
dst.put((char) cp);
} else if (Character.isValidCodePoint(cp)) {
if (dst.remaining() < 2)
return CoderResult.OVERFLOW;
mark += 4;
dst.put(Surrogate.high(cp));
dst.put(Surrogate.low(cp));
} else {
return CoderResult.malformedForLength(4);
}
}
return CoderResult.UNDERFLOW;
......@@ -154,7 +153,12 @@ class UTF_32Coder {
try {
while (src.hasRemaining()) {
char c = src.get();
if (Character.isHighSurrogate(c)) {
if (!Character.isSurrogate(c)) {
if (dst.remaining() < 4)
return CoderResult.OVERFLOW;
mark++;
put(c, dst);
} else if (Character.isHighSurrogate(c)) {
if (!src.hasRemaining())
return CoderResult.UNDERFLOW;
char low = src.get();
......@@ -162,17 +166,13 @@ class UTF_32Coder {
if (dst.remaining() < 4)
return CoderResult.OVERFLOW;
mark += 2;
put(Surrogate.toUCS4(c, low), dst);
put(Character.toCodePoint(c, low), dst);
} else {
return CoderResult.malformedForLength(1);
}
} else if (Character.isLowSurrogate(c)) {
return CoderResult.malformedForLength(1);
} else {
if (dst.remaining() < 4)
return CoderResult.OVERFLOW;
mark++;
put(c, dst);
// assert Character.isLowSurrogate(c);
return CoderResult.malformedForLength(1);
}
}
return CoderResult.UNDERFLOW;
......
......@@ -102,7 +102,7 @@ class UTF_8 extends Unicode
// [F1..F3] [80..BF] [80..BF] [80..BF]
// [F4] [80..8F] [80..BF] [80..BF]
// only check 80-be range here, the [0xf0,0x80...] and [0xf4,0x90-...]
// will be checked by Surrogate.neededFor(uc)
// will be checked by Character.isSupplementaryCodePoint(uc)
private static boolean isMalformed4(int b2, int b3, int b4) {
return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
(b4 & 0xc0) != 0x80;
......@@ -248,7 +248,8 @@ class UTF_8 extends Unicode
((b3 & 0x3f) << 06) |
(b4 & 0x3f);
if (isMalformed4(b2, b3, b4) ||
!Surrogate.neededFor(uc)) {
// shortest form check
!Character.isSupplementaryCodePoint(uc)) {
return malformed(src, sp, dst, dp, 4);
}
da[dp++] = Surrogate.high(uc);
......@@ -304,7 +305,8 @@ class UTF_8 extends Unicode
((b3 & 0x3f) << 06) |
(b4 & 0x3f);
if (isMalformed4(b2, b3, b4) ||
!Surrogate.neededFor(uc)) { // shortest form check
// shortest form check
!Character.isSupplementaryCodePoint(uc)) {
return malformed(src, mark, 4);
}
dst.put(Surrogate.high(uc));
......
......@@ -441,7 +441,7 @@ public class EUC_TW extends Charset implements HistoricallyNamedCharset
}
static int encode(char hi, char low, byte[] bb) {
int c = Surrogate.toUCS4(hi, low);
int c = Character.toCodePoint(hi, low);
if ((c & 0xf0000) != 0x20000)
return -1;
c -= 0x20000;
......
......@@ -12628,7 +12628,7 @@ public class GB18030
if (Character.isSurrogate(c)) {
if ((condensedKey=sgp.parse(c, sa, sp, sl)) < 0)
return sgp.error();
// Surogate.toUCS4 looks like
// Character.toCodePoint looks like
// (((high & 0x3ff) << 10) | (low & 0x3ff)) + 0x10000;
// so we add (0x2e248 - 0x10000) to get the "key".
condensedKey += 0x1E248;
......
......@@ -36,7 +36,6 @@ import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import sun.nio.cs.HistoricallyNamedCharset;
import sun.nio.cs.Surrogate;
public class IBM33722
extends Charset
......
......@@ -36,7 +36,6 @@ import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import sun.nio.cs.HistoricallyNamedCharset;
import sun.nio.cs.Surrogate;
public class IBM964
extends Charset
......
......@@ -46,7 +46,7 @@ public class BashStreams {
CharacterGenerator(long seed, String csn, int limit) {
rand = new Random(seed);
this.max = Surrogate.UCS4_MAX + 1;
this.max = Character.MAX_CODE_POINT + 1;
this.limit = limit;
}
......@@ -77,17 +77,20 @@ public class BashStreams {
int c;
for (;;) {
c = rand.nextInt(max);
if (Surrogate.is(c) || (c == 0xfffe) || (c == 0xffff))
if ((Character.isBmpCodePoint(c)
&& (Character.isSurrogate((char) c)
|| (c == 0xfffe) || (c == 0xffff))))
continue;
if (Surrogate.neededFor(c) && (count == limit - 1))
if (Character.isSupplementaryCodePoint(c)
&& (count == limit - 1))
continue;
break;
}
count++;
if (Surrogate.neededFor(c)) {
if (Character.isSupplementaryCodePoint(c)) {
count++;
push(Surrogate.low(c));
return Surrogate.high(c);
push(sun.nio.cs.Surrogate.low(c));
return sun.nio.cs.Surrogate.high(c);
}
return (char)c;
}
......@@ -137,7 +140,7 @@ public class BashStreams {
char d = cg.next();
if (c != d) {
if (c == '?') {
if (Surrogate.isHigh(d))
if (Character.isHighSurrogate(d))
cg.next();
continue;
}
......@@ -187,7 +190,7 @@ public class BashStreams {
w.write(ca, 0, n);
count += n;
}
if (Surrogate.isHigh(ca[n - 1]))
if (Character.isHighSurrogate(ca[n - 1]))
w.write(cg.next());
w.close();
}
......@@ -253,7 +256,8 @@ public class BashStreams {
if (!cg.hasNext())
break;
char c = cg.next();
if (Surrogate.isHigh(c) && (cb.remaining() == 1)) {
if (Character.isHighSurrogate(c)
&& cb.remaining() == 1) {
cg.push(c);
break;
}
......@@ -311,7 +315,7 @@ public class BashStreams {
mismatchedEOF(csn, count + i, cg.count());
char d = cg.next();
if (c == '?') {
if (Surrogate.isHigh(d)) {
if (Character.isHighSurrogate(d)) {
cg.next();
continue;
}
......
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
 * Stand-alone helpers for working with UTF-16 surrogate code units and the
 * UCS-4 values that are encoded as surrogate pairs.
 */
public class Surrogate {

    /** Smallest UCS-4 value that is encoded as a surrogate pair. */
    public static final int UCS4_SURROGATE_MIN = 0x10000;

    /** Largest UCS-4 value that is encoded as a surrogate pair (0x10FFFF). */
    public static final int UCS4_MAX = UCS4_SURROGATE_MIN + (1 << 20) - 1;

    // UTF-16 surrogate-character ranges
    //
    public static final char MIN_HIGH = '\uD800';
    public static final char MAX_HIGH = '\uDBFF';
    public static final char MIN_LOW = '\uDC00';
    public static final char MAX_LOW = '\uDFFF';
    public static final char MIN = MIN_HIGH;
    public static final char MAX = MAX_LOW;

    /**
     * Tells whether the given UCS-4 value lies in
     * [{@link #UCS4_SURROGATE_MIN}, {@link #UCS4_MAX}].
     */
    public static boolean neededFor(int uc) {
        if (uc < UCS4_SURROGATE_MIN) {
            return false;
        }
        return uc <= UCS4_MAX;
    }

    /** Tells whether the given value lies in the high-surrogate range. */
    public static boolean isHigh(int c) {
        return c >= MIN_HIGH && c <= MAX_HIGH;
    }

    /**
     * Returns the high surrogate built from bits 10..19 of the offset of
     * {@code uc} above {@link #UCS4_SURROGATE_MIN}.
     */
    static char high(int uc) {
        final int offset = uc - UCS4_SURROGATE_MIN;
        final int topTenBits = (offset >> 10) & 0x3ff;
        return (char) (0xd800 | topTenBits);
    }

    /** Tells whether the given value lies in the low-surrogate range. */
    public static boolean isLow(int c) {
        return c >= MIN_LOW && c <= MAX_LOW;
    }

    /**
     * Returns the low surrogate built from bits 0..9 of the offset of
     * {@code uc} above {@link #UCS4_SURROGATE_MIN}.
     */
    static char low(int uc) {
        final int offset = uc - UCS4_SURROGATE_MIN;
        final int bottomTenBits = offset & 0x3ff;
        return (char) (0xdc00 | bottomTenBits);
    }

    /** Tells whether the given value lies anywhere in the surrogate range. */
    public static boolean is(int c) {
        return c >= MIN && c <= MAX;
    }

    /**
     * Combines the low ten bits of {@code c} and of {@code d} into a 20-bit
     * offset and adds 0x10000. The arguments are not validated.
     */
    static int toUCS4(char c, char d) {
        final int upper = (c & 0x3ff) << 10;
        final int lower = d & 0x3ff;
        return (upper | lower) + 0x10000;
    }
}
......@@ -42,9 +42,8 @@ public class Surrogates {
static void initData() throws IOException {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < LEN; i++) {
int c = Surrogate.UCS4_SURROGATE_MIN + 1;
sb.append(Surrogate.high(c));
sb.append(Surrogate.low(c));
int c = Character.MIN_SUPPLEMENTARY_CODE_POINT + 1;
sb.append(Character.toChars(c));
}
input = sb.toString().toCharArray();
ByteArrayOutputStream bos = new ByteArrayOutputStream();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册