/* * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Sun designates this * particular file as subject to the "Classpath" exception as provided * by Sun in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, * CA 95054 USA or visit www.sun.com if you need additional information or * have any questions. */ /* ******************************************************************************* * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * * * * The original version of this source code and documentation is copyrighted * * and owned by IBM, These materials are provided under terms of a License * * Agreement between IBM and Sun. This technology is protected by multiple * * US and International patents. This notice and attribution to IBM may not * * to removed. * ******************************************************************************* */ package sun.text.normalizer; import java.lang.ref.SoftReference; import java.util.HashMap; import java.util.Locale; import java.util.Map; /** *
* The UCharacter class provides extensions to the * * java.lang.Character class. These extensions provide support for * Unicode 3.2 properties and together with the UTF16 * class, provide support for supplementary characters (those with code * points above U+FFFF). *
** Code points are represented in these API using ints. While it would be * more convenient in Java to have a separate primitive datatype for them, * ints suffice in the meantime. *
*
* To use this class please add the jar file name icu4j.jar to the
* class path, since it contains data files which supply the information used
* by this file.
* E.g. In Windows
* set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/icu4j.jar.
* Otherwise, another method would be to copy the files uprops.dat and
* unames.icu from the icu4j source subdirectory
* $ICU4J_SRC/src/com.ibm.icu.impl.data to your class directory
* $ICU4J_CLASS/com.ibm.icu.impl.data.
*
* Aside from the additions for UTF-16 support, and the updated Unicode 3.1 * properties, the main differences between UCharacter and Character are: *
* Further detail differences can be determined from the program * * com.ibm.icu.dev.test.lang.UCharacterCompare *
** This class is not subclassable *
* @author Syn Wee Quek * @stable ICU 2.1 * @see com.ibm.icu.lang.UCharacterEnums */
public final class UCharacter
{
    /**
     * Numeric Type constants.
     * getUnicodeNumericValue only looks up a numeric value when the type
     * extracted from a property word lies strictly between NONE and COUNT.
     * @see UProperty#NUMERIC_TYPE
     * @stable ICU 2.4
     */
    public static interface NumericType
    {
        /**
         * No numeric type (value 0). getUnicodeNumericValue returns
         * NO_NUMERIC_VALUE for code points of this type.
         * @stable ICU 2.4
         */
        public static final int NONE = 0;
        /**
         * Decimal numeric type (value 1).
         * NOTE(review): presumably the Unicode Numeric_Type value "Decimal";
         * confirm against UProperty#NUMERIC_TYPE.
         * @stable ICU 2.4
         */
        public static final int DECIMAL = 1;
        /**
         * Digit numeric type (value 2).
         * NOTE(review): presumably the Unicode Numeric_Type value "Digit";
         * confirm against UProperty#NUMERIC_TYPE.
         * @stable ICU 2.4
         */
        public static final int DIGIT = 2;
        /**
         * Numeric numeric type (value 3).
         * NOTE(review): presumably the Unicode Numeric_Type value "Numeric";
         * confirm against UProperty#NUMERIC_TYPE.
         * @stable ICU 2.4
         */
        public static final int NUMERIC = 3;
        /**
         * Number of numeric type constants; one greater than the largest
         * value above. getUnicodeNumericValue uses it as an exclusive upper
         * bound.
         * @stable ICU 2.4
         */
        public static final int COUNT = 4;
    }

    /**
     * Hangul Syllable Type constants.
     * The bracketed tag in the trailing comment of each constant is the short
     * label carried by the original source.
     *
     * @see UProperty#HANGUL_SYLLABLE_TYPE
     * @stable ICU 2.6
     */
    public static interface HangulSyllableType
    {
        /**
         * Not applicable (value 0, tag NA).
         * @stable ICU 2.6
         */
        public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/
        /**
         * Leading jamo (value 1, tag L).
         * @stable ICU 2.6
         */
        public static final int LEADING_JAMO = 1; /*[L]*/
        /**
         * Vowel jamo (value 2, tag V).
         * @stable ICU 2.6
         */
        public static final int VOWEL_JAMO = 2; /*[V]*/
        /**
         * Trailing jamo (value 3, tag T).
         * @stable ICU 2.6
         */
        public static final int TRAILING_JAMO = 3; /*[T]*/
        /**
         * LV syllable (value 4, tag LV).
         * @stable ICU 2.6
         */
        public static final int LV_SYLLABLE = 4; /*[LV]*/
        /**
         * LVT syllable (value 5, tag LVT).
         * @stable ICU 2.6
         */
        public static final int LVT_SYLLABLE = 5; /*[LVT]*/
        /**
         * Number of Hangul syllable type constants; one greater than the
         * largest value above.
         * @stable ICU 2.6
         */
        public static final int COUNT = 6;
    }

/** * [Sun] This interface moved from UCharacterEnums.java. * * 'Enum' for the CharacterCategory constants. These constants are * compatible in name but not in value with those defined in *java.lang.Character.
* @see UCharacterCategory
* @draft ICU 3.0
* @deprecated This is a draft API and might change in a future release of ICU.
*/
public static interface ECharacterCategory
{
    // Interface fields are implicitly public, static and final, so the
    // modifiers are omitted here.

    /**
     * Character type Lu (uppercase letter).
     * @stable ICU 2.1
     */
    int UPPERCASE_LETTER = 1;

    /**
     * Character type Lt (titlecase letter).
     * @stable ICU 2.1
     */
    int TITLECASE_LETTER = 3;

    /**
     * Character type Lo (other letter).
     * @stable ICU 2.1
     */
    int OTHER_LETTER = 5;
}
// public data members -----------------------------------------------
/**
 * The lowest Unicode code point value.
 * Taken from UTF16.CODEPOINT_MIN_VALUE; getAge(int) rejects any code point
 * below this value with an IllegalArgumentException.
 * @stable ICU 2.1
 */
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
/**
* The highest Unicode code point value (scalar value) according to the
* Unicode Standard.
* This is a 21-bit value (21 bits, rounded up).java.lang.Character.digit(). Note that this
* will return positive values for code points for which isDigit
* returns false, just like java.lang.Character.
* Get the numeric value for a Unicode code point as defined in the * Unicode Character Database.
*A "double" return type is necessary because some numeric values are * fractions, negative, or too large for int.
*For characters without any numeric values in the Unicode Character * Database, this function will return NO_NUMERIC_VALUE.
*API Change: In release 2.2 and prior, this API had a
 * return type int and returned -1 when the argument ch did not have a
 * corresponding numeric value. This has been changed to sync with ICU4C.
 *
* This corresponds to the ICU4C function u_getNumericValue. * @param ch Code point to get the numeric value for. * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined. * @stable ICU 2.4 */ public static double getUnicodeNumericValue(int ch) { // equivalent to c version double u_getNumericValue(UChar32 c) int props = PROPERTY_.getProperty(ch); int numericType = getNumericType(props); if (numericType > NumericType.NONE && numericType < NumericType.COUNT) { if (isNotExceptionIndicator(props)) { return UCharacterProperty.getSignedValue(props); } else { int index = UCharacterProperty.getExceptionIndex(props); boolean nex = false; boolean dex = false; double numerator = 0; if (PROPERTY_.hasExceptionValue(index, UCharacterProperty.EXC_NUMERIC_VALUE_)) { int num = PROPERTY_.getException(index, UCharacterProperty.EXC_NUMERIC_VALUE_); // There are special values for huge numbers that are // powers of ten. genprops/store.c documents: // if numericValue = 0x7fffff00 + x then // numericValue = 10 ^ x if (num >= NUMERATOR_POWER_LIMIT_) { num &= 0xff; // 10^x without math.h numerator = Math.pow(10, num); } else { numerator = num; } nex = true; } double denominator = 0; if (PROPERTY_.hasExceptionValue(index, UCharacterProperty.EXC_DENOMINATOR_VALUE_)) { denominator = PROPERTY_.getException(index, UCharacterProperty.EXC_DENOMINATOR_VALUE_); // faster path not in c if (numerator != 0) { return numerator / denominator; } dex = true; } if (nex) { if (dex) { return numerator / denominator; } return numerator; } if (dex) { return 1 / denominator; } } } return NO_NUMERIC_VALUE; } /** * Returns a value indicating a code point's Unicode category. * Up-to-date Unicode implementation of java.lang.Character.getType() * except for the above mentioned code points that had their category * changed.Get the "age" of the code point.
*The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a * character. *
This can be useful to avoid emitting code points to receiving * processes that do not accept newer characters.
*The data is from the UCD file DerivedAge.txt.
* @param ch The code point.
 * @return the Unicode version number
 * @throws IllegalArgumentException if ch is below MIN_VALUE or above
 *         MAX_VALUE
 * @stable ICU 2.6
 */
public static VersionInfo getAge(int ch)
{
    // Validate the argument before touching the property data.
    final boolean inRange = ch >= MIN_VALUE && ch <= MAX_VALUE;
    if (!inRange) {
        throw new IllegalArgumentException("Codepoint out of bounds");
    }
    return PROPERTY_.getAge(ch);
}

/**
 *Gets the property value for an Unicode property type of a code point.
 * Also returns binary and mask property values.
*Unicode, especially in version 3.2, defines many more properties than * the original set in UnicodeData.txt.
*The properties APIs are intended to reflect Unicode properties as * defined in the Unicode Character Database (UCD) and Unicode Technical * Reports (UTR). For details about the properties see * http://www.unicode.org/.
*For names of Unicode properties see the UCD file PropertyAliases.txt. *
*
* Sample usage:
* int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
* int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
* boolean b = (ideo == 1) ? true : false;
*
* @param ch code point to test.
* @param type UProperty selector constant, identifies which binary
* property to check. Must be
* UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or
* UProperty.INT_START <= type < UProperty.INT_LIMIT or
* UProperty.MASK_START <= type < UProperty.MASK_LIMIT.
* @return numeric value that is directly the property value or,
* for enumerated properties, corresponds to the numeric value of
* the enumerated constant of the respective property value
* enumeration type (cast to enum type if necessary).
* Returns 0 or 1 (for false / true) for binary Unicode properties.
* Returns a bit-mask for mask properties.
* Returns 0 if 'type' is out of bounds or if the Unicode version
* does not have data for the property at all, or not for this code
* point.
* @see UProperty
* @see #hasBinaryProperty
* @see #getIntPropertyMinValue
* @see #getIntPropertyMaxValue
* @see #getUnicodeVersion
* @stable ICU 2.4
*/
public static int getIntPropertyValue(int ch, int type)
{
/*
* For Normalizer with Unicode 3.2, this method is called only for
* HANGUL_SYLLABLE_TYPE in UnicodeSet.addPropertyStarts().
*/
if (type == UProperty.HANGUL_SYLLABLE_TYPE) {
/* purely algorithmic; hardcode known characters, check for assigned new ones */
if(chc version genprops/store.c documents: * if numericValue = 0x7fffff00 + x then numericValue = 10 ^ x
*/
// Stored numeric values at or above this limit are decoded by
// getUnicodeNumericValue as 10 ^ (value & 0xff).
private static final int NUMERATOR_POWER_LIMIT_ = 0x7fffff00;
/**
 * Integer properties mask for joining type; selects bits 11-13.
 * Equivalent to icu4c UPROPS_JT_MASK.
 */
private static final int JOINING_TYPE_MASK_ = 0x00003800;
/**
 * Integer properties shift for joining type; the position of the lowest
 * bit of JOINING_TYPE_MASK_.
 * Equivalent to icu4c UPROPS_JT_SHIFT.
 */
private static final int JOINING_TYPE_SHIFT_ = 11;
/**
 * Integer properties mask for joining group; selects bits 5-10.
 * Equivalent to icu4c UPROPS_JG_MASK.
 */
private static final int JOINING_GROUP_MASK_ = 0x000007e0;
/**
 * Integer properties shift for joining group; the position of the lowest
 * bit of JOINING_GROUP_MASK_.
 * Equivalent to icu4c UPROPS_JG_SHIFT.
 */
private static final int JOINING_GROUP_SHIFT_ = 5;
/**
 * Integer properties mask for decomposition type; selects bits 0-4, so
 * the masked value starts at bit 0.
 * Equivalent to icu4c UPROPS_DT_MASK.
 */
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
/**
 * Integer properties mask for East Asian cell width; selects bits 15-17.
 * Equivalent to icu4c UPROPS_EA_MASK
 */
private static final int EAST_ASIAN_MASK_ = 0x00038000;
/**
 * Integer properties shift for East Asian cell width; the position of the
 * lowest bit of EAST_ASIAN_MASK_.
 * Equivalent to icu4c UPROPS_EA_SHIFT
 */
private static final int EAST_ASIAN_SHIFT_ = 15;
/**
 * Integer properties mask for line breaks; selects bits 18-22.
 * Equivalent to icu4c UPROPS_LB_MASK
 */
private static final int LINE_BREAK_MASK_ = 0x007C0000;
/**
 * Integer properties shift for line breaks; the position of the lowest
 * bit of LINE_BREAK_MASK_.
 * Equivalent to icu4c UPROPS_LB_SHIFT
 */
private static final int LINE_BREAK_SHIFT_ = 18;
/**
 * Integer properties mask for blocks; selects bits 7-14.
 * Equivalent to icu4c UPROPS_BLOCK_MASK
 */
private static final int BLOCK_MASK_ = 0x00007f80;
/**
 * Integer properties shift for blocks; the position of the lowest bit of
 * BLOCK_MASK_.
 * Equivalent to icu4c UPROPS_BLOCK_SHIFT
 */
private static final int BLOCK_SHIFT_ = 7;
/** * Integer properties mask and shift values for scripts.
* Equivalent to icu4c UPROPS_SHIFT_MASK */ private static final int SCRIPT_MASK_ = 0x0000007f; // private constructor ----------------------------------------------- ///CLOVER:OFF /** * Private constructor to prevent instantiation */ private UCharacter() { } ///CLOVER:ON // private methods --------------------------------------------------- /** * Getting the digit values of characters like 'A' - 'Z', normal, * half-width and full-width. This method assumes that the other digit * characters are checked by the calling method. * @param ch character to test * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise * its corresponding digit will be returned. */ private static int getEuropeanDigit(int ch) { if ((ch > 0x7a && ch < 0xff21) || ch < 0x41 || (ch > 0x5a && ch < 0x61) || ch > 0xff5a || (ch > 0xff31 && ch < 0xff41)) { return -1; } if (ch <= 0x7a) { // ch >= 0x41 or ch < 0x61 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); } // ch >= 0xff21 if (ch <= 0xff3a) { return ch + 10 - 0xff21; } // ch >= 0xff41 && ch <= 0xff5a return ch + 10 - 0xff41; } /** * Gets the numeric type of the property argument * @param props 32 bit property * @return the numeric type */ private static int getNumericType(int props) { return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_; } /** * Checks if the property value has a exception indicator * @param props 32 bit property value * @return true if property does not have a exception indicator, false * otherwise */ private static boolean isNotExceptionIndicator(int props) { return (props & UCharacterProperty.EXCEPTION_MASK) == 0; } /** * Gets the property value at the index. * This is optimized. * Note this is alittle different from CharTrie the index m_trieData_ * is never negative. * This is a duplicate of UCharacterProperty.getProperty. For optimization * purposes, this method calls the trie data directly instead of through * UCharacterProperty.getProperty. 
* @param ch code point whose property value is to be retrieved * @return property value of code point * @stable ICU 2.6 */ private static int getProperty(int ch) { if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { // BMP codepoint try { // using try for < 0 ch is faster than using an if statement return PROPERTY_DATA_[ PROPERTY_TRIE_DATA_[ (PROPERTY_TRIE_INDEX_[ch >> 5] << 2) + (ch & 0x1f)]]; } catch (ArrayIndexOutOfBoundsException e) { return PROPERTY_INITIAL_VALUE_; } } if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { // surrogate return PROPERTY_DATA_[ PROPERTY_TRIE_DATA_[ (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2) + (ch & 0x1f)]]; } // for optimization if (ch <= UTF16.CODEPOINT_MAX_VALUE) { // look at the construction of supplementary characters // trail forms the ends of it. return PROPERTY_DATA_[PROPERTY_.m_trie_.getSurrogateValue( UTF16.getLeadSurrogate(ch), (char)(ch & 0x3ff))]; } // return m_dataOffset_ if there is an error, in this case we return // the default value: m_initialValue_ // we cannot assume that m_initialValue_ is at offset 0 // this is for optimization. return PROPERTY_INITIAL_VALUE_; } }