/*
 * Portions Copyright 2005-2009 Sun Microsystems, Inc.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */
/*
 *******************************************************************************
 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.                                                                 *
 *******************************************************************************
 */

package sun.text.normalizer;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.MissingResourceException;

/**
* <p>Internal class used for the Unicode character property database.</p>
* <p>This class stores binary data read from uprops.icu.
* It does not have the capability to parse the data into higher-level
* information. It only returns bytes of information when required.</p>
* <p>Due to the form most commonly used for retrieval, an array of char is
* used to store the binary data.</p>
* <p>UCharacterProperty also contains information on accessing indexes to
* significant points in the binary data.</p>
* <p>Responsibility for molding the binary data into a more meaningful form
* lies with <a href="UCharacter.html">UCharacter</a>.</p>
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
*/

public final class UCharacterProperty
{
    // public data members -----------------------------------------------

    /**
    * Main property trie. getProperty uses its surrogate lookup for
    * supplementary code points.
    */
    public CharTrie m_trie_;
    /**
     * Optimization:
     * index array of the main trie, assigned in setIndexData so that
     * getProperty can index it directly.
     */
    public char[] m_trieIndex_;
    /**
     * Optimization:
     * data array of the main trie, assigned in setIndexData.
     */
    public char[] m_trieData_;
    /**
     * Optimization:
     * initial value of the main trie, assigned in setIndexData.
     * getProperty returns it when a lookup is out of bounds.
     */
    public int m_trieInitialValue_;
    /**
    * Unicode version
    */
    public VersionInfo m_unicodeVersion_;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** Property source: from uchar.c/uprops.icu properties vectors trie. */
    public static final int SRC_PROPSVEC=2;
    /** One more than the highest UPropertySource (SRC_) constant. */
    public static final int SRC_COUNT=9;

    // public methods ----------------------------------------------------

    /**
     * Caches the internals of a CharTrie on this instance ("Java friends"
     * implementation): its index array, its data array and its initial
     * value are copied into the corresponding m_trie*_ fields.
     * @param friendagent accessor exposing the trie's private index array,
     *                    private data array and private initial value
     */
    public void setIndexData(CharTrie.FriendAgent friendagent) {
        this.m_trieIndex_ = friendagent.getPrivateIndex();
        this.m_trieData_ = friendagent.getPrivateData();
        this.m_trieInitialValue_ = friendagent.getPrivateInitialValue();
    }

    /**
    * Gets the property value at the index.
    * This is optimized.
    * Note this is alittle different from CharTrie the index m_trieData_
    * is never negative.
    * @param ch code point whose property value is to be retrieved
    * @return property value of code point
    */
P
peytoia 已提交
114
    public final int getProperty(int ch)
D
duke 已提交
115 116 117 118
    {
        if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
            || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
                && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
P
peytoia 已提交
119
            // BMP codepoint 0000..D7FF or DC00..FFFF
D
duke 已提交
120
            // optimized
P
peytoia 已提交
121 122
            try { // using try for ch < 0 is faster than using an if statement
                return m_trieData_[
D
duke 已提交
123 124
                    (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
                          << Trie.INDEX_STAGE_2_SHIFT_)
P
peytoia 已提交
125
                    + (ch & Trie.INDEX_STAGE_3_MASK_)];
D
duke 已提交
126
            } catch (ArrayIndexOutOfBoundsException e) {
P
peytoia 已提交
127
                return m_trieInitialValue_;
D
duke 已提交
128 129 130
            }
        }
        if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
P
peytoia 已提交
131 132
            // lead surrogate D800..DBFF
            return m_trieData_[
D
duke 已提交
133 134 135
                    (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
                                  + (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
                          << Trie.INDEX_STAGE_2_SHIFT_)
P
peytoia 已提交
136
                    + (ch & Trie.INDEX_STAGE_3_MASK_)];
D
duke 已提交
137 138
        }
        if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
P
peytoia 已提交
139
            // supplementary code point 10000..10FFFF
D
duke 已提交
140 141
            // look at the construction of supplementary characters
            // trail forms the ends of it.
P
peytoia 已提交
142
            return m_trie_.getSurrogateValue(
D
duke 已提交
143
                                          UTF16.getLeadSurrogate(ch),
P
peytoia 已提交
144
                                          (char)(ch & Trie.SURROGATE_MASK_));
D
duke 已提交
145
        }
P
peytoia 已提交
146
        // ch is out of bounds
D
duke 已提交
147 148 149 150
        // return m_dataOffset_ if there is an error, in this case we return
        // the default value: m_initialValue_
        // we cannot assume that m_initialValue_ is at offset 0
        // this is for optimization.
P
peytoia 已提交
151 152 153
        return m_trieInitialValue_;

        // this all is an inlined form of return m_trie_.getCodePointValue(ch);
D
duke 已提交
154 155 156
    }

    /**
P
peytoia 已提交
157
    * Getting the unsigned numeric value of a character embedded in the property
D
duke 已提交
158 159
    * argument
    * @param prop the character
P
peytoia 已提交
160
    * @return unsigned numberic value
D
duke 已提交
161
    */
P
peytoia 已提交
162
    public static int getUnsignedValue(int prop)
D
duke 已提交
163 164 165 166 167 168 169 170 171
    {
        return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
    }

    /**
     * Gets the unicode additional properties.
     * C version getUnicodeProperties.
     * @param codepoint codepoint whose additional properties is to be
     *                  retrieved
P
peytoia 已提交
172
     * @param column
D
duke 已提交
173 174
     * @return unicode properties
     */
P
peytoia 已提交
175 176 177 178 179 180 181 182 183
       public int getAdditional(int codepoint, int column) {
        if (column == -1) {
            return getProperty(codepoint);
        }
           if (column < 0 || column >= m_additionalColumnsCount_) {
           return 0;
       }
       return m_additionalVectors_[
                     m_additionalTrie_.getCodePointValue(codepoint) + column];
D
duke 已提交
184 185
       }

P
peytoia 已提交
186
       /**
D
duke 已提交
187 188 189 190 191 192 193 194 195 196 197 198 199
     * <p>Get the "age" of the code point.</p>
     * <p>The "age" is the Unicode version when the code point was first
     * designated (as a non-character or for Private Use) or assigned a
     * character.</p>
     * <p>This can be useful to avoid emitting code points to receiving
     * processes that do not accept newer characters.</p>
     * <p>The data is from the UCD file DerivedAge.txt.</p>
     * <p>This API does not check the validity of the codepoint.</p>
     * @param codepoint The code point.
     * @return the Unicode version number
     */
    public VersionInfo getAge(int codepoint)
    {
P
peytoia 已提交
200
        int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
D
duke 已提交
201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
        return VersionInfo.getInstance(
                           (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
                           version & LAST_NIBBLE_MASK_, 0, 0);
    }

    /**
    * Combines a surrogate pair into a supplementary code point.<br>
    * This is for internal use, so the arguments are not checked for being
    * valid surrogate characters.
    * @param lead lead surrogate character
    * @param trail trailing surrogate character
    * @return code point of the supplementary character
    */
    public static int getRawSupplementary(char lead, char trail)
    {
        final int shiftedLead = lead << LEAD_SURROGATE_SHIFT_;
        return shiftedLead + trail + SURROGATE_OFFSET_;
    }

    /**
    * Loads the property data and initialize the UCharacterProperty instance.
P
peytoia 已提交
221
    * @throws MissingResourceException when data is missing or data has been corrupted
D
duke 已提交
222
    */
P
peytoia 已提交
223
    public static UCharacterProperty getInstance()
D
duke 已提交
224
    {
P
peytoia 已提交
225
        if(INSTANCE_ == null) {
D
duke 已提交
226 227 228 229
            try {
                INSTANCE_ = new UCharacterProperty();
            }
            catch (Exception e) {
P
peytoia 已提交
230
                throw new MissingResourceException(e.getMessage(),"","");
D
duke 已提交
231 232 233 234 235 236 237 238
            }
        }
        return INSTANCE_;
    }

    /**
     * Tells whether a code point counts as white space for ICU rule
     * parsers. Such white space is usually ignored in rules unless quoted.
     * Equivalent to a test for the Pattern_White_Space Unicode property,
     * which is a stable set of characters that won't change.
     * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
     * @param c codepoint to check
     * @return true if c is an ICU rule white space
     */
    public static boolean isRuleWhiteSpace(int c)
    {
        /* This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES:
           U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
        */
        // everything outside U+0009..U+2029 is rejected up front
        if (c < 0x0009 || c > 0x2029) {
            return false;
        }
        // the two contiguous runs at either end of that window
        if (c <= 0x000D || c >= 0x2028) {
            return true;
        }
        // the isolated members in between
        switch (c) {
        case 0x0020:
        case 0x0085:
        case 0x200E:
        case 0x200F:
            return true;
        default:
            return false;
        }
    }

    // protected variables -----------------------------------------------
    // NOTE(review): the fields below carry no access modifier, so they are
    // package-private rather than protected.

    /**
     * Extra property trie. getAdditional uses the value it returns for a
     * code point as the base index into m_additionalVectors_.
     */
    CharTrie m_additionalTrie_;
    /**
     * Extra property vectors, 1st column for age and second for binary
     * properties.
     */
    int m_additionalVectors_[];
    /**
     * Number of additional columns. getAdditional returns 0 for any column
     * not below this count.
     */
    int m_additionalColumnsCount_;
    /**
     * Maximum values for block, bits used as in vector word
     * 0
     */
    int m_maxBlockScriptValue_;
    /**
     * Maximum values for script, bits used as in vector word
     * 0
     * NOTE(review): the field name suggests something other than script
     * maxima - confirm against the code that fills this field.
     */
     int m_maxJTGValue_;

    // private variables -------------------------------------------------

      /**
     * UnicodeData.txt property object
     */
    private static UCharacterProperty INSTANCE_ = null;

    /**
    * Default name of the datafile
    */
    private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";

    /**
    * Default buffer size of datafile
    */
    private static final int DATA_BUFFER_SIZE_ = 25000;

    /**
    * Numeric value shift
    */
P
peytoia 已提交
304
    private static final int VALUE_SHIFT_ = 8;
D
duke 已提交
305 306 307 308

    /**
    * Mask to be applied after shifting to obtain an unsigned numeric value
    */
P
peytoia 已提交
309
    private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
D
duke 已提交
310 311 312 313 314 315 316 317 318 319 320 321 322 323

    /**
    * Shift value for lead surrogate to form a supplementary character.
    */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;
    /**
    * Offset to add to combined surrogate pair to avoid msking.
    */
    private static final int SURROGATE_OFFSET_ =
                           UTF16.SUPPLEMENTARY_MIN_VALUE -
                           (UTF16.SURROGATE_MIN_VALUE <<
                           LEAD_SURROGATE_SHIFT_) -
                           UTF16.TRAIL_SURROGATE_MIN_VALUE;

P
peytoia 已提交
324
    // additional properties ----------------------------------------------
D
duke 已提交
325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342

    /**
     * First nibble shift
     */
    private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
    /**
     * Second nibble mask
     */
    private static final int LAST_NIBBLE_MASK_ = 0xF;
    /**
     * Age value shift
     */
    private static final int AGE_SHIFT_ = 24;

    // private constructors --------------------------------------------------

    /**
    * Constructor. Opens the data file named by DATA_FILE_NAME_, lets a
    * UCharacterPropertyReader fill this instance from it, and then hands
    * this instance to the main trie via putIndexData.
    * @exception IOException thrown when data reading fails or data corrupted
    */
    private UCharacterProperty() throws IOException
    {
        // jar access
        InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
        BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
        try {
            UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
            reader.read(this);
        } finally {
            // close the stream even when reading fails, so it is not leaked
            b.close();
        }

        m_trie_.putIndexData(this);
    }

    public void upropsvec_addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the properties vectors trie */
        if(m_additionalColumnsCount_>0) {
            /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
            TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
            RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
            while(propsVectorsIter.next(propsVectorsResult)){
                set.add(propsVectorsResult.start);
D
duke 已提交
365 366 367 368 369
            }
        }
    }

}