提交 59e20a9f 编写于 作者: P peytoia

6404304: RFE: Unicode 5.1 support

Reviewed-by: okutsu, naoto
上级 84a49425
......@@ -92,11 +92,11 @@ FILES_java = \
sun/text/normalizer/SymbolTable.java \
sun/text/normalizer/Trie.java \
sun/text/normalizer/TrieIterator.java \
sun/text/normalizer/UBiDiProps.java \
sun/text/normalizer/UCharacter.java \
sun/text/normalizer/UCharacterIterator.java \
sun/text/normalizer/UCharacterProperty.java \
sun/text/normalizer/UCharacterPropertyReader.java \
sun/text/normalizer/UProperty.java \
sun/text/normalizer/UTF16.java \
sun/text/normalizer/UnicodeMatcher.java \
sun/text/normalizer/UnicodeSet.java \
......
......@@ -64,7 +64,8 @@ BIFILES = $(TEXT_CLASSDIR)/CharacterBreakIteratorData \
$(TEXT_CLASSDIR)/SentenceBreakIteratorData
ICU_FILES = $(TEXT_CLASSDIR)/unorm.icu \
$(TEXT_CLASSDIR)/uprops.icu
$(TEXT_CLASSDIR)/uprops.icu \
$(TEXT_CLASSDIR)/ubidi.icu
# builder
GENERATEBREAKITERATORDATA_JARFILE = \
......@@ -89,7 +90,7 @@ $(BIFILES): $(GENERATEBREAKITERATORDATA_JARFILE) \
build: $(BIFILES) $(ICU_FILES)
#
# Extra rules to copy unorm.icu and uprops.icu
# Extra rules to copy unorm.icu, uprops.icu, and ubidi.icu
#
$(TEXT_CLASSDIR)/unorm.icu: $(TEXT_SRCDIR)/unorm.icu
$(install-file)
......@@ -97,6 +98,9 @@ $(TEXT_CLASSDIR)/unorm.icu: $(TEXT_SRCDIR)/unorm.icu
$(TEXT_CLASSDIR)/uprops.icu: $(TEXT_SRCDIR)/uprops.icu
$(install-file)
$(TEXT_CLASSDIR)/ubidi.icu: $(TEXT_SRCDIR)/ubidi.icu
$(install-file)
clean clobber::
$(RM) -r $(TEXT_CLASSES)
$(RM) -r $(BIFILES)
......
......@@ -144,6 +144,55 @@ class CharacterData00 extends CharacterData {
case 0x1FBC : mapChar = 0x1FB3; break;
case 0x1FCC : mapChar = 0x1FC3; break;
case 0x1FFC : mapChar = 0x1FF3; break;
case 0x023A : mapChar = 0x2C65; break;
case 0x023E : mapChar = 0x2C66; break;
case 0x10A0 : mapChar = 0x2D00; break;
case 0x10A1 : mapChar = 0x2D01; break;
case 0x10A2 : mapChar = 0x2D02; break;
case 0x10A3 : mapChar = 0x2D03; break;
case 0x10A4 : mapChar = 0x2D04; break;
case 0x10A5 : mapChar = 0x2D05; break;
case 0x10A6 : mapChar = 0x2D06; break;
case 0x10A7 : mapChar = 0x2D07; break;
case 0x10A8 : mapChar = 0x2D08; break;
case 0x10A9 : mapChar = 0x2D09; break;
case 0x10AA : mapChar = 0x2D0A; break;
case 0x10AB : mapChar = 0x2D0B; break;
case 0x10AC : mapChar = 0x2D0C; break;
case 0x10AD : mapChar = 0x2D0D; break;
case 0x10AE : mapChar = 0x2D0E; break;
case 0x10AF : mapChar = 0x2D0F; break;
case 0x10B0 : mapChar = 0x2D10; break;
case 0x10B1 : mapChar = 0x2D11; break;
case 0x10B2 : mapChar = 0x2D12; break;
case 0x10B3 : mapChar = 0x2D13; break;
case 0x10B4 : mapChar = 0x2D14; break;
case 0x10B5 : mapChar = 0x2D15; break;
case 0x10B6 : mapChar = 0x2D16; break;
case 0x10B7 : mapChar = 0x2D17; break;
case 0x10B8 : mapChar = 0x2D18; break;
case 0x10B9 : mapChar = 0x2D19; break;
case 0x10BA : mapChar = 0x2D1A; break;
case 0x10BB : mapChar = 0x2D1B; break;
case 0x10BC : mapChar = 0x2D1C; break;
case 0x10BD : mapChar = 0x2D1D; break;
case 0x10BE : mapChar = 0x2D1E; break;
case 0x10BF : mapChar = 0x2D1F; break;
case 0x10C0 : mapChar = 0x2D20; break;
case 0x10C1 : mapChar = 0x2D21; break;
case 0x10C2 : mapChar = 0x2D22; break;
case 0x10C3 : mapChar = 0x2D23; break;
case 0x10C4 : mapChar = 0x2D24; break;
case 0x10C5 : mapChar = 0x2D25; break;
case 0x1E9E : mapChar = 0x00DF; break;
case 0x2C62 : mapChar = 0x026B; break;
case 0x2C63 : mapChar = 0x1D7D; break;
case 0x2C64 : mapChar = 0x027D; break;
case 0x2C6D : mapChar = 0x0251; break;
case 0x2C6E : mapChar = 0x0271; break;
case 0x2C6F : mapChar = 0x0250; break;
case 0xA77D : mapChar = 0x1D79; break;
// default mapChar is already set, so no
// need to redo it here.
// default : mapChar = ch;
......@@ -196,6 +245,54 @@ class CharacterData00 extends CharacterData {
case 0x1FB3 : mapChar = 0x1FBC; break;
case 0x1FC3 : mapChar = 0x1FCC; break;
case 0x1FF3 : mapChar = 0x1FFC; break;
case 0x0250 : mapChar = 0x2C6F; break;
case 0x0251 : mapChar = 0x2C6D; break;
case 0x026B : mapChar = 0x2C62; break;
case 0x0271 : mapChar = 0x2C6E; break;
case 0x027D : mapChar = 0x2C64; break;
case 0x1D79 : mapChar = 0xA77D; break;
case 0x1D7D : mapChar = 0x2C63; break;
case 0x2C65 : mapChar = 0x023A; break;
case 0x2C66 : mapChar = 0x023E; break;
case 0x2D00 : mapChar = 0x10A0; break;
case 0x2D01 : mapChar = 0x10A1; break;
case 0x2D02 : mapChar = 0x10A2; break;
case 0x2D03 : mapChar = 0x10A3; break;
case 0x2D04 : mapChar = 0x10A4; break;
case 0x2D05 : mapChar = 0x10A5; break;
case 0x2D06 : mapChar = 0x10A6; break;
case 0x2D07 : mapChar = 0x10A7; break;
case 0x2D08 : mapChar = 0x10A8; break;
case 0x2D09 : mapChar = 0x10A9; break;
case 0x2D0A : mapChar = 0x10AA; break;
case 0x2D0B : mapChar = 0x10AB; break;
case 0x2D0C : mapChar = 0x10AC; break;
case 0x2D0D : mapChar = 0x10AD; break;
case 0x2D0E : mapChar = 0x10AE; break;
case 0x2D0F : mapChar = 0x10AF; break;
case 0x2D10 : mapChar = 0x10B0; break;
case 0x2D11 : mapChar = 0x10B1; break;
case 0x2D12 : mapChar = 0x10B2; break;
case 0x2D13 : mapChar = 0x10B3; break;
case 0x2D14 : mapChar = 0x10B4; break;
case 0x2D15 : mapChar = 0x10B5; break;
case 0x2D16 : mapChar = 0x10B6; break;
case 0x2D17 : mapChar = 0x10B7; break;
case 0x2D18 : mapChar = 0x10B8; break;
case 0x2D19 : mapChar = 0x10B9; break;
case 0x2D1A : mapChar = 0x10BA; break;
case 0x2D1B : mapChar = 0x10BB; break;
case 0x2D1C : mapChar = 0x10BC; break;
case 0x2D1D : mapChar = 0x10BD; break;
case 0x2D1E : mapChar = 0x10BE; break;
case 0x2D1F : mapChar = 0x10BF; break;
case 0x2D20 : mapChar = 0x10C0; break;
case 0x2D21 : mapChar = 0x10C1; break;
case 0x2D22 : mapChar = 0x10C2; break;
case 0x2D23 : mapChar = 0x10C3; break;
case 0x2D24 : mapChar = 0x10C4; break;
case 0x2D25 : mapChar = 0x10C5; break;
// ch must have a 1:M case mapping, but we
// can't handle it here. Return ch.
// since mapChar is already set, no need
......@@ -315,6 +412,12 @@ class CharacterData00 extends CharacterData {
case 0x32BE: retval = 49; break; // CIRCLED NUMBER FORTY NINE
case 0x32BF: retval = 50; break; // CIRCLED NUMBER FIFTY
case 0x0D71: retval = 100; break; // MALAYALAM NUMBER ONE HUNDRED
case 0x0D72: retval = 1000; break; // MALAYALAM NUMBER ONE THOUSAND
case 0x2186: retval = 50; break; // ROMAN NUMERAL FIFTY EARLY FORM
case 0x2187: retval = 50000; break; // ROMAN NUMERAL FIFTY THOUSAND
case 0x2188: retval = 100000; break; // ROMAN NUMERAL ONE HUNDRED THOUSAND
default: retval = -2; break;
}
break;
......@@ -383,6 +486,54 @@ class CharacterData00 extends CharacterData {
case 0x00B5 : mapChar = 0x039C; break;
case 0x017F : mapChar = 0x0053; break;
case 0x1FBE : mapChar = 0x0399; break;
case 0x0250 : mapChar = 0x2C6F; break;
case 0x0251 : mapChar = 0x2C6D; break;
case 0x026B : mapChar = 0x2C62; break;
case 0x0271 : mapChar = 0x2C6E; break;
case 0x027D : mapChar = 0x2C64; break;
case 0x1D79 : mapChar = 0xA77D; break;
case 0x1D7D : mapChar = 0x2C63; break;
case 0x2C65 : mapChar = 0x023A; break;
case 0x2C66 : mapChar = 0x023E; break;
case 0x2D00 : mapChar = 0x10A0; break;
case 0x2D01 : mapChar = 0x10A1; break;
case 0x2D02 : mapChar = 0x10A2; break;
case 0x2D03 : mapChar = 0x10A3; break;
case 0x2D04 : mapChar = 0x10A4; break;
case 0x2D05 : mapChar = 0x10A5; break;
case 0x2D06 : mapChar = 0x10A6; break;
case 0x2D07 : mapChar = 0x10A7; break;
case 0x2D08 : mapChar = 0x10A8; break;
case 0x2D09 : mapChar = 0x10A9; break;
case 0x2D0A : mapChar = 0x10AA; break;
case 0x2D0B : mapChar = 0x10AB; break;
case 0x2D0C : mapChar = 0x10AC; break;
case 0x2D0D : mapChar = 0x10AD; break;
case 0x2D0E : mapChar = 0x10AE; break;
case 0x2D0F : mapChar = 0x10AF; break;
case 0x2D10 : mapChar = 0x10B0; break;
case 0x2D11 : mapChar = 0x10B1; break;
case 0x2D12 : mapChar = 0x10B2; break;
case 0x2D13 : mapChar = 0x10B3; break;
case 0x2D14 : mapChar = 0x10B4; break;
case 0x2D15 : mapChar = 0x10B5; break;
case 0x2D16 : mapChar = 0x10B6; break;
case 0x2D17 : mapChar = 0x10B7; break;
case 0x2D18 : mapChar = 0x10B8; break;
case 0x2D19 : mapChar = 0x10B9; break;
case 0x2D1A : mapChar = 0x10BA; break;
case 0x2D1B : mapChar = 0x10BB; break;
case 0x2D1C : mapChar = 0x10BC; break;
case 0x2D1D : mapChar = 0x10BD; break;
case 0x2D1E : mapChar = 0x10BE; break;
case 0x2D1F : mapChar = 0x10BF; break;
case 0x2D20 : mapChar = 0x10C0; break;
case 0x2D21 : mapChar = 0x10C1; break;
case 0x2D22 : mapChar = 0x10C2; break;
case 0x2D23 : mapChar = 0x10C3; break;
case 0x2D24 : mapChar = 0x10C4; break;
case 0x2D25 : mapChar = 0x10C5; break;
default : mapChar = Character.ERROR; break;
}
}
......
......@@ -218,6 +218,48 @@ class CharacterData01 extends CharacterData {
case 0x10132: retval = 80000; break; // AEGEAN NUMBER EIGHTY THOUSAND
case 0x10133: retval = 90000; break; // AEGEAN NUMBER NINETY THOUSAND
case 0x10323: retval = 50; break; // OLD ITALIC NUMERAL FIFTY
case 0x010144: retval = 50; break; // ACROPHONIC ATTIC FIFTY
case 0x010145: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED
case 0x010146: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND
case 0x010147: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND
case 0x01014A: retval = 50; break; // ACROPHONIC ATTIC FIFTY TALENTS
case 0x01014B: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED TALENTS
case 0x01014C: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED TALENTS
case 0x01014D: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND TALENTS
case 0x01014E: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND TALENTS
case 0x010151: retval = 50; break; // ACROPHONIC ATTIC FIFTY STATERS
case 0x010152: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED STATERS
case 0x010153: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED STATERS
case 0x010154: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND STATERS
case 0x010155: retval = 10000; break; // ACROPHONIC ATTIC TEN THOUSAND STATERS
case 0x010156: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND STATERS
case 0x010166: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY
case 0x010167: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY ALTERNATE FORM
case 0x010168: retval = 50; break; // ACROPHONIC HERMIONIAN FIFTY
case 0x010169: retval = 50; break; // ACROPHONIC THESPIAN FIFTY
case 0x01016A: retval = 100; break; // ACROPHONIC THESPIAN ONE HUNDRED
case 0x01016B: retval = 300; break; // ACROPHONIC THESPIAN THREE HUNDRED
case 0x01016C: retval = 500; break; // ACROPHONIC EPIDAUREAN FIVE HUNDRED
case 0x01016D: retval = 500; break; // ACROPHONIC TROEZENIAN FIVE HUNDRED
case 0x01016E: retval = 500; break; // ACROPHONIC THESPIAN FIVE HUNDRED
case 0x01016F: retval = 500; break; // ACROPHONIC CARYSTIAN FIVE HUNDRED
case 0x010170: retval = 500; break; // ACROPHONIC NAXIAN FIVE HUNDRED
case 0x010171: retval = 1000; break; // ACROPHONIC THESPIAN ONE THOUSAND
case 0x010172: retval = 5000; break; // ACROPHONIC THESPIAN FIVE THOUSAND
case 0x010174: retval = 50; break; // ACROPHONIC STRATIAN FIFTY MNAS
case 0x010341: retval = 90; break; // GOTHIC LETTER NINETY
case 0x01034A: retval = 900; break; // GOTHIC LETTER NINE HUNDRED
case 0x0103D5: retval = 100; break; // OLD PERSIAN NUMBER HUNDRED
case 0x010919: retval = 100; break; // PHOENICIAN NUMBER ONE HUNDRED
case 0x010A46: retval = 100; break; // KHAROSHTHI NUMBER ONE HUNDRED
case 0x010A47: retval = 1000; break; // KHAROSHTHI NUMBER ONE THOUSAND
case 0x01D36C: retval = 40; break; // COUNTING ROD TENS DIGIT FOUR
case 0x01D36D: retval = 50; break; // COUNTING ROD TENS DIGIT FIVE
case 0x01D36E: retval = 60; break; // COUNTING ROD TENS DIGIT SIX
case 0x01D36F: retval = 70; break; // COUNTING ROD TENS DIGIT SEVEN
case 0x01D370: retval = 80; break; // COUNTING ROD TENS DIGIT EIGHT
case 0x01D371: retval = 90; break; // COUNTING ROD TENS DIGIT NINE
default: retval = -2; break;
}
......
# SpecialCasing-4.0.0.txt
# Date: 2003-03-14, 20:22:04 GMT [MD]
# SpecialCasing-5.1.0.txt
# Date: 2008-03-03, 21:58:10 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2008 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see UCD.html
#
# Special Casing Properties
#
# This file is a supplement to the UnicodeData file.
# It contains additional information about the casing of Unicode characters.
# (For compatibility, the UnicodeData.txt file only contains case mappings for
# characters where they are 1-1, and does not have locale-specific mappings.)
# characters where they are 1-1, and independent of context and language.
# For more information, see the discussion of Case Mappings in the Unicode Standard.
#
# All code points not listed in this file that do not have a simple case mappings
......@@ -18,31 +23,31 @@
#
# <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? # <comment>
#
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more than
# one character, they are separated by spaces. Other than as used to separate elements,
# spaces are to be ignored.
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more
# than one character, they are separated by spaces. Other than as used to separate
# elements, spaces are to be ignored.
#
# The <condition_list> is optional. Where present, it consists of one or more locales or contexts,
# separated by spaces. In these conditions:
# The <condition_list> is optional. Where present, it consists of one or more language IDs
# or contexts, separated by spaces. In these conditions:
# - A condition list overrides the normal behavior if all of the listed conditions are true.
# - The context is always the context of the characters in the original string,
# NOT in the resulting string.
# - Case distinctions in the condition list are not significant.
# - Conditions preceded by "Not_" represent the negation of the condition.
# The condition list is not represented in the UCD as a formal property.
#
# A locale is defined as:
# <locale> := <ISO_639_code> ( "_" <ISO_3166_code> ( "_" <variant> )? )?
# <ISO_3166_code> := 2-letter ISO country code,
# <ISO_639_code> := 2-letter ISO language code
# A language ID is defined by BCP 47, with '-' and '_' treated equivalently.
#
# A context is one of the following, as defined in the Unicode Standard:
# Final_Sigma, After_Soft_Dotted, More_Above, Before_Dot, Not_Before_Dot, After_I
# A context for a character C is defined by Section 3.13 Default Case
# Operations, of The Unicode Standard, Version 5.0.
# (This is identical to the context defined by Unicode 4.1.0,
# as specified in http://www.unicode.org/versions/Unicode4.1.0/)
#
# Parsers of this file must be prepared to deal with future additions to this format:
# * Additional contexts
# * Additional fields
# ================================================================================
# @missing 0000..10FFFF; <slc>; <stc>; <suc>
# ================================================================================
# Unconditional mappings
# ================================================================================
......@@ -170,7 +175,7 @@ FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
# Some characters with YPOGEGRAMMENI are also have no corresponding titlecases
# Some characters with YPOGEGRAMMENI also have no corresponding titlecases
1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
......@@ -184,7 +189,14 @@ FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
# ================================================================================
# Conditional mappings
# Conditional Mappings
# The remainder of this file provides conditional casing data used to produce
# full case mappings.
# ================================================================================
# Language-Insensitive Mappings
# These are characters whose full case mappings do not depend on language, but do
# depend on context (which characters come before or after). For more information
# see the header of this file and the Unicode Standard.
# ================================================================================
# Special case for final form of sigma
......@@ -203,7 +215,10 @@ FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
# 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA
# ================================================================================
# Locale-sensitive mappings
# Language-Sensitive Mappings
# These are characters whose full case mappings depend on language and perhaps also
# context (which characters come before or after). For more information
# see the header of this file and the Unicode Standard.
# ================================================================================
# Lithuanian
......@@ -254,3 +269,6 @@ FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH
# Note: the following case is already in the UnicodeData file.
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
# EOF
......@@ -74,6 +74,7 @@ final class ConditionalSpecialCasing {
new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
new Entry(0x0130, new char[]{0x0069, 0x0307}, new char[]{0x0130}, "lt", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
//# ================================================================================
//# Turkish and Azeri
......@@ -84,7 +85,10 @@ final class ConditionalSpecialCasing {
new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I
new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I
new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN SMALL LETTER I
//# ================================================================================
//# Other
new Entry(0x0130, new char[]{0x0069, 0x0307}, new char[]{0x0130}, "en", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
};
// A hash table that contains the above entries
......
......@@ -2451,14 +2451,21 @@ public final class String
}
if (localeDependent || srcChar == '\u03A3') { // GREEK CAPITAL LETTER SIGMA
lowerChar = ConditionalSpecialCasing.toLowerCaseEx(this, i, locale);
} else if (srcChar == '\u0130') { // LATIN CAPITAL LETTER I DOT
lowerChar = Character.ERROR;
} else {
lowerChar = Character.toLowerCase(srcChar);
}
if ((lowerChar == Character.ERROR) ||
(lowerChar >= Character.MIN_SUPPLEMENTARY_CODE_POINT)) {
if (lowerChar == Character.ERROR) {
lowerCharArray =
ConditionalSpecialCasing.toLowerCaseCharArray(this, i, locale);
if (!localeDependent && srcChar == '\u0130') {
lowerCharArray =
ConditionalSpecialCasing.toLowerCaseCharArray(this, i, Locale.ENGLISH);
} else {
lowerCharArray =
ConditionalSpecialCasing.toLowerCaseCharArray(this, i, locale);
}
} else if (srcCount == 2) {
resultOffset += Character.toChars(lowerChar, result, i + resultOffset) - srcCount;
continue;
......
/*
* Portions Copyright 2003-2005 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -77,6 +76,66 @@ public class CharTrie extends Trie
m_friendAgent_ = new FriendAgent();
}
/**
 * Builds a placeholder ("dummy") CharTrie.
 * Such a trie carries no real data and stands in when an actual data trie
 * could not be loaded.
 *
 * Every lookup yields initialValue, except that lead surrogate code units
 * yield leadUnitValue. The Latin-1 range is laid out linearly.
 *
 * @param initialValue the value stored for all code points
 * @param leadUnitValue the value stored for lead surrogate code _units_
 *                      that have no associated supplementary data
 * @param dataManipulate object which provides methods to parse the char data
 */
public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
    super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);

    // A second data block is only needed when lead surrogates map to a
    // value different from everything else.
    final boolean needsLeadBlock = (leadUnitValue != initialValue);

    // Size of the first data block: max(Latin-1, block 0).
    final int latin1Length = (INDEX_STAGE_1_SHIFT_ <= 8) ? 256 : DATA_BLOCK_LENGTH;
    final int totalLength = needsLeadBlock
                            ? latin1Length + DATA_BLOCK_LENGTH
                            : latin1Length;

    m_data_ = new char[totalLength];
    m_dataLength_ = totalLength;
    m_initialValue_ = (char) initialValue;

    // The freshly allocated index array is all zeros, i.e. every index
    // entry already refers to block 0; only the data needs filling.
    java.util.Arrays.fill(m_data_, 0, latin1Length, (char) initialValue);

    if (needsLeadBlock) {
        // Point the index entries for U+D800..U+DBFF at the block that
        // follows the Latin-1 data.
        final char leadBlock = (char) (latin1Length >> INDEX_STAGE_2_SHIFT_);
        final int firstLeadIndex = 0xd800 >> INDEX_STAGE_1_SHIFT_;
        final int endLeadIndex = 0xdc00 >> INDEX_STAGE_1_SHIFT_;
        java.util.Arrays.fill(m_index_, firstLeadIndex, endLeadIndex, leadBlock);

        // Fill that block with the lead surrogate value.
        java.util.Arrays.fill(m_data_, latin1Length,
                              latin1Length + DATA_BLOCK_LENGTH,
                              (char) leadUnitValue);
    }

    m_friendAgent_ = new FriendAgent();
}
/**
* Java friend implementation
*/
......@@ -130,7 +189,18 @@ public class CharTrie extends Trie
*/
public final char getCodePointValue(int ch)
{
int offset = getCodePointOffset(ch);
int offset;
// fastpath for U+0000..U+D7FF
if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
// copy of getRawOffset()
offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_)
+ (ch & INDEX_STAGE_3_MASK_);
return m_data_[offset];
}
// handle U+D800..U+10FFFF
offset = getCodePointOffset(ch);
// return -1 if there is an error, in this case we return the default
// value: m_initialValue_
......
/*
* Portions Copyright 2001-2006 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -127,7 +126,7 @@ import java.text.Normalizer;
* normalize(FCD) may be implemented with NFD.
*
* For more details on FCD see the collation design document:
* http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/collation/ICU_collation_design.htm
* http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
*
* ICU collation performs either NFD or FCD normalization automatically if
* normalization is turned on for the collator object. Beyond collation and
......
/*
* Portions Copyright 2003-2006 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -331,7 +330,7 @@ final class NormalizerDataReader implements ICUBinary.Authenticate {
throws IOException{
//Read the bytes that make up the normTrie
dataInputStream.read(normBytes);
dataInputStream.readFully(normBytes);
//normTrieStream= new ByteArrayInputStream(normBytes);
......@@ -346,11 +345,11 @@ final class NormalizerDataReader implements ICUBinary.Authenticate {
}
//Read the fcdTrie
dataInputStream.read(fcdBytes);
dataInputStream.readFully(fcdBytes);
//Read the AuxTrie
dataInputStream.read(auxBytes);
dataInputStream.readFully(auxBytes);
}
public byte[] getDataFormatVersion(){
......
/*
* Portions Copyright 2003-2006 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -102,7 +101,7 @@ public final class NormalizerImpl {
private static final long MIN_SPECIAL = (long)(0xfc000000 & UNSIGNED_INT_MASK);
private static final long SURROGATES_TOP = (long)(0xfff00000 & UNSIGNED_INT_MASK);
private static final long MIN_HANGUL = (long)(0xfff00000 & UNSIGNED_INT_MASK);
private static final long MIN_JAMO_V = (long)(0xfff20000 & UNSIGNED_INT_MASK);
// private static final long MIN_JAMO_V = (long)(0xfff20000 & UNSIGNED_INT_MASK);
private static final long JAMO_V_TOP = (long)(0xfff30000 & UNSIGNED_INT_MASK);
......@@ -908,7 +907,7 @@ public final class NormalizerImpl {
buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx);
// compare the normalized version with the original
if(0!=strCompare(buffer,0,args.length,src,prevStarter,(srcStart-prevStarter), false)) {
if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) {
result=NormalizerBase.NO; // normalization differs
break;
}
......@@ -2291,7 +2290,7 @@ public final class NormalizerImpl {
private static final int OPTIONS_NX_MASK=0x1f;
private static final int OPTIONS_UNICODE_MASK=0xe0;
public static final int OPTIONS_SETS_MASK=0xff;
private static final int OPTIONS_UNICODE_SHIFT=5;
// private static final int OPTIONS_UNICODE_SHIFT=5;
private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1];
/* Constants for options flags for normalization.*/
......
/*
* Portions Copyright 2003-2005 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -37,10 +36,9 @@
package sun.text.normalizer;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.Arrays;
/**
* <p>A trie is a kind of compressed, serializable table of values
......@@ -81,7 +79,6 @@ public abstract class Trie
* This interface specifies methods to be implemented in order for
* com.ibm.impl.Trie, to surrogate offset information encapsulated within
* the data.
* @draft 2.1
*/
public static interface DataManipulate
{
......@@ -92,11 +89,17 @@ public abstract class Trie
* @param value data value for a surrogate from the trie, including the
* folding offset
* @return data offset or 0 if there is no data for the lead surrogate
* @draft 2.1
*/
public int getFoldingOffset(int value);
}
// Default DataManipulate implementation: uses the trie data value itself as
// the folding offset. The Trie constructors visible in this file install an
// instance of this class when they are given a null DataManipulate.
private static class DefaultGetFoldingOffset implements DataManipulate {
// Returns the supplied value unchanged.
public int getFoldingOffset(int value) {
return value;
}
}
// protected constructor -------------------------------------------
/**
......@@ -107,7 +110,6 @@ public abstract class Trie
* trie data
* @throws IOException thrown when input stream does not have the
* right header.
* @draft 2.1
*/
protected Trie(InputStream inputStream,
DataManipulate dataManipulate) throws IOException
......@@ -121,7 +123,11 @@ public abstract class Trie
throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
}
m_dataManipulate_ = dataManipulate;
if(dataManipulate != null) {
m_dataManipulate_ = dataManipulate;
} else {
m_dataManipulate_ = new DefaultGetFoldingOffset();
}
m_isLatin1Linear_ = (m_options_ &
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
m_dataOffset_ = input.readInt();
......@@ -135,19 +141,21 @@ public abstract class Trie
* @param options used by the trie
* @param dataManipulate object containing the information to parse the
* trie data
* @draft 2.2
*/
protected Trie(char index[], int options, DataManipulate dataManipulate)
{
// Initializes the trie from a caller-supplied index array and option bits.
m_options_ = options;
// NOTE(review): this unconditional assignment is overwritten by the
// if/else immediately below, so it has no lasting effect - confirm whether
// it is a leftover that should be dropped.
m_dataManipulate_ = dataManipulate;
// Never leave m_dataManipulate_ null: fall back to the pass-through
// DefaultGetFoldingOffset when the caller supplies no DataManipulate.
if(dataManipulate != null) {
m_dataManipulate_ = dataManipulate;
} else {
m_dataManipulate_ = new DefaultGetFoldingOffset();
}
// True when the LATIN1_IS_LINEAR bit is set in the options.
m_isLatin1Linear_ = (m_options_ &
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
m_index_ = index;
// The data offset is set to the length of the index array.
m_dataOffset_ = m_index_.length;
}
// protected data members ------------------------------------------
/**
......@@ -158,7 +166,6 @@ public abstract class Trie
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
/**
* Shift size for shifting right the input index. 1..9
* @draft 2.1
*/
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
/**
......@@ -168,31 +175,39 @@ public abstract class Trie
* This requires blocks of stage 2 data to be aligned by
* DATA_GRANULARITY.
* 0..INDEX_STAGE_1_SHIFT
* @draft 2.1
*/
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
/**
* Number of data values in a stage 2 (data array) block.
*/
protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
/**
* Mask for getting the lower bits from the input index.
* DATA_BLOCK_LENGTH_ - 1.
* @draft 2.1
* DATA_BLOCK_LENGTH - 1.
*/
protected static final int INDEX_STAGE_3_MASK_ =
(1 << INDEX_STAGE_1_SHIFT_) - 1;
protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
/** Number of bits of a trail surrogate that are used in index table lookups. */
protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
/**
* Number of index (stage 1) entries per lead surrogate.
* Same as number of index entries for 1024 trail surrogates,
* ==0x400>>INDEX_STAGE_1_SHIFT_
*/
protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
/** Length of the BMP portion of the index (stage 1) array. */
protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
/**
* Surrogate mask to use when shifting offset to retrieve supplementary
* values
* @draft 2.1
*/
protected static final int SURROGATE_MASK_ = 0x3FF;
/**
* Index or UTF16 characters
* @draft 2.1
*/
protected char m_index_[];
/**
* Internal TrieValue which handles the parsing of the data value.
* This class is to be implemented by the user
* @draft 2.1
*/
protected DataManipulate m_dataManipulate_;
/**
......@@ -200,7 +215,6 @@ public abstract class Trie
* index and data into a char array, so this is used to indicate the
* initial offset to the data portion.
* Note this index always points to the initial value.
* @draft 2.1
*/
protected int m_dataOffset_;
/**
......@@ -215,7 +229,6 @@ public abstract class Trie
* @param lead lead surrogate
* @param trail trailing surrogate
* @return offset to data
* @draft 2.1
*/
protected abstract int getSurrogateOffset(char lead, char trail);
......@@ -223,14 +236,12 @@ public abstract class Trie
* Gets the value at the argument index
* @param index value at index will be retrieved
* @return 32 bit value
* @draft 2.1
*/
protected abstract int getValue(int index);
/**
* Gets the default initial value
* @return 32 bit value
* @draft 2.1
*/
protected abstract int getInitialValue();
......@@ -247,7 +258,6 @@ public abstract class Trie
* @param offset index offset which ch is to start from
* @param ch index to be used after offset
* @return offset to the data
* @draft 2.1
*/
protected final int getRawOffset(int offset, char ch)
{
......@@ -261,7 +271,6 @@ public abstract class Trie
* Treats a lead surrogate as a normal code point.
* @param ch BMP character
* @return offset to data
* @draft 2.1
*/
protected final int getBMPOffset(char ch)
{
......@@ -279,7 +288,6 @@ public abstract class Trie
* the next trailing surrogate character.
* @param ch lead surrogate character
* @return offset to data
* @draft 2.1
*/
protected final int getLeadOffset(char ch)
{
......@@ -293,26 +301,27 @@ public abstract class Trie
* Gets the offset to data which the codepoint points to
* @param ch codepoint
* @return offset to data
* @draft 2.1
*/
protected final int getCodePointOffset(int ch)
{
// if ((ch >> 16) == 0) slower
if (ch >= UTF16.CODEPOINT_MIN_VALUE
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
if (ch < 0) {
return -1;
} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
return getRawOffset(0, (char)ch);
} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
// BMP codepoint
return getBMPOffset((char)ch);
}
// for optimization
if (ch >= UTF16.CODEPOINT_MIN_VALUE
&& ch <= UCharacter.MAX_VALUE) {
} else if (ch <= UCharacter.MAX_VALUE) {
// look at the construction of supplementary characters
// trail forms the ends of it.
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
(char)(ch & SURROGATE_MASK_));
} else {
// return -1 // if there is an error, in this case we return
return -1;
}
// return -1 if there is an error, in this case we return
return -1;
}
/**
......@@ -320,7 +329,6 @@ public abstract class Trie
* <p>This is overwritten by the child classes.
* @param inputStream input stream containing the trie information
* @exception IOException thrown when data reading fails.
* @draft 2.1
*/
protected void unserialize(InputStream inputStream) throws IOException
{
......@@ -335,7 +343,6 @@ public abstract class Trie
/**
* Determines if this is a 32 bit trie
* @return true if options specifies this is a 32 bit trie
* @draft 2.1
*/
protected final boolean isIntTrie()
{
......@@ -345,7 +352,6 @@ public abstract class Trie
/**
* Determines if this is a 16 bit trie
* @return true if this is a 16 bit trie
* @draft 2.1
*/
protected final boolean isCharTrie()
{
......@@ -354,40 +360,20 @@ public abstract class Trie
// private data members --------------------------------------------
/**
* Signature index
*/
private static final int HEADER_SIGNATURE_INDEX_ = 0;
/**
* Options index
*/
private static final int HEADER_OPTIONS_INDEX_ = 1 << 1;
/**
* Index length index
*/
private static final int HEADER_INDEX_LENGTH_INDEX_ = 2 << 1;
/**
* Data length index
*/
private static final int HEADER_DATA_LENGTH_INDEX_ = 3 << 1;
/**
* Size of header
*/
private static final int HEADER_LENGTH_ = 4 << 1;
/**
* Latin 1 option mask
*/
private static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
/**
* Constant number to authenticate the byte block
*/
private static final int HEADER_SIGNATURE_ = 0x54726965;
protected static final int HEADER_SIGNATURE_ = 0x54726965;
/**
* Header option formatting
*/
private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
private static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
private static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
/**
* Flag indicator for Latin quick access data block
......@@ -409,9 +395,8 @@ public abstract class Trie
/**
* Authenticates raw data header.
* Checking the header information, signature and options.
* @param rawdata array of char data to be checked
* @param signature This contains the options and type of a Trie
* @return true if the header is authenticated valid
* @draft 2.1
*/
private final boolean checkHeader(int signature)
{
......
/*
* Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -108,15 +107,14 @@ package sun.text.normalizer;
* @since release 2.1, Jan 17 2002
*/
public class TrieIterator implements RangeValueIterator
{
// public constructor ---------------------------------------------
/**
* TrieEnumeration constructor
* @param trie to be used
* @exception IllegalArgumentException throw when argument is null.
* @draft 2.1
*/
public TrieIterator(Trie trie)
{
......@@ -141,7 +139,6 @@ public class TrieIterator implements RangeValueIterator
* @return true if we are not at the end of the iteration, false otherwise.
* @exception NoSuchElementException - if no more elements exist.
* @see com.ibm.icu.util.RangeValueIterator.Element
* @draft 2.1
*/
public final boolean next(Element element)
{
......@@ -158,7 +155,6 @@ public class TrieIterator implements RangeValueIterator
/**
* Resets the iterator to the beginning of the iteration
* @draft 2.1
*/
public final void reset()
{
......@@ -186,7 +182,6 @@ public class TrieIterator implements RangeValueIterator
* The default function is to return the value as it is.
* @param value a value from the trie
* @return extracted value
* @draft 2.1
*/
protected int extract(int value)
{
......@@ -278,7 +273,6 @@ public class TrieIterator implements RangeValueIterator
* Note, if there are no more iterations, it will never get to here.
* Blocked out by next().
* @param element return result object
* @draft 2.1
*/
private final void calculateNextSupplementaryElement(Element element)
{
......@@ -516,10 +510,6 @@ public class TrieIterator implements RangeValueIterator
*/
private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
/**
* Trail surrogate maximum value
*/
private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
/**
* Number of trail surrogate
*/
private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
......@@ -538,11 +528,6 @@ public class TrieIterator implements RangeValueIterator
private static final int DATA_BLOCK_LENGTH_ =
1 << Trie.INDEX_STAGE_1_SHIFT_;
/**
* Number of codepoints in a stage 2 block
*/
private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
DATA_BLOCK_LENGTH_ << 10;
/**
* Trie instance
*/
private Trie m_trie_;
......@@ -560,10 +545,4 @@ public class TrieIterator implements RangeValueIterator
private int m_nextBlock_;
private int m_nextBlockIndex_;
private int m_nextTrailIndexOffset_;
/**
* This is the return result element
*/
private int m_start_;
private int m_limit_;
private int m_value_;
}
/*
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Sun designates this
* particular file as subject to the "Classpath" exception as provided
* by Sun in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
* file name: UBiDiProps.java
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2005jan16
* created by: Markus W. Scherer
*
* Low-level Unicode bidi/shaping properties access.
* Java port of ubidi_props.h/.c.
*/
package sun.text.normalizer;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
public final class UBiDiProps {
    // constructors etc. --------------------------------------------------- ***

    /**
     * Loads the properties from the data file named by DATA_FILE_NAME.
     * Port of ubidi_openProps().
     * @exception IOException thrown when reading or validating the data fails
     */
    public UBiDiProps() throws IOException{
        InputStream is=ICUData.getStream(DATA_FILE_NAME);
        BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */);
        try {
            readData(b);
        } finally {
            // closing the buffered stream also closes the wrapped stream;
            // doing it here releases it even when readData() throws
            b.close();
        }
    }

    /**
     * Parses the data file: header, indexes[], trie, mirrors[] and jgArray[],
     * in that order.
     * @param is stream positioned at the start of the data file
     * @exception IOException thrown when the index table is too short or
     *            when reading fails
     */
    private void readData(InputStream is) throws IOException {
        DataInputStream inputStream=new DataInputStream(is);

        // read the header
        ICUBinary.readHeader(inputStream, FMT, new IsAcceptable());

        // read indexes[]
        int i, count;
        count=inputStream.readInt();
        // indexes[0] is the number of index entries; every IX_* slot used
        // below must exist, so require the full table of IX_TOP entries
        if(count<IX_TOP) {
            throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
        }
        indexes=new int[count];

        indexes[IX_INDEX_TOP]=count;
        for(i=1; i<count; ++i) {
            indexes[i]=inputStream.readInt();
        }

        // read the trie
        trie=new CharTrie(inputStream, null);

        // read mirrors[]; the field stays null when the length is not positive
        count=indexes[IX_MIRROR_LENGTH];
        if(count>0) {
            mirrors=new int[count];
            for(i=0; i<count; ++i) {
                mirrors[i]=inputStream.readInt();
            }
        }

        // read jgArray[]
        count=indexes[IX_JG_LIMIT]-indexes[IX_JG_START];
        jgArray=new byte[count];
        inputStream.readFully(jgArray);
    }

    // implement ICUBinary.Authenticate
    private final class IsAcceptable implements ICUBinary.Authenticate {
        /**
         * Accepts data whose major version is 1 and whose trie shift values
         * match the constants compiled into Trie.
         */
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0]==1 &&
                   version[2]==Trie.INDEX_STAGE_1_SHIFT_ && version[3]==Trie.INDEX_STAGE_2_SHIFT_;
        }
    }

    // UBiDiProps singleton
    private static UBiDiProps gBdp=null;

    /**
     * Returns the shared instance, creating it on first use.
     * Port of ubidi_getSingleton().
     * @exception IOException thrown when the data cannot be loaded
     */
    public static final synchronized UBiDiProps getSingleton() throws IOException {
        if(gBdp==null) {
            gBdp=new UBiDiProps();
        }
        return gBdp;
    }

    // UBiDiProps dummy singleton
    private static UBiDiProps gBdpDummy=null;

    private UBiDiProps(boolean makeDummy) { // ignore makeDummy, only creates a unique signature
        indexes=new int[IX_TOP];
        indexes[IX_INDEX_TOP]=IX_TOP;
        trie=new CharTrie(0, 0, null); // dummy trie, always returns 0
    }

    /**
     * Get a singleton dummy object, one that works with no real data.
     * This can be used when the real data is not available.
     * Using the dummy can reduce checks for available data after an initial failure.
     * Port of ucase_getDummy().
     */
    public static final synchronized UBiDiProps getDummy() {
        if(gBdpDummy==null) {
            gBdpDummy=new UBiDiProps(true);
        }
        return gBdpDummy;
    }

    /**
     * Looks up the trie value for a code point and returns its class bits
     * (the bits selected by CLASS_MASK).
     * @param c code point
     * @return class value extracted from the properties word
     */
    public final int getClass(int c) {
        return getClassFromProps(trie.getCodePointValue(c));
    }

    // data members -------------------------------------------------------- ***
    private int indexes[];
    private int mirrors[];
    private byte jgArray[];

    private CharTrie trie;

    // data format constants ----------------------------------------------- ***
    private static final String DATA_FILE_NAME = "/sun/text/resources/ubidi.icu";

    /* format "BiDi" */
    private static final byte FMT[]={ 0x42, 0x69, 0x44, 0x69 };

    /* indexes into indexes[] */
    private static final int IX_INDEX_TOP=0;    // slot holding the number of index entries
    private static final int IX_MIRROR_LENGTH=3;

    private static final int IX_JG_START=4;
    private static final int IX_JG_LIMIT=5;

    private static final int IX_TOP=16;         // number of entries in a full index table

    private static final int CLASS_MASK=    0x0000001f;

    /** Extracts the class bits from a properties word. */
    private static final int getClassFromProps(int props) {
        return props&CLASS_MASK;
    }
}
/*
* Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -37,8 +36,8 @@
package sun.text.normalizer;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
/**
......@@ -50,254 +49,13 @@ import java.io.IOException;
* </p>
* <p>uprops.icu which is in big-endian format is jared together with this
* package.</p>
*
* Unicode character properties file format see
* (ICU4C)/source/tools/genprops/store.c
*
* @author Syn Wee Quek
* @since release 2.1, February 1st 2002
* @draft 2.1
*/
/* Unicode character properties file format ------------------------------------
The file format prepared and written here contains several data
structures that store indexes or data.
The following is a description of format version 3 .
Data contents:
The contents is a parsed, binary form of several Unicode character
database files, most prominently UnicodeData.txt.
Any Unicode code point from 0 to 0x10ffff can be looked up to get
the properties, if any, for that code point. This means that the input
to the lookup are 21-bit unsigned integers, with not all of the
21-bit range used.
It is assumed that client code keeps a uint32_t pointer
to the beginning of the data:
const uint32_t *p32;
Formally, the file contains the following structures:
const int32_t indexes[16] with values i0..i15:
i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
i7..i9 reservedIndexes; -- reserved values; 0 for now
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (format version 3.1+)
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (format version 3.2)
i12..i15 reservedIndexes; -- reserved values; 0 for now
PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
P const uint32_t props32[i1-i0];
E const uint32_t exceptions[i2-i1];
U const UChar uchars[2*(i3-i2)];
AT serialized trie for additional properties (byte size: 4*(i4-i3))
PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
Trie lookup and properties:
In order to condense the data for the 21-bit code space, several properties of
the Unicode code assignment are exploited:
- The code space is sparse.
- There are several 10k of consecutive codes with the same properties.
- Characters and scripts are allocated in groups of 16 code points.
- Inside blocks for scripts the properties are often repetitive.
- The 21-bit space is not fully used for Unicode.
The lookup of properties for a given code point is done with a trie lookup,
using the UTrie implementation.
The trie lookup result is a 16-bit index in the props32[] table where the
actual 32-bit properties word is stored. This is done to save space.
(There are thousands of 16-bit entries in the trie data table, but
only a few hundred unique 32-bit properties words.
If the trie data table contained 32-bit words directly, then that would be
larger because the length of the table would be the same as now but the
width would be 32 bits instead of 16. This saves more than 10kB.)
With a given Unicode code point
UChar32 c;
and 0<=c<0x110000, the lookup is done like this:
uint16_t i;
UTRIE_GET16(c, i);
uint32_t props=p32[i];
For some characters, not all of the properties can be efficiently encoded
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
array:
if(props&EXCEPTION_BIT)) {
uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
...
}
The exception values are a variable number of uint32_t starting at
const uint32_t *pe=p32+exceptionsIndex+e;
The first uint32_t there contains flags about what values actually follow it.
Some of the exception values are UChar32 code points for the case mappings,
others are numeric values etc.
32-bit properties sets:
Each 32-bit properties word contains:
0.. 4 general category
5 has exception values
6..10 BiDi category
11 is mirrored
12..14 numericType:
0 no numeric value
1 decimal digit value
2 digit value
3 numeric value
### TODO: type 4 for Han digits & numbers?!
15..19 reserved
20..31 value according to bits 0..5:
if(has exception) {
exception index;
} else switch(general category) {
case Ll: delta to uppercase; -- same as titlecase
case Lu: -delta to lowercase; -- titlecase is same as c
case Lt: -delta to lowercase; -- uppercase is same as c
default:
if(is mirrored) {
delta to mirror;
} else if(numericType!=0) {
numericValue;
} else {
0;
};
}
Exception values:
In the first uint32_t exception word for a code point,
bits
31..16 reserved
15..0 flags that indicate which values follow:
bit
0 has uppercase mapping
1 has lowercase mapping
2 has titlecase mapping
3 unused
4 has numeric value (numerator)
if numericValue=0x7fffff00+x then numericValue=10^x
5 has denominator value
6 has a mirror-image Unicode code point
7 has SpecialCasing.txt entries
8 has CaseFolding.txt entries
According to the flags in this word, one or more uint32_t words follow it
in the sequence of the bit flags in the flags word; if a flag is not set,
then the value is missing or 0:
For the case mappings and the mirror-image Unicode code point,
one uint32_t or UChar32 each is the code point.
If the titlecase mapping is missing, then it is the same as the uppercase mapping.
For the digit values, bits 31..16 contain the decimal digit value, and
bits 15..0 contain the digit value. A value of -1 indicates that
this value is missing.
For the numeric/numerator value, an int32_t word contains the value directly,
except for when there is no numerator but a denominator, then the numerator
is implicitly 1. This means:
numerator denominator result
none none none
x none x
none y 1/y
x y x/y
If the numerator value is 0x7fffff00+x then it is replaced with 10^x.
For the denominator value, a uint32_t word contains the value directly.
For special casing mappings, the 32-bit exception word contains:
31 if set, this character has complex, conditional mappings
that are not stored;
otherwise, the mappings are stored according to the following bits
30..24 number of UChars used for mappings
23..16 reserved
15.. 0 UChar offset from the beginning of the UChars array where the
UChars for the special case mappings are stored in the following format:
Format of special casing UChars:
One UChar value with lengths as follows:
14..10 number of UChars for titlecase mapping
9.. 5 number of UChars for uppercase mapping
4.. 0 number of UChars for lowercase mapping
Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
For case folding mappings, the 32-bit exception word contains:
31..24 number of UChars used for the full mapping
23..16 reserved
15.. 0 UChar offset from the beginning of the UChars array where the
UChars for the special case mappings are stored in the following format:
Format of case folding UChars:
Two UChars contain the simple mapping as follows:
0, 0 no simple mapping
BMP,0 a simple mapping to a BMP code point
s1, s2 a simple mapping to a supplementary code point stored as two surrogates
This is followed by the UChars for the full case folding mappings.
Example:
U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
mapping and a numeric value.
Its exception values would be stored as 3 uint32_t words:
- flags=0x0a (see above) with combining class 0
- lowercase mapping 0x2170
- numeric value=1
--- Additional properties (new in format version 2.1) ---
The second trie for additional properties (AT) is also a UTrie with 16-bit data.
The data words consist of 32-bit unit indexes (not row indexes!) into the
table of unique properties vectors (PV).
Each vector contains a set of properties.
The width of a vector (number of uint32_t per row) may change
with the formatVersion, it is stored in i5.
Current properties: see icu/source/common/uprops.h
--- Changes in format version 3.1 ---
See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
--- Changes in format version 3.2 ---
- The tries use linear Latin-1 ranges.
- The additional properties bits store full properties XYZ instead
of partial Other_XYZ, so that changes in the derivation formulas
need not be tracked in runtime library code.
- Joining Type and Line Break are also stored completely, so that uprops.c
needs no runtime formulas for enumerated properties either.
- Store the case-sensitive flag in the main properties word.
- i10 also contains U_LB_COUNT and U_EA_COUNT.
- i11 contains maxValues2 for vector word 2.
----------------------------------------------------------------------------- */
final class UCharacterPropertyReader implements ICUBinary.Authenticate
{
// public methods ----------------------------------------------------
......@@ -315,7 +73,6 @@ final class UCharacterPropertyReader implements ICUBinary.Authenticate
* <p>Protected constructor.</p>
* @param inputStream ICU uprop.dat file input stream
* @exception IOException throw if data file fails authentication
* @draft 2.1
*/
protected UCharacterPropertyReader(InputStream inputStream)
throws IOException
......@@ -331,8 +88,7 @@ final class UCharacterPropertyReader implements ICUBinary.Authenticate
* <p>Reads uprops.icu, parse it into blocks of data to be stored in
* UCharacterProperty.</p>
* @param ucharppty UCharacterProperty instance
* @exception thrown when data reading fails
* @draft 2.1
* @exception IOException thrown when data reading fails
*/
protected void read(UCharacterProperty ucharppty) throws IOException
{
......@@ -362,38 +118,30 @@ final class UCharacterPropertyReader implements ICUBinary.Authenticate
// read the trie index block
// m_props_index_ in terms of ints
ucharppty.m_trie_ = new CharTrie(m_dataInputStream_, ucharppty);
ucharppty.m_trie_ = new CharTrie(m_dataInputStream_, null);
// reads the 32 bit properties block
// skip the 32 bit properties block
int size = m_exceptionOffset_ - m_propertyOffset_;
ucharppty.m_property_ = new int[size];
for (int i = 0; i < size; i ++) {
ucharppty.m_property_[i] = m_dataInputStream_.readInt();
}
m_dataInputStream_.skipBytes(size * 4);
// reads the 32 bit exceptions block
size = m_caseOffset_ - m_exceptionOffset_;
ucharppty.m_exception_ = new int[size];
for (int i = 0; i < size; i ++) {
ucharppty.m_exception_[i] = m_dataInputStream_.readInt();
}
m_dataInputStream_.skipBytes(size * 4);
// reads the 32 bit case block
size = (m_additionalOffset_ - m_caseOffset_) << 1;
ucharppty.m_case_ = new char[size];
for (int i = 0; i < size; i ++) {
ucharppty.m_case_[i] = m_dataInputStream_.readChar();
}
// reads the additional property block
ucharppty.m_additionalTrie_ = new CharTrie(m_dataInputStream_,
ucharppty);
// additional properties
size = m_reservedOffset_ - m_additionalVectorsOffset_;
ucharppty.m_additionalVectors_ = new int[size];
for (int i = 0; i < size; i ++) {
ucharppty.m_additionalVectors_[i] = m_dataInputStream_.readInt();
m_dataInputStream_.skipBytes(size * 2);
if(m_additionalColumnsCount_ > 0) {
// reads the additional property block
ucharppty.m_additionalTrie_ = new CharTrie(m_dataInputStream_, null);
// additional properties
size = m_reservedOffset_ - m_additionalVectorsOffset_;
ucharppty.m_additionalVectors_ = new int[size];
for (int i = 0; i < size; i ++) {
ucharppty.m_additionalVectors_[i] = m_dataInputStream_.readInt();
}
}
m_dataInputStream_.close();
......@@ -428,12 +176,15 @@ final class UCharacterPropertyReader implements ICUBinary.Authenticate
private byte m_unicodeVersion_[];
/**
* File format version that this class understands.
* No guarantees are made if a older version is used
* Data format "UPro".
*/
private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x50,
(byte)0x72, (byte)0x6F};
private static final byte DATA_FORMAT_VERSION_[] = {(byte)0x3, (byte)0x1,
/**
* Format version; this code works with all versions with the same major
* version number and the same Trie bit distribution.
*/
private static final byte DATA_FORMAT_VERSION_[] = {(byte)0x5, (byte)0,
(byte)Trie.INDEX_STAGE_1_SHIFT_,
(byte)Trie.INDEX_STAGE_2_SHIFT_};
}
/*
* Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Sun designates this
* particular file as subject to the "Classpath" exception as provided
* by Sun in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <p>Selection constants for Unicode properties. </p>
* <p>These constants are used in functions like
* UCharacter.hasBinaryProperty(int) to select one of the Unicode properties.
* </p>
* <p>The properties APIs are intended to reflect Unicode properties as
* defined in the Unicode Character Database (UCD) and Unicode Technical
* Reports (UTR).</p>
* <p>For details about the properties see <a href=http://www.unicode.org>
* http://www.unicode.org</a>.</p>
* <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
* </p>
* <p>Important: If ICU is built with UCD files from Unicode versions below
* 3.2, then properties marked with "new" are not or not fully
* available. Check UCharacter.getUnicodeVersion() to be sure.</p>
* @author Syn Wee Quek
* @stable ICU 2.6
* @see com.ibm.icu.lang.UCharacter
*/
public interface UProperty
{
    // public data member --------------------------------------------------

    /**
     * Selector for the enumerated property Hangul_Syllable_Type
     * (new in Unicode 4); the property yields HangulSyllableType values.
     * @stable ICU 2.6
     */
    int HANGUL_SYLLABLE_TYPE = 0x100B;

    /**
     * Selector for the bitmask property General_Category_Mask, i.e. the
     * General_Category property expressed as a bit mask.
     * With UCharacter.getIntPropertyValue(c) the result is a
     * UCharacterCategory mask with exactly one bit set.
     * With UCharacter.getPropertyValueName() and
     * UCharacter.getPropertyValueEnum() a multi-bit mask stands for a set
     * of categories such as "Letters".
     * @stable ICU 2.4
     */
    int GENERAL_CATEGORY_MASK = 0x2000;
}
/*
* Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -184,15 +183,16 @@ public final class UTF16
* bounds.
* @stable ICU 2.1
*/
public static int charAt(String source, int offset16)
{
if (offset16 < 0 || offset16 >= source.length()) {
throw new StringIndexOutOfBoundsException(offset16);
public static int charAt(String source, int offset16) {
char single = source.charAt(offset16);
if (single < LEAD_SURROGATE_MIN_VALUE) {
return single;
}
return _charAt(source, offset16, single);
}
char single = source.charAt(offset16);
if (single < LEAD_SURROGATE_MIN_VALUE ||
single > TRAIL_SURROGATE_MAX_VALUE) {
private static int _charAt(String source, int offset16, char single) {
if (single > TRAIL_SURROGATE_MAX_VALUE) {
return single;
}
......@@ -201,29 +201,23 @@ public final class UTF16
// low, look both directions.
if (single <= LEAD_SURROGATE_MAX_VALUE) {
++ offset16;
++offset16;
if (source.length() != offset16) {
char trail = source.charAt(offset16);
if (trail >= TRAIL_SURROGATE_MIN_VALUE &&
trail <= TRAIL_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(single,
trail);
if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(single, trail);
}
}
}
else
{
-- offset16;
if (offset16 >= 0) {
// single is a trail surrogate so
char lead = source.charAt(offset16);
if (lead >= LEAD_SURROGATE_MIN_VALUE &&
lead <= LEAD_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(lead,
single);
}
} else {
--offset16;
if (offset16 >= 0) {
// single is a trail surrogate so
char lead = source.charAt(offset16);
if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(lead, single);
}
}
}
return single; // return unmatched surrogate
}
......
/*
* Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -38,11 +37,8 @@
package sun.text.normalizer;
import java.text.ParsePosition;
import java.util.Map;
import java.util.HashMap;
import java.util.TreeSet;
import java.util.Iterator;
import java.util.Collection;
import java.util.TreeSet;
/**
* A mutable set of Unicode characters and multicharacter strings. Objects of this class
......@@ -130,8 +126,8 @@ import java.util.Collection;
* "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
* complete list of supported property patterns, see the User's Guide
* for UnicodeSet at
* <a href="http://oss.software.ibm.com/icu/userguide/unicodeSet.html">
* http://oss.software.ibm.com/icu/userguide/unicodeSet.html</a>.
* <a href="http://www.icu-project.org/userguide/unicodeSet.html">
* http://www.icu-project.org/userguide/unicodeSet.html</a>.
* Actual determination of property data is defined by the underlying
* Unicode database as implemented by UCharacter.
*
......@@ -271,9 +267,11 @@ import java.util.Collection;
* </tr>
* </table>
* </blockquote>
* <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
*
* @author Alan Liu
* @stable ICU 2.0
* @see UnicodeSetIterator
*/
public class UnicodeSet implements UnicodeMatcher {
......@@ -322,7 +320,7 @@ public class UnicodeSet implements UnicodeMatcher {
* properties are all exactly alike, e.g. CJK Ideographs from
* U+4E00 to U+9FA5.
*/
private static UnicodeSet INCLUSIONS = null;
private static UnicodeSet INCLUSIONS[] = null;
//----------------------------------------------------------------
// Public API
......@@ -471,17 +469,18 @@ public class UnicodeSet implements UnicodeMatcher {
return result;
}
return _generatePattern(result, escapeUnprintable);
return _generatePattern(result, escapeUnprintable, true);
}
/**
* Generate and append a string representation of this set to result.
* This does not use this.pat, the cleaned up copy of the string
* passed to applyPattern().
* @stable ICU 2.0
* @param includeStrings if false, doesn't include the strings.
* @stable ICU 3.8
*/
public StringBuffer _generatePattern(StringBuffer result,
boolean escapeUnprintable) {
boolean escapeUnprintable, boolean includeStrings) {
result.append('[');
int count = getRangeCount();
......@@ -524,7 +523,7 @@ public class UnicodeSet implements UnicodeMatcher {
}
}
if (strings.size() > 0) {
if (includeStrings && strings.size() > 0) {
Iterator it = strings.iterator();
while (it.hasNext()) {
result.append('{');
......@@ -535,19 +534,8 @@ public class UnicodeSet implements UnicodeMatcher {
return result.append(']');
}
/**
* Adds the specified range to this set if it is not already
* present. If this set already contains the specified range,
 * the call leaves this set unchanged.  If <code>start &gt; end</code>
 * then an empty range is added, leaving the set unchanged.
*
* @param start first character, inclusive, of range to be added
* to this set.
* @param end last character, inclusive, of range to be added
* to this set.
* @stable ICU 2.0
*/
public UnicodeSet add(int start, int end) {
// for internal use, after checkFrozen has been called
private UnicodeSet add_unchecked(int start, int end) {
if (start < MIN_VALUE || start > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
}
......@@ -569,6 +557,11 @@ public class UnicodeSet implements UnicodeMatcher {
* @stable ICU 2.0
*/
public final UnicodeSet add(int c) {
    // Delegate to the private worker, which rejects any c outside
    // [MIN_VALUE, MAX_VALUE] with an IllegalArgumentException.
    return add_unchecked(c);
}
// for internal use only, after checkFrozen has been called
private final UnicodeSet add_unchecked(int c) {
if (c < MIN_VALUE || c > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
}
......@@ -663,13 +656,12 @@ public class UnicodeSet implements UnicodeMatcher {
* @stable ICU 2.0
*/
public final UnicodeSet add(String s) {
int cp = getSingleCP(s);
if (cp < 0) {
strings.add(s);
pat = null;
} else {
add(cp, cp);
add_unchecked(cp, cp);
}
return this;
}
......@@ -981,7 +973,6 @@ public class UnicodeSet implements UnicodeMatcher {
*/
void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
StringBuffer rebuiltPat, int options) {
// Syntax characters: [ ] ^ - & { }
// Recognized special forms for chars, sets: c-c s-s s&s
......@@ -992,7 +983,7 @@ public class UnicodeSet implements UnicodeMatcher {
opts |= RuleCharacterIterator.SKIP_WHITESPACE;
}
StringBuffer pat = new StringBuffer(), buf = null;
StringBuffer patBuf = new StringBuffer(), buf = null;
boolean usePat = false;
UnicodeSet scratch = null;
Object backup = null;
......@@ -1049,13 +1040,13 @@ public class UnicodeSet implements UnicodeMatcher {
} else {
// Handle opening '[' delimiter
mode = 1;
pat.append('[');
patBuf.append('[');
backup = chars.getPos(backup); // prepare to backup
c = chars.next(opts);
literal = chars.isEscaped();
if (c == '^' && !literal) {
invert = true;
pat.append('^');
patBuf.append('^');
backup = chars.getPos(backup); // prepare to backup
c = chars.next(opts);
literal = chars.isEscaped();
......@@ -1093,13 +1084,13 @@ public class UnicodeSet implements UnicodeMatcher {
if (op != 0) {
syntaxError(chars, "Char expected after operator");
}
add(lastChar, lastChar);
_appendToPat(pat, lastChar, false);
add_unchecked(lastChar, lastChar);
_appendToPat(patBuf, lastChar, false);
lastItem = op = 0;
}
if (op == '-' || op == '&') {
pat.append(op);
patBuf.append(op);
}
if (nested == null) {
......@@ -1108,14 +1099,14 @@ public class UnicodeSet implements UnicodeMatcher {
}
switch (setMode) {
case 1:
nested.applyPattern(chars, symbols, pat, options);
nested.applyPattern(chars, symbols, patBuf, options);
break;
case 2:
chars.skipIgnored(opts);
nested.applyPropertyPattern(chars, pat, symbols);
nested.applyPropertyPattern(chars, patBuf, symbols);
break;
case 3: // `nested' already parsed
nested._toPattern(pat, false);
nested._toPattern(patBuf, false);
break;
}
......@@ -1158,17 +1149,17 @@ public class UnicodeSet implements UnicodeMatcher {
switch (c) {
case ']':
if (lastItem == 1) {
add(lastChar, lastChar);
_appendToPat(pat, lastChar, false);
add_unchecked(lastChar, lastChar);
_appendToPat(patBuf, lastChar, false);
}
// Treat final trailing '-' as a literal
if (op == '-') {
add(op, op);
pat.append(op);
add_unchecked(op, op);
patBuf.append(op);
} else if (op == '&') {
syntaxError(chars, "Trailing '&'");
}
pat.append(']');
patBuf.append(']');
mode = 2;
continue;
case '-':
......@@ -1178,11 +1169,11 @@ public class UnicodeSet implements UnicodeMatcher {
continue;
} else {
// Treat final trailing '-' as a literal
add(c, c);
add_unchecked(c, c);
c = chars.next(opts);
literal = chars.isEscaped();
if (c == ']' && !literal) {
pat.append("-]");
patBuf.append("-]");
mode = 2;
continue;
}
......@@ -1202,8 +1193,8 @@ public class UnicodeSet implements UnicodeMatcher {
syntaxError(chars, "Missing operand after operator");
}
if (lastItem == 1) {
add(lastChar, lastChar);
_appendToPat(pat, lastChar, false);
add_unchecked(lastChar, lastChar);
_appendToPat(patBuf, lastChar, false);
}
lastItem = 0;
if (buf == null) {
......@@ -1228,9 +1219,9 @@ public class UnicodeSet implements UnicodeMatcher {
// we don't need to drop through to the further
// processing
add(buf.toString());
pat.append('{');
_appendToPat(pat, buf.toString(), false);
pat.append('}');
patBuf.append('{');
_appendToPat(patBuf, buf.toString(), false);
patBuf.append('}');
continue;
case SymbolTable.SYMBOL_REF:
// symbols nosymbols
......@@ -1250,12 +1241,12 @@ public class UnicodeSet implements UnicodeMatcher {
}
if (anchor && op == 0) {
if (lastItem == 1) {
add(lastChar, lastChar);
_appendToPat(pat, lastChar, false);
add_unchecked(lastChar, lastChar);
_appendToPat(patBuf, lastChar, false);
}
add(UnicodeMatcher.ETHER);
add_unchecked(UnicodeMatcher.ETHER);
usePat = true;
pat.append(SymbolTable.SYMBOL_REF).append(']');
patBuf.append(SymbolTable.SYMBOL_REF).append(']');
mode = 2;
continue;
}
......@@ -1281,14 +1272,14 @@ public class UnicodeSet implements UnicodeMatcher {
// these are most likely typos.
syntaxError(chars, "Invalid range");
}
add(lastChar, c);
_appendToPat(pat, lastChar, false);
pat.append(op);
_appendToPat(pat, c, false);
add_unchecked(lastChar, c);
_appendToPat(patBuf, lastChar, false);
patBuf.append(op);
_appendToPat(patBuf, c, false);
lastItem = op = 0;
} else {
add(lastChar, lastChar);
_appendToPat(pat, lastChar, false);
add_unchecked(lastChar, lastChar);
_appendToPat(patBuf, lastChar, false);
lastChar = c;
}
break;
......@@ -1315,9 +1306,9 @@ public class UnicodeSet implements UnicodeMatcher {
// Use the rebuilt pattern (pat) only if necessary. Prefer the
// generated pattern.
if (usePat) {
rebuiltPat.append(pat.toString());
rebuiltPat.append(patBuf.toString());
} else {
_generatePattern(rebuiltPat, false);
_generatePattern(rebuiltPat, false, true);
}
}
......@@ -1590,7 +1581,9 @@ public class UnicodeSet implements UnicodeMatcher {
private static class VersionFilter implements Filter {
VersionInfo version;
VersionFilter(VersionInfo version) { this.version = version; }
public boolean contains(int ch) {
VersionInfo v = UCharacter.getAge(ch);
// Reference comparison ok; VersionInfo caches and reuses
......@@ -1600,18 +1593,28 @@ public class UnicodeSet implements UnicodeMatcher {
}
}
private static synchronized UnicodeSet getInclusions() {
private static synchronized UnicodeSet getInclusions(int src) {
if (INCLUSIONS == null) {
UCharacterProperty property = UCharacterProperty.getInstance();
INCLUSIONS = property.getInclusions();
INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
}
if(INCLUSIONS[src] == null) {
UnicodeSet incl = new UnicodeSet();
switch(src) {
case UCharacterProperty.SRC_PROPSVEC:
UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
break;
default:
throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
}
INCLUSIONS[src] = incl;
}
return INCLUSIONS;
return INCLUSIONS[src];
}
/**
* Generic filter-based scanning code for UCD property UnicodeSets.
*/
private UnicodeSet applyFilter(Filter filter) {
private UnicodeSet applyFilter(Filter filter, int src) {
// Walk through all Unicode characters, noting the start
// and end of each range for which filter.contain(c) is
// true. Add each range to a set.
......@@ -1629,7 +1632,7 @@ public class UnicodeSet implements UnicodeMatcher {
clear();
int startHasProperty = -1;
UnicodeSet inclusions = getInclusions();
UnicodeSet inclusions = getInclusions(src);
int limitRange = inclusions.getRangeCount();
for (int j=0; j<limitRange; ++j) {
......@@ -1646,19 +1649,18 @@ public class UnicodeSet implements UnicodeMatcher {
startHasProperty = ch;
}
} else if (startHasProperty >= 0) {
add(startHasProperty, ch-1);
add_unchecked(startHasProperty, ch-1);
startHasProperty = -1;
}
}
}
if (startHasProperty >= 0) {
add(startHasProperty, 0x10FFFF);
add_unchecked(startHasProperty, 0x10FFFF);
}
return this;
}
/**
* Remove leading and trailing rule white space and compress
* internal rule white space to a single space character.
......@@ -1686,10 +1688,6 @@ public class UnicodeSet implements UnicodeMatcher {
return buf.toString();
}
//----------------------------------------------------------------
// Property set API
//----------------------------------------------------------------
/**
* Modifies this set to contain those code points which have the
* given value for the given property. Prior contents of this
......@@ -1699,22 +1697,21 @@ public class UnicodeSet implements UnicodeMatcher {
* @param symbols if not null, then symbols are first called to see if a property
* is available. If true, then everything else is skipped.
* @return this set
* @draft ICU 3.2
* @deprecated This is a draft API and might change in a future release of ICU.
* @stable ICU 3.2
*/
public UnicodeSet applyPropertyAlias(String propertyAlias,
String valueAlias, SymbolTable symbols) {
if (propertyAlias.equals("Age"))
{
// Must munge name, since
// VersionInfo.getInstance() does not do
// 'loose' matching.
VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
applyFilter(new VersionFilter(version));
return this;
}
else
throw new IllegalArgumentException("Unsupported property");
if (valueAlias.length() > 0) {
if (propertyAlias.equals("Age")) {
// Must munge name, since
// VersionInfo.getInstance() does not do
// 'loose' matching.
VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
return this;
}
}
throw new IllegalArgumentException("Unsupported property: " + propertyAlias);
}
/**
......@@ -1840,14 +1837,14 @@ public class UnicodeSet implements UnicodeMatcher {
*/
private void applyPropertyPattern(RuleCharacterIterator chars,
StringBuffer rebuiltPat, SymbolTable symbols) {
String pat = chars.lookahead();
String patStr = chars.lookahead();
ParsePosition pos = new ParsePosition(0);
applyPropertyPattern(pat, pos, symbols);
applyPropertyPattern(patStr, pos, symbols);
if (pos.getIndex() == 0) {
syntaxError(chars, "Invalid property pattern");
}
chars.jumpahead(pos.getIndex());
rebuiltPat.append(pat.substring(0, pos.getIndex()));
rebuiltPat.append(patStr.substring(0, pos.getIndex()));
}
//----------------------------------------------------------------
......@@ -1860,8 +1857,9 @@ public class UnicodeSet implements UnicodeMatcher {
* which UCharacterProperty.isRuleWhiteSpace() returns true,
* unless they are quoted or escaped. This may be ORed together
* with other selectors.
* @internal
* @stable ICU 3.8
*/
public static final int IGNORE_SPACE = 1;
}
/*
* Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -22,10 +22,9 @@
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -167,8 +166,8 @@ public class UnicodeSetIterator {
* @param set the set to iterate over.
* @stable ICU 2.0
*/
public void reset(UnicodeSet set) {
this.set = set;
public void reset(UnicodeSet uset) {
set = uset;
reset();
}
......@@ -213,8 +212,8 @@ public class UnicodeSetIterator {
/**
* @internal
*/
protected void loadRange(int range) {
nextElement = set.getRangeStart(range);
endElement = set.getRangeEnd(range);
protected void loadRange(int aRange) {
nextElement = set.getRangeStart(aRange);
endElement = set.getRangeEnd(aRange);
}
}
/*
* Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -24,7 +24,7 @@
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......@@ -36,10 +36,27 @@
package sun.text.normalizer;
// This class contains utility functions so testing not needed
///CLOVER:OFF
public final class Utility {
/**
 * Tests whether a region of one char array equals a region of another.
 * Compares <code>len</code> chars of <code>source</code>, beginning at
 * <code>sourceStart</code>, with the chars of <code>target</code>
 * beginning at <code>targetStart</code>.
 * @param source the first char array
 * @param sourceStart index of the first char to compare in source
 * @param target the second char array
 * @param targetStart index of the first char to compare in target
 * @param len the length to compare.
 * The start indices and start+len must be valid.
 * @return true if every compared pair of chars is equal (also when no
 * chars are compared), false at the first mismatch
 */
public final static boolean arrayRegionMatches(char[] source, int sourceStart,
                                        char[] target, int targetStart,
                                        int len)
{
    final int sourceEnd = sourceStart + len;
    int srcPos = sourceStart;
    int tgtPos = targetStart;
    // Advance both cursors in lock step; source is read before target.
    while (srcPos < sourceEnd) {
        if (source[srcPos] != target[tgtPos]) {
            return false;
        }
        ++srcPos;
        ++tgtPos;
    }
    return true;
}
/**
* Convert characters outside the range U+0020 to U+007F to
* Unicode escapes, and convert backslash to a double backslash.
......@@ -344,7 +361,6 @@ public final class Utility {
return false;
}
//// for StringPrep
/**
* Similar to StringBuffer.getChars, version 1.3.
* Since JDK 1.2 implements StringBuffer.getChars differently, this method
......@@ -356,7 +372,6 @@ public final class Utility {
* @param dst char array to store the retrieved chars
* @param dstBegin offset to the start of the destination char array to
* store the retrieved chars
* @draft since ICU4J 2.0
*/
public static void getChars(StringBuffer src, int srcBegin, int srcEnd,
char dst[], int dstBegin)
......@@ -367,23 +382,4 @@ public final class Utility {
src.getChars(srcBegin, srcEnd, dst, dstBegin);
}
/**
 * Tests whether a region of one char array equals a region of another.
 * @param source the first char array
 * @param sourceStart index of the first char to compare in source
 * @param target the second char array
 * @param targetStart index of the first char to compare in target
 * @param len the length to compare.
 * The start indices and start+len must be valid.
 * @return true if every compared pair of chars is equal (also when no
 * chars are compared), false at the first mismatch
 */
public final static boolean arrayRegionMatches(char[] source, int sourceStart,
                                        char[] target, int targetStart,
                                        int len)
{
    final int sourceEnd = sourceStart + len;
    int srcPos = sourceStart;
    int tgtPos = targetStart;
    // Advance both cursors in lock step; source is read before target.
    while (srcPos < sourceEnd) {
        if (source[srcPos] != target[tgtPos]) {
            return false;
        }
        ++srcPos;
        ++tgtPos;
    }
    return true;
}
}
///CLOVER:ON
/*
* Portions Copyright 2005-2008 Sun Microsystems, Inc. All Rights Reserved.
* Portions Copyright 2005-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
......@@ -24,7 +24,7 @@
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
......
......@@ -72,7 +72,7 @@ public class ToLowerCase {
// I-dot tests (Turkish and Azeri)
test("\u0130", turkish, "i");
test("\u0130", az, "i");
test("\u0130", Locale.US, "i");
test("\u0130", Locale.US, "i\u0307");
// Remove dot_above in the sequence I + dot_above (Turkish and Azeri)
test("I\u0307", turkish, "i");
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册