提交 2813e304 编写于 作者: B Behdad Esfahbod

[indic] Update data tables to Unicode 8.0

Test stats remain unchanged, except for Malayalam, which we investigate:

BENGALI: 353725 out of 354188 tests passed. 463 failed (0.130722%)
DEVANAGARI: 707307 out of 707394 tests passed. 87 failed (0.0122987%)
GUJARATI: 366349 out of 366457 tests passed. 108 failed (0.0294714%)
GURMUKHI: 60732 out of 60747 tests passed. 15 failed (0.0246926%)
KANNADA: 951190 out of 951913 tests passed. 723 failed (0.0759523%)
KHMER: 299070 out of 299124 tests passed. 54 failed (0.0180527%)
MALAYALAM: 1047584 out of 1048334 tests passed. 750 failed (0.0715421%)
ORIYA: 42320 out of 42329 tests passed. 9 failed (0.021262%)
SINHALA: 271662 out of 271847 tests passed. 185 failed (0.068053%)
TAMIL: 1091753 out of 1091754 tests passed. 1 failed (9.15957e-05%)
TELUGU: 970555 out of 970573 tests passed. 18 failed (0.00185457%)

Myanmar, compared to Windows 10 mmrtext.ttf:

MYANMAR: 1123865 out of 1123883 tests passed. 18 failed (0.00160159%)
上级 fc06cff4
......@@ -91,6 +91,7 @@ short = [{
"Visarga": 'Vs',
"Vowel": 'Vo',
"Vowel_Dependent": 'M',
"Consonant_Prefixed": 'CPrf',
"Other": 'x',
},{
"Not_Applicable": 'x',
......
......@@ -109,27 +109,31 @@ enum indic_syllabic_category_t {
INDIC_SYLLABIC_CATEGORY_AVAGRAHA = OT_Symbol,
INDIC_SYLLABIC_CATEGORY_BINDU = OT_SM,
INDIC_SYLLABIC_CATEGORY_BRAHMI_JOINING_NUMBER = OT_PLACEHOLDER, /* TODO */
INDIC_SYLLABIC_CATEGORY_BRAHMI_JOINING_NUMBER = OT_PLACEHOLDER, /* Don't care. */
INDIC_SYLLABIC_CATEGORY_CANTILLATION_MARK = OT_A,
INDIC_SYLLABIC_CATEGORY_CONSONANT = OT_C,
INDIC_SYLLABIC_CATEGORY_CONSONANT_DEAD = OT_C,
INDIC_SYLLABIC_CATEGORY_CONSONANT_FINAL = OT_CM,
INDIC_SYLLABIC_CATEGORY_CONSONANT_HEAD_LETTER = OT_C,
INDIC_SYLLABIC_CATEGORY_CONSONANT_KILLER = OT_M, /* U+17CD only. */
INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL = OT_CM,
INDIC_SYLLABIC_CATEGORY_CONSONANT_PLACEHOLDER = OT_PLACEHOLDER,
INDIC_SYLLABIC_CATEGORY_CONSONANT_PRECEDING_REPHA = OT_Repha,
INDIC_SYLLABIC_CATEGORY_CONSONANT_PREFIXED = OT_X, /* Don't care. */
INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED = OT_CM,
INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA = OT_N,
INDIC_SYLLABIC_CATEGORY_CONSONANT_WITH_STACKER = OT_Repha, /* TODO */
INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK = OT_SM,
INDIC_SYLLABIC_CATEGORY_INVISIBLE_STACKER = OT_H, /* TODO */
INDIC_SYLLABIC_CATEGORY_INVISIBLE_STACKER = OT_Coeng,
INDIC_SYLLABIC_CATEGORY_JOINER = OT_ZWJ,
INDIC_SYLLABIC_CATEGORY_MODIFYING_LETTER = OT_X,
INDIC_SYLLABIC_CATEGORY_NON_JOINER = OT_ZWNJ,
INDIC_SYLLABIC_CATEGORY_NUKTA = OT_N,
INDIC_SYLLABIC_CATEGORY_NUMBER = OT_PLACEHOLDER,
INDIC_SYLLABIC_CATEGORY_NUMBER_JOINER = OT_PLACEHOLDER, /* TODO */
INDIC_SYLLABIC_CATEGORY_PURE_KILLER = OT_H, /* TODO */
INDIC_SYLLABIC_CATEGORY_NUMBER_JOINER = OT_PLACEHOLDER, /* Don't care. */
INDIC_SYLLABIC_CATEGORY_PURE_KILLER = OT_M, /* Is like a vowel matra. */
INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER = OT_RS,
INDIC_SYLLABIC_CATEGORY_SYLLABLE_MODIFIER = OT_M, /* Misc Khmer signs. */
INDIC_SYLLABIC_CATEGORY_TONE_LETTER = OT_X,
INDIC_SYLLABIC_CATEGORY_TONE_MARK = OT_N,
INDIC_SYLLABIC_CATEGORY_VIRAMA = OT_H,
......@@ -162,17 +166,23 @@ enum indic_matra_category_t {
};
#define INDIC_COMBINE_CATEGORIES(S,M) \
(ASSERT_STATIC_EXPR_ZERO (M == INDIC_MATRA_CATEGORY_NOT_APPLICABLE || \
( \
S == INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL || \
S == INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK || \
S == INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER || \
S == INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA || \
S == INDIC_SYLLABIC_CATEGORY_VIRAMA || \
S == INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT || \
false)) + \
ASSERT_STATIC_EXPR_ZERO (S < 255 && M < 255) + \
((M << 8) | S))
( \
ASSERT_STATIC_EXPR_ZERO (S < 255 && M < 255) + \
( S | \
( \
( \
S == INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL || \
S == INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK || \
S == INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER || \
S == INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA || \
S == INDIC_SYLLABIC_CATEGORY_VIRAMA || \
S == INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT || \
false \
? M : INDIC_MATRA_CATEGORY_NOT_APPLICABLE \
) << 8 \
) \
) \
)
HB_INTERNAL INDIC_TABLE_ELEMENT_TYPE
hb_indic_get_categories (hb_codepoint_t u);
......
......@@ -176,24 +176,8 @@ set_indic_properties (hb_glyph_info_t &info)
* Re-assign category
*/
/* The spec says U+0952 is OT_A. However, testing shows that Uniscribe
* treats a whole bunch of characters similarly.
* TESTS: For example, for U+0951:
* U+092E,U+0947,U+0952
* U+092E,U+0952,U+0947
* U+092E,U+0947,U+0951
* U+092E,U+0951,U+0947
* U+092E,U+0951,U+0952
* U+092E,U+0952,U+0951
*/
if (unlikely (hb_in_ranges (u, 0x0951u, 0x0952u,
0x1CD0u, 0x1CD2u,
0x1CD4u, 0x1CE1u) ||
u == 0x1CF4u))
cat = OT_A;
/* The following act more like the Bindus. */
else if (unlikely (hb_in_range (u, 0x0953u, 0x0954u)))
if (unlikely (hb_in_range (u, 0x0953u, 0x0954u)))
cat = OT_SM;
/* The following act like consonants. */
else if (unlikely (hb_in_ranges (u, 0x0A72u, 0x0A73u,
......@@ -216,15 +200,12 @@ set_indic_properties (hb_glyph_info_t &info)
cat = OT_Symbol;
ASSERT_STATIC ((int) INDIC_SYLLABIC_CATEGORY_AVAGRAHA == OT_Symbol);
}
else if (unlikely (hb_in_range (u, 0x17CDu, 0x17D1u) ||
u == 0x17CBu || u == 0x17D3u || u == 0x17DDu)) /* Khmer Various signs */
else if (unlikely (u == 0x17DDu)) /* https://github.com/roozbehp/unicode-data/issues/2 */
{
/* These are like Top Matras. */
cat = OT_M;
pos = POS_ABOVE_C;
}
else if (unlikely (u == 0x17C6u)) cat = OT_N; /* Khmer Bindu doesn't like to be repositioned. */
else if (unlikely (u == 0x17D2u)) cat = OT_Coeng; /* Khmer coeng */
else if (unlikely (hb_in_range (u, 0x2010u, 0x2011u)))
cat = OT_PLACEHOLDER;
else if (unlikely (u == 0x25CCu)) cat = OT_DOTTEDCIRCLE;
......
......@@ -199,6 +199,10 @@ set_myanmar_properties (hb_glyph_info_t &info)
cat = (indic_category_t) OT_A;
break;
case 0x1039u:
cat = (indic_category_t) OT_H;
break;
case 0x103Au:
cat = (indic_category_t) OT_As;
break;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册