提交 b9ddbd55 编写于 作者: B Behdad Esfahbod

[Indic] Start an Indic shaper

Nothing functional in there yet.

So far, we're parsing the IndicSyllabicCategory.txt and IndicMatraCategory.txt
files from the Unicode Character Database and storing them in an array to be used
by the shaper.  Also hooked up the shaper, but it does not do anything
right now.
上级 697a65c5
......@@ -55,6 +55,8 @@ HBSOURCES += \
hb-ot-shape.cc \
hb-ot-shape-complex-arabic.cc \
hb-ot-shape-complex-arabic-table.hh \
hb-ot-shape-complex-indic.cc \
hb-ot-shape-complex-indic-table.hh \
hb-ot-shape-complex-private.hh \
hb-ot-shape-private.hh \
hb-ot-tag.cc \
......@@ -114,6 +116,7 @@ nodist_pkginclude_HEADERS = hb-version.h
GENERATORS = \
gen-arabic-table.py \
gen-indic-table.py \
$(NULL)
EXTRA_DIST += $(GENERATORS)
......
#!/usr/bin/python
# Generator script: reads the three Unicode Character Database files named on
# the command line and prints C source to standard output.
# NOTE(review): leading indentation was lost in this copy of the script; the
# nesting of the statements below must be restored before it can run.
import sys
# All three input files are required; otherwise print usage and exit with 1.
if len (sys.argv) < 4:
print >>sys.stderr, "usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt"
sys.exit (1)
# Open the inputs in command-line order; per the usage string, index 0 and 1
# are the two category files and index 2 is Blocks.txt.
files = [file (sys.argv[i+1]) for i in range (3)]
# First two lines of each file; they are echoed into the generated banner.
headers = [[f.readline () for i in range (2)] for f in files]
# blocks: second field -> (start, end) range, filled only from file index 2.
blocks = {}
# data[i]: code point -> second field of the matching line in file i.
data = [{} for f in files]
# values[i]: second field -> running count for file i.
values = [{} for f in files]
for i, f in enumerate (files):
for line in f:
# Drop everything from the first '#' onwards.
j = line.find ('#')
if j >= 0:
line = line[:j]
fields = [x.strip () for x in line.split (';')]
# A line with no ';' yields a single field (e.g. blank or comment-only
# lines) and carries no data.
if len (fields) == 1:
continue
# First field is a single hex code point or a "start..end" hex range.
uu = fields[0].split ('..')
start = int (uu[0], 16)
if len (uu) == 1:
end = start
else:
end = int (uu[1], 16)
t = fields[1]
for u in range (start, end + 1):
data[i][u] = t
# NOTE(review): with indentation lost it is unclear whether this increment
# runs once per code point or once per input line -- confirm.
values[i][t] = values[i].get (t, 0) + 1
# Only the third input contributes block ranges.
if i == 2:
blocks[t] = (start, end)
# Fold the three per-file maps into one dict: code point -> [value0, value1, value2].
defaults = ('Other', 'Not_Applicable', 'No_Block')
# Each default value is counted once so that it always shows up in `values`.
for idx in range (len (defaults)):
    fallback = defaults[idx]
    values[idx][fallback] = 1 + values[idx].get (fallback, 0)

merged = {}
for idx in range (len (data)):
    for cp, prop in data[idx].items ():
        if cp not in merged:
            if idx == 2:
                # An entry from the third file alone never creates a row.
                continue
            merged[cp] = list (defaults)
        merged[cp][idx] = prop
data = merged
del merged
num = len (data)

# Pull the outliers NO-BREAK SPACE and DOTTED CIRCLE out of the main table.
singles = {}
for outlier in (0x00A0, 0x25CC):
    singles[outlier] = data.pop (outlier)
# Banner comment of the generated file, including the saved header lines of
# every input file.
print "/* == Start of generated table == */"
print "/*"
print " * The following table is generated by running:"
print " *"
print " * ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt"
print " *"
print " * on files with these headers:"
print " *"
for h in headers:
for l in h:
print " * %s" % (l.strip())
print " */"
# Shorten values
print
# short[i]: property value -> short alias for file i, pre-seeded with
# hand-picked aliases; further entries are added by the loop below.
short = [{
"Bindu": 'Bi',
"Visarga": 'Vs',
"Vowel": 'Vo',
"Other": 'X',
},{
"Not_Applicable": 'X',
}]
# all_shorts[i]: aliases already taken for file i, used to detect clashes.
all_shorts = [[],[]]
# Add some of the values, to make them more readable, and to avoid duplicates
for i in range (2):
for v,s in short[i].items ():
all_shorts[i].append (s)
# Long and short C identifier prefixes for the two category kinds.
what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
# Emit one "#define <SHORT>_<alias> <LONG>_<VALUE>" per property value, in
# sorted order of the value names.
for i in range (2):
print
vv = values[i].keys ()
vv.sort ()
for v in vv:
v_no_and = v.replace ('_And_', '_')
if v in short[i]:
s = short[i][v]
else:
# Derived alias: the upper-case ASCII letters of the name once '_And_' has
# been collapsed to '_'.
s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')])
if s in all_shorts[i]:
raise Exception ("Duplicate short value alias", v, s)
all_shorts[i].append (s)
short[i][v] = s
# The third %s is padding computed from the name lengths; the trailing C
# comment shows the count stored in values[i][v] and the original name.
print "#define %s_%s %s_%s %s/* %3d chars; %s */" % \
(what_short[i], s, what[i], v.upper (), \
' '* ((48-1 - len (what[i]) - 1 - len (v)) / 8), \
values[i][v], v)
print
# Helper macro used by every table cell; it is #undef'ed at the end of the
# generated output.
print "#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)"
print
print
def print_block (block, start, end, data):
print
print
print " /* %s (%04X..%04X) */" % (block, start, end)
num = 0
for u in range (start, end+1):
if u % 8 == 0:
print
print " /* %04X */" % u,
if u in data:
num += 1
d = data.get (u, defaults)
print "%10s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])),
if num == 0:
# Filler block, don't check occupancy
return
total = end - start + 1
occupancy = num * 100. / total
# Maintain at least 30% occupancy in the table */
if occupancy < 30:
raise Exception ("Table too sparse, please investigate: ", occupancy, block)
# Walk the remaining code points in ascending order and emit them block by
# block as one or more C arrays.
# NOTE(review): indentation was lost in this copy; restore the nesting of the
# gap-handling branches below before running.
uu = data.keys ()
uu.sort ()
# last: highest code point already emitted (-1 before anything is printed).
last = -1
# num and total are assigned here but not read again in the rest of the script.
num = 0
total = 0
# tables: first code point of every array emitted; used afterwards to
# generate the lookup function.
tables = []
for u in uu:
# Code points inside a block that was already printed are skipped.
if u <= last:
continue
block = data[u][2]
(start, end) = blocks[block]
# The block does not directly follow the previous output.
if start != last + 1:
# Small gap (start - last <= 33): pad the current array with a FILLER block.
if start - last <= 33:
print_block ("FILLER", last+1, start-1, data)
last = start-1
else:
# Large gap: close the array in progress (if any) and open a new one named
# after its first code point.
if last >= 0:
print
print "};"
print
print "static const INDIC_TABLE_ELEMENT_TYPE indic_table_0x%04x[] =" % start
print "{",
tables.append (start)
print_block (block, start, end, data)
last = end
# Close the final array.
print
print "};"
print
print
print "static INDIC_TABLE_ELEMENT_TYPE"
print "get_indic_categories (hb_codepoint_t u)"
print "{"
for u in tables:
t = "indic_table_0x%04x" % u
print " if (0x%04X <= u && u <= 0x%04X + ARRAY_LENGTH (%s)) return %s[u - 0x%04X];" % (u, u, t, t, u)
for u,d in singles.items ():
print " if (unlikely (u == 0x%04X)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]])
print " return _(X,X);"
print "}"
print
print "#undef _"
for i in range (2):
print
vv = values[i].keys ()
vv.sort ()
for v in vv:
print "#undef %s_%s" % \
(what_short[i], short[i][v])
print
print
print "/* == End of generated table == */"
此差异已折叠。
/*
* Copyright © 2010 Google, Inc.
*
* This is part of HarfBuzz, a text shaping library.
*
* Permission is hereby granted, without written agreement and without
* license or royalty fees, to use, copy, modify, and distribute this
* software and its documentation for any purpose, provided that the
* above copyright notice and the following two paragraphs appear in
* all copies of this software.
*
* IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
* DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
* ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
* IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
* THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
* ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
* PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
*
* Google Author(s): Behdad Esfahbod
*/
#include "hb-ot-shape-complex-private.hh"
HB_BEGIN_DECLS
/* buffer var allocations */
#define indic_categories() var2.u32 /* per-glyph value of get_indic_categories(), stored by setup_masks */
/* Element type of the generated indic_table_* arrays and return type of get_indic_categories(). */
#define INDIC_TABLE_ELEMENT_TYPE uint8_t
/* Syllabic categories referenced by the generated table.  The table generator
 * emits "#define ISC_<alias> INDIC_SYLLABIC_CATEGORY_<VALUE>" for each value it
 * finds in its first input file, so every such value needs an enumerator
 * here.  Enumerators are listed alphabetically. */
enum indic_syllabic_category_t {
INDIC_SYLLABIC_CATEGORY_AVAGRAHA,
INDIC_SYLLABIC_CATEGORY_BINDU,
INDIC_SYLLABIC_CATEGORY_CONSONANT,
INDIC_SYLLABIC_CATEGORY_CONSONANT_DEAD,
INDIC_SYLLABIC_CATEGORY_CONSONANT_FINAL,
INDIC_SYLLABIC_CATEGORY_CONSONANT_HEAD_LETTER,
INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL,
INDIC_SYLLABIC_CATEGORY_CONSONANT_PLACEHOLDER,
INDIC_SYLLABIC_CATEGORY_CONSONANT_REPHA,
INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED,
INDIC_SYLLABIC_CATEGORY_MODIFYING_LETTER,
INDIC_SYLLABIC_CATEGORY_NUKTA,
INDIC_SYLLABIC_CATEGORY_OTHER, /* generator default when a code point has no entry */
INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER,
INDIC_SYLLABIC_CATEGORY_TONE_LETTER,
INDIC_SYLLABIC_CATEGORY_TONE_MARK,
INDIC_SYLLABIC_CATEGORY_VIRAMA,
INDIC_SYLLABIC_CATEGORY_VISARGA,
INDIC_SYLLABIC_CATEGORY_VOWEL,
INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT,
INDIC_SYLLABIC_CATEGORY_VOWEL_INDEPENDENT,
};
/* Matra categories referenced by the generated table.  The table generator
 * emits "#define IMC_<alias> INDIC_MATRA_CATEGORY_<VALUE>" for each value it
 * finds in its second input file, so every such value needs an enumerator
 * here.  Enumerators are listed alphabetically. */
enum indic_matra_category_t {
INDIC_MATRA_CATEGORY_BOTTOM,
INDIC_MATRA_CATEGORY_BOTTOM_AND_RIGHT,
INDIC_MATRA_CATEGORY_INVISIBLE,
INDIC_MATRA_CATEGORY_LEFT,
INDIC_MATRA_CATEGORY_LEFT_AND_RIGHT,
INDIC_MATRA_CATEGORY_NOT_APPLICABLE, /* generator default when a code point has no entry */
INDIC_MATRA_CATEGORY_OVERSTRUCK,
INDIC_MATRA_CATEGORY_RIGHT,
INDIC_MATRA_CATEGORY_TOP,
INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM,
INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM_AND_RIGHT,
INDIC_MATRA_CATEGORY_TOP_AND_LEFT,
INDIC_MATRA_CATEGORY_TOP_AND_LEFT_AND_RIGHT,
INDIC_MATRA_CATEGORY_TOP_AND_RIGHT,
INDIC_MATRA_CATEGORY_VISUAL_ORDER_LEFT,
};
/* Packs a (syllabic, matra) category pair into one table element.  For now it
 * keeps only the syllabic category S and discards the matra category M.
 * Must be defined before the table header is included, since the generated
 * cells expand through it. */
#define INDIC_COMBINE_CATEGORIES(S,M) (S)
/* Generated tables and get_indic_categories(). */
#include "hb-ot-shape-complex-indic-table.hh"
/* Feature tags that collect_features registers with a `false` second argument
 * to add_bool_feature, and for which setup_masks fetches a mask through
 * get_1_mask.
 * NOTE(review): the second argument presumably selects whether the feature
 * applies globally -- confirm against add_bool_feature's declaration. */
static const hb_tag_t indic_basic_features[] =
{
HB_TAG('n','u','k','t'),
HB_TAG('a','k','h','n'),
HB_TAG('r','p','h','f'),
HB_TAG('r','k','r','f'),
HB_TAG('p','r','e','f'),
HB_TAG('b','l','w','f'),
HB_TAG('h','a','l','f'),
HB_TAG('v','a','t','u'),
HB_TAG('p','s','t','f'),
HB_TAG('c','j','c','t'),
};
/* Feature tags that collect_features registers with a `true` second argument
 * to add_bool_feature.  No masks are looked up for these in this file.
 * NOTE(review): the second argument presumably selects whether the feature
 * applies globally -- confirm against add_bool_feature's declaration. */
static const hb_tag_t indic_other_features[] =
{
HB_TAG('p','r','e','s'),
HB_TAG('a','b','v','s'),
HB_TAG('b','l','w','s'),
HB_TAG('p','s','t','s'),
HB_TAG('h','a','l','n'),
HB_TAG('d','i','s','t'),
HB_TAG('a','b','v','m'),
HB_TAG('b','l','w','m'),
};
void
_hb_ot_shape_complex_collect_features_indic (hb_ot_shape_planner_t *planner, const hb_segment_properties_t *props)
{
  /* Register every Indic feature with the planner's map: the basic features
   * are added with `false`, the remaining ones with `true`.  `props` is not
   * consulted. */
  const hb_tag_t *tag = indic_basic_features;
  const hb_tag_t *const basic_end = indic_basic_features + ARRAY_LENGTH (indic_basic_features);
  while (tag != basic_end)
  {
    planner->map.add_bool_feature (*tag, false);
    tag++;
  }

  tag = indic_other_features;
  const hb_tag_t *const other_end = indic_other_features + ARRAY_LENGTH (indic_other_features);
  while (tag != other_end)
  {
    planner->map.add_bool_feature (*tag, true);
    tag++;
  }
}
void
_hb_ot_shape_complex_setup_masks_indic (hb_ot_shape_context_t *c)
{
  /* Store the table-derived categories of every glyph in its buffer var. */
  const unsigned int count = c->buffer->len;
  unsigned int i = 0;
  while (i < count)
  {
    unsigned int categories = get_indic_categories (c->buffer->info[i].codepoint);
    c->buffer->info[i].indic_categories() = categories;
    i++;
  }

  /* Fetch the mask of each basic feature from the plan's map.  The masks are
   * only collected here; nothing in this function consumes them yet. */
  const unsigned int num_masks = ARRAY_LENGTH (indic_basic_features);
  hb_mask_t mask_array[ARRAY_LENGTH (indic_basic_features)] = {0};
  for (unsigned int k = 0; k != num_masks; ++k)
    mask_array[k] = c->plan->map.get_1_mask (indic_basic_features[k]);
}
HB_END_DECLS
......@@ -40,12 +40,58 @@ hb_ot_shape_complex_categorize (const hb_segment_properties_t *props)
switch ((int) props->script)
{
case HB_SCRIPT_ARABIC:
case HB_SCRIPT_NKO:
case HB_SCRIPT_SYRIAC:
case HB_SCRIPT_MANDAIC:
case HB_SCRIPT_MONGOLIAN:
case HB_SCRIPT_NKO:
case HB_SCRIPT_SYRIAC:
return hb_ot_complex_shaper_arabic;
/* TODO: These are all the scripts that ucd/IndicSyllabicCategory.txt covers.
* Quite possibly many of these need no shaping, and some others are encoded visually.
* This list needs to be refined.
*/
case HB_SCRIPT_BALINESE:
case HB_SCRIPT_BATAK:
case HB_SCRIPT_BENGALI:
case HB_SCRIPT_BRAHMI:
case HB_SCRIPT_BUGINESE:
case HB_SCRIPT_BUHID:
case HB_SCRIPT_CHAM:
case HB_SCRIPT_DEVANAGARI:
case HB_SCRIPT_GUJARATI:
case HB_SCRIPT_GURMUKHI:
case HB_SCRIPT_HANUNOO:
case HB_SCRIPT_JAVANESE:
case HB_SCRIPT_KAITHI:
case HB_SCRIPT_KANNADA:
case HB_SCRIPT_KAYAH_LI:
case HB_SCRIPT_KHAROSHTHI:
case HB_SCRIPT_KHMER:
case HB_SCRIPT_LAO:
case HB_SCRIPT_LEPCHA:
case HB_SCRIPT_LIMBU:
case HB_SCRIPT_MALAYALAM:
case HB_SCRIPT_MEETEI_MAYEK:
case HB_SCRIPT_MYANMAR:
case HB_SCRIPT_NEW_TAI_LUE:
case HB_SCRIPT_ORIYA:
case HB_SCRIPT_PHAGS_PA:
case HB_SCRIPT_REJANG:
case HB_SCRIPT_SAURASHTRA:
case HB_SCRIPT_SINHALA:
case HB_SCRIPT_SUNDANESE:
case HB_SCRIPT_SYLOTI_NAGRI:
case HB_SCRIPT_TAGALOG:
case HB_SCRIPT_TAGBANWA:
case HB_SCRIPT_TAI_LE:
case HB_SCRIPT_TAI_THAM:
case HB_SCRIPT_TAI_VIET:
case HB_SCRIPT_TAMIL:
case HB_SCRIPT_TELUGU:
case HB_SCRIPT_THAI:
case HB_SCRIPT_TIBETAN:
return hb_ot_complex_shaper_indic;
default:
return hb_ot_complex_shaper_none;
}
......@@ -62,6 +108,7 @@ hb_ot_shape_complex_categorize (const hb_segment_properties_t *props)
*/
HB_INTERNAL void _hb_ot_shape_complex_collect_features_arabic (hb_ot_shape_planner_t *plan, const hb_segment_properties_t *props);
HB_INTERNAL void _hb_ot_shape_complex_collect_features_indic (hb_ot_shape_planner_t *plan, const hb_segment_properties_t *props);
static inline void
hb_ot_shape_complex_collect_features (hb_ot_shape_planner_t *planner,
......@@ -69,6 +116,7 @@ hb_ot_shape_complex_collect_features (hb_ot_shape_planner_t *planner,
{
switch (planner->shaper) {
case hb_ot_complex_shaper_arabic: _hb_ot_shape_complex_collect_features_arabic (planner, props); return;
case hb_ot_complex_shaper_indic: _hb_ot_shape_complex_collect_features_indic (planner, props); return;
case hb_ot_complex_shaper_none: default: return;
}
}
......@@ -82,12 +130,14 @@ hb_ot_shape_complex_collect_features (hb_ot_shape_planner_t *planner,
*/
HB_INTERNAL void _hb_ot_shape_complex_setup_masks_arabic (hb_ot_shape_context_t *c);
HB_INTERNAL void _hb_ot_shape_complex_setup_masks_indic (hb_ot_shape_context_t *c);
/* Dispatch mask setup to the complex shaper selected in the plan; shapers
 * without a setup step do nothing. */
static inline void
hb_ot_shape_complex_setup_masks (hb_ot_shape_context_t *c)
{
  switch (c->plan->shaper)
  {
    case hb_ot_complex_shaper_arabic:
      _hb_ot_shape_complex_setup_masks_arabic (c);
      break;

    case hb_ot_complex_shaper_indic:
      _hb_ot_shape_complex_setup_masks_indic (c);
      break;

    case hb_ot_complex_shaper_none:
    default:
      break;
  }
}
......
......@@ -43,7 +43,8 @@ HB_BEGIN_DECLS
enum hb_ot_complex_shaper_t {
hb_ot_complex_shaper_none,
hb_ot_complex_shaper_arabic
hb_ot_complex_shaper_arabic,
hb_ot_complex_shaper_indic,
};
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册