gen-indic-table.py 5.3 KB
Newer Older
B
Behdad Esfahbod 已提交
1 2 3 4
#!/usr/bin/python

import sys

5
# The script takes exactly three positional arguments (the three input
# files); anything else prints the usage line on stderr and exits with 1.
if len (sys.argv[1:]) != 3:
	sys.stderr.write ("usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt\n")
	sys.exit (1)

9 10
# Block names whose code points are dropped from the merged data further
# down (compared against the value read from the third input file).
BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]

# open () instead of the file () constructor: file () exists only in
# Python 2 and open () is the documented way to open a file.
files = [open (x) for x in sys.argv[1:]]

# The first two lines of every input are kept verbatim; they are echoed
# in the banner comment of the generated output.
headers = [[f.readline () for i in range (2)] for f in files]

# data[i]:   code point -> value of the second ';'-separated field, for input i.
# values[i]: distinct value -> number of code points carrying that value.
data = [{} for f in files]
values = [{} for f in files]
for i, f in enumerate (files):
	for line in f:

		# Strip a trailing '#' comment, if any.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			# No ';' on the line (blank or comment-only): nothing to record.
			continue

		# Field 0 is a single hex code point or a "START..END" hex range.
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		t = fields[1]

		for u in range (start, end + 1):
			data[i][u] = t
		# Tally every code point of the range, not one per input line: the
		# number is printed later as "N chars" next to each #define.
		values[i][t] = values[i].get (t, 0) + end - start + 1

	# This input has been read to the end; release the handle.
	f.close ()

# Fold the per-file maps into one dict: code point -> [value from file 0,
# value from file 1, value from file 2], with these fallbacks for any
# column a code point has no entry in.
defaults = ('Other', 'Not_Applicable', 'No_Block')
# Make sure each fallback value is a known value of its column.
for column, fallback in enumerate (defaults):
	values[column][fallback] = 1 + values[column].get (fallback, 0)
combined = {}
for column, table in enumerate (data):
	for cp, value in table.items ():
		if cp in combined:
			combined[cp][column] = value
		elif column != 2:
			# Only the first two inputs may introduce a code point; the
			# third one merely annotates code points already present.
			row = list (defaults)
			row[column] = value
			combined[cp] = row
# Drop every code point that lives in a blacklisted block.
combined = dict ((cp, row) for cp, row in combined.items () if row[2] not in BLACKLISTED_BLOCKS)
data = combined
del combined
num = len (data)

# NO-BREAK SPACE and DOTTED CIRCLE are outliers: take them out of the
# table data and keep them aside in `singles`.
singles = {}
for cp in (0x00A0, 0x25CC):
	singles[cp] = data.pop (cp)

# Opening banner of the generated file: how it was produced, the saved
# header lines of every input file, then the #include the table needs.
banner = [
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt",
	" *",
	" * on files with these headers:",
	" *",
]
banner.extend (" * %s" % l.strip () for h in headers for l in h)
banner.extend ([
	" */",
	"",
	'#include "hb-ot-shape-complex-indic-private.hh"',
	"",
])
for text in banner:
	print (text)
B
Behdad Esfahbod 已提交
79 80 81 82 83 84

# Short aliases for category values, one dict per category kind
# (index 0: syllabic category, index 1: matra category).  A few are
# chosen by hand to make them more readable and to avoid duplicates;
# the rest are derived later from the capital letters of the value name.
short = [
	{
		"Bindu":		'Bi',
		"Visarga":		'Vs',
		"Vowel":		'Vo',
		"Vowel_Dependent":	'M',
		"Other":		'x',
	},
	{
		"Not_Applicable":	'x',
	},
]

# Aliases already taken, per category kind; used to detect clashes when
# further aliases are derived.
all_shorts = [list (aliases.values ()) for aliases in short]

# Emit one "#define <PREFIX>_<alias> <LONG_PREFIX>_<VALUE>" per category
# value, for both category kinds, then the _(S,M) combining macro.
what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
for kind in range (2):
	print ("")
	for value in sorted (values[kind]):
		alias = short[kind].get (value)
		if alias is None:
			# No hand-picked alias: use the capital letters of the name,
			# with "_And_" collapsed first so its 'A' does not take part.
			alias = ''.join (c for c in value.replace ('_And_', '_') if 'A' <= c <= 'Z')
			if alias in all_shorts[kind]:
				raise Exception ("Duplicate short value alias", value, alias)
			all_shorts[kind].append (alias)
			short[kind][value] = alias
		# Tab padding so that the trailing comments line up.
		padding = '	' * ((48-1 - len (what[kind]) - 1 - len (value)) // 8)
		print ("#define %s_%s	%s_%s	%s/* %3d chars; %s */" % (what_short[kind], alias, what[kind], value.upper (), padding, values[kind][value], value))
print ("")
print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
print ("")
print ("")

124 125
# Running statistics maintained by print_block ():
total = 0  # table slots emitted so far
used = 0  # slots that correspond to a code point present in the data
last_block = None  # name of the most recent named block that was emitted
def print_block (block, start, end, data):
	"""Write the table entries for code points start..end (inclusive).

	block -- name printed as a section comment when it differs from the
	         previously emitted named block; a false value (e.g. None)
	         prints no section comment and leaves last_block untouched.
	start -- first code point; must be a multiple of 8.
	end   -- last code point; end+1 must be a multiple of 8.
	data  -- dict mapping code point -> sequence whose items 0 and 1 are
	         the two category values; code points missing from it are
	         written with the module-level `defaults`.

	Each row of eight entries is preceded by a comment giving the row's
	first code point.  Updates the module-level counters `total` and
	`used`.
	"""
	global total, used, last_block
	emit = sys.stdout.write
	if block and block != last_block:
		emit ("\n\n  /* %s */\n" % block)
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for cp in range (start, end+1):
		if cp % 8 == 0:
			# New row: line break plus a comment naming its first code point.
			emit ("\n  /* %04X */" % cp)
		entry = data.get (cp, defaults)
		cell = "_(%s,%s)," % (short[0][entry[0]], short[1][entry[1]])
		emit ("%9s" % cell)

	total += end - start + 1
	used += sum (1 for cp in range (start, end+1) if cp in data)
	if block:
		last_block = block
B
Behdad Esfahbod 已提交
149 150 151 152 153 154

# Emit the body of indic_table[]: walk the code points in ascending order
# and print them in 8-aligned chunks.  Holes of at most 1+16*3 code points
# are padded so that a run stays contiguous; a larger hole closes the
# current run and opens a new one with its own indic_offset_0x.... #define.
uu = sorted (data)

last = -1  # last code point already printed (inclusive); -1 before any output
num = 0
offset = 0  # number of table items in all runs closed so far
starts = []  # first code point of each run
ends = []  # one past the last code point of each run
print ("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {")
for u in uu:
	if u <= last:
		continue
	block = data[u][2]

	start = u//8*8
	end = start+1
	# `data` has exactly the keys listed in uu, so test membership on the
	# dict rather than scanning the list on every step.
	while end in data and block == data[end][2]:
		end += 1
	end = (end-1)//8*8 + 7

	if last < 0 or start - last > 1+16*3:
		# Open a new run.  The very first chunk always opens one, even if
		# it starts near zero; otherwise starts[] would still be empty
		# when the run is closed.
		if last >= 0:
			ends.append (last + 1)
			offset += ends[-1] - starts[-1]
		sys.stdout.write ("\n\n")
		print ("#define indic_offset_0x%04x %d" % (start, offset))
		starts.append (start)
	elif start != last + 1:
		# Small hole inside the current run: pad it with default entries.
		print_block (None, last+1, start-1, data)
		last = start-1

	print_block (block, start, end, data)
	last = end
# Close the final run.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
B
Behdad Esfahbod 已提交
187 188
# Close the table initializer, reporting its size and the percentage of
# slots that hold real data, then open the lookup function.
sys.stdout.write ("\n\n")
occupancy = used * 100. / total
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print ("")
for text in ("INDIC_TABLE_ELEMENT_TYPE", "hb_indic_get_categories (hb_codepoint_t u)", "{"):
	print (text)
B
Behdad Esfahbod 已提交
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
# Emit the switch of the lookup function: one case per 4096-code-point
# page (u >> 12), each holding the range checks of the runs that touch
# the page plus the checks for the stand-alone code points on it.
print ("  switch (u >> 12)")
print ("  {")
# ends[] holds one past the last code point of each run, so the last page
# a run touches is that of end-1.  Every page in between is included too.
pages = set (u >> 12 for u in singles)
for start, end in zip (starts, ends):
	pages.update (range (start >> 12, ((end - 1) >> 12) + 1))
# Sorted so that the generated code does not depend on set/dict ordering.
for p in sorted (pages):
	print ("    case 0x%0X:" % p)
	for (start,end) in zip (starts, ends):
		if not (start >> 12 <= p <= (end - 1) >> 12): continue
		offset_name = "indic_offset_0x%04x" % start
		# Inclusive upper bound is end-1: accepting u == end would index
		# one element past this run's slice of indic_table.
		print ("      if (0x%04X <= u && u <= 0x%04X) return indic_table[u - 0x%04X + %s];" % (start, end - 1, start, offset_name))
	for u,d in sorted (singles.items ()):
		if p != u>>12: continue
		print ("      if (unlikely (u == 0x%04X)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
	print ("      break;")
	print ("")
print ("    default:")
print ("      break;")
print ("  }")
B
Behdad Esfahbod 已提交
212
# Finish the lookup function, then undo every macro defined above so the
# short aliases do not leak out of the generated file.
for text in ("  return _(x,x);", "}", "", "#undef _"):
	print (text)
for prefix, tally, aliases in zip (what_short, values, short):
	print ("")
	for value in sorted (tally):
		print ("#undef %s_%s" % (prefix, aliases[value]))
print ("")
print ("/* == End of generated table == */")

# Maintain at least 30% occupancy in the table.
if occupancy < 30:
	raise Exception ("Table too sparse, please investigate: ", occupancy)