gen-indic-table.py 5.3 KB
Newer Older
B
Behdad Esfahbod 已提交
1 2 3 4
#!/usr/bin/python

import sys

5
# Exactly three input files are expected, in the order shown in the usage text.
if len (sys.argv) != 4:
	sys.stderr.write ("usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt\n")
	sys.exit (1)

# Code points whose block name is listed here are dropped from the merged table.
BLACKLISTED_BLOCKS = ["Thai", "Lao", "Tibetan"]

# open() instead of the file() constructor; the handles are closed once parsing is done.
files = [open (x) for x in sys.argv[1:]]

# The first two lines of every input are kept so they can be quoted in the output banner.
headers = [[f.readline () for i in range (2)] for f in files]

# data[i]:   code point     -> property value read from input file i
# values[i]: property value -> number of code points carrying that value in file i
data = [{} for f in files]
values = [{} for f in files]
for i, f in enumerate (files):
	for line in f:

		# Drop a trailing '#' comment, if any.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		# Lines without a ';' separator (blank or comment-only) carry no record.
		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			continue

		# First field is either a single code point "XXXX" or a range "XXXX..YYYY" (hex).
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		t = fields[1]

		for u in range (start, end + 1):
			data[i][u] = t
		# Count code points rather than input lines: the total is later printed
		# as "N chars", and one range line covers end - start + 1 characters.
		values[i][t] = values[i].get (t, 0) + end - start + 1

# Everything needed has been read; release the file handles.
for f in files:
	f.close ()

# Merge the per-file maps into one dict: code point -> [value from file 0, file 1, file 2]
defaults = ('Other', 'Not_Applicable', 'No_Block')
# Make sure every default value has an entry in its value table.
for i,v in enumerate (defaults):
	values[i].setdefault (v, 0)
	values[i][v] += 1
combined = {}
for i,d in enumerate (data):
	for u,v in d.items ():
		if u not in combined:
			# The third input (index 2) only annotates code points that the
			# first two inputs already introduced; it never adds new ones.
			if i == 2:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v
# Keep only the code points whose block is not blacklisted.
combined = dict ((k, v) for k,v in combined.items () if v[2] not in BLACKLISTED_BLOCKS)
data = combined
del combined
num = len (data)

# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out of the main table;
# they are kept aside in `singles`.
singles = {}
for u in (0x00A0, 0x25CC):
	singles[u] = data.pop (u)

# Opening banner of the generated file: how it was produced, the two header
# lines kept from each input file, then the #include the table depends on.
banner = [
	"/* == Start of generated table == */",
	"/*",
	" * The following table is generated by running:",
	" *",
	" *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicMatraCategory.txt Blocks.txt",
	" *",
	" * on files with these headers:",
	" *",
]
for h in headers:
	banner.extend (" * %s" % l.strip () for l in h)
banner += [
	" */",
	"",
	'#include "hb-ot-shape-complex-indic-private.hh"',
	"",
]
# One write for the whole banner; each list item becomes one output line.
print ("\n".join (banner))

# Shorten values
#
# short[i] maps a full property value to its abbreviation, one dict per
# property (index 0 and index 1).  The entries below are fixed by hand;
# abbreviations for the remaining values are added later in the script.
short = [{
	"Bindu":		'Bi',
	"Visarga":		'Vs',
	"Vowel":		'Vo',
	"Vowel_Dependent":	'M',
	"Other":		'x',
},{
	"Not_Applicable":	'x',
}]

# all_shorts[i] is the reverse map (abbreviation -> full value), seeded with
# the hand-picked names above so that later additions can be checked for clashes.
all_shorts = [dict ((s, v) for v,s in table.items ()) for table in short]

# Emit one "#define <PREFIX>_<abbrev> <CATEGORY>_<VALUE>" per property value,
# followed by the _(S,M) helper macro used by the table rows.
what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
for i in range (2):
	print ("")
	vv = sorted (values[i])
	for v in vv:
		if v not in short[i]:
			# No hand-picked abbreviation: use the capital letters of the
			# value, with "_And_" collapsed to "_" first.
			v_no_and = v.replace ('_And_', '_')
			s = ''.join (c for c in v_no_and if 'A' <= c <= 'Z')
			if s in all_shorts[i]:
				raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
			all_shorts[i][s] = v
			short[i][v] = s
		s = short[i][v]
		# Tabs that pad the trailing comment, computed from the name lengths.
		padding = '	' * ((48-1 - len (what[i]) - 1 - len (v)) // 8)
		print ("#define %s_%s	%s_%s	%s/* %3d chars; %s */" %
			(what_short[i], s, what[i], v.upper (), padding, values[i][v], v))
print ("")
print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
print ("")
print ("")
# Running statistics shared by every print_block () call.
total = 0		# table slots emitted so far
used = 0		# emitted slots that had an entry in the data
last_block = None	# name of the most recent named block that was printed

def print_block (block, start, end, data):
	"""Write the table rows for code points start..end (inclusive) to stdout.

	block -- block name printed as a comment when it is truthy and differs
	         from the previously printed named block; may be None.
	start -- first code point; must be a multiple of 8 (asserted).
	end   -- last code point; end + 1 must be a multiple of 8 (asserted).
	data  -- dict mapping code point to its list of property values; code
	         points missing from it are written using the global `defaults`.

	Updates the global counters `total` and `used`, and `last_block` when
	a block name was given.
	"""
	global total, used, last_block
	if block and block != last_block:
		# Two blank lines, then the block name as a comment.
		print ("\n\n  /* %s */" % block)
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	hits = 0
	for cp in range (start, end+1):
		if cp % 8 == 0:
			# New row of eight entries, labelled with its first code point.
			sys.stdout.write ("\n  /* %04X */" % cp)
		if cp in data:
			hits += 1
			entry = data[cp]
		else:
			entry = defaults
		cell = "_(%s,%s)," % (short[0][entry[0]], short[1][entry[1]])
		sys.stdout.write ("%9s" % cell)

	total += end - start + 1
	used += hits
	if block:
		last_block = block

# Walk the code points in ascending order and emit them as 8-aligned runs.
# Runs separated by a small gap are joined by padding; a larger gap starts a
# new range with its own indic_offset_0x.... macro.
uu = sorted (data)

last = -100000
num = 0
offset = 0
starts = []	# first code point of each emitted range
ends = []	# one past the last code point of each emitted range
print ("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {")
for u in uu:
	# Already covered by the previous run.
	if u <= last:
		continue
	block = data[u][2]

	# Extend the run while consecutive code points stay in the same block,
	# then round it out to a whole row of eight.
	start = u//8*8
	end = start+1
	# Membership is tested against the dict (constant time) rather than the
	# sorted key list, which would be scanned linearly on every step.
	while end in data and block == data[end][2]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# Small gap: fill it with default entries and keep the range going.
			print_block (None, last+1, start-1, data)
			last = start-1
		else:
			# Large gap: close the previous range (if any) and open a new one.
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			print ("")
			print ("")
			print ("#define indic_offset_0x%04x %d" % (start, offset))
			starts.append (start)

	print_block (block, start, end, data)
	last = end
# Close the final range.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
# Close the table, then emit the hb_indic_get_categories () lookup function,
# the #undefs for the helper macros, and finally check the table density.
print ("")
print ("")
occupancy = used * 100. / total
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print ("")
print ("INDIC_TABLE_ELEMENT_TYPE")
print ("hb_indic_get_categories (hb_codepoint_t u)")
print ("{")
print ("  switch (u >> %d)" % page_bits)
print ("  {")
# ends[] stores one-past-the-end values; the emitted C test is inclusive
# ("u <= last"), so convert to the last code point actually in the table.
# Using the exclusive value would let u == end index one element too far.
ranges = [(lo, hi - 1) for lo, hi in zip (starts, ends)]
# Every page touched by a range (including pages strictly inside it) or by a
# single needs its own case label.
pages = set (u>>page_bits for u in singles)
for (start, end) in ranges:
	pages.update (range (start>>page_bits, (end>>page_bits) + 1))
for p in sorted (pages):
	print ("    case 0x%0X:" % p)
	for (start, end) in ranges:
		if not (start>>page_bits <= p <= end>>page_bits): continue
		offset_name = "indic_offset_0x%04x" % start
		print ("      if (0x%04X <= u && u <= 0x%04X) return indic_table[u - 0x%04X + %s];" % (start, end, start, offset_name))
	for u,d in singles.items ():
		if p != u>>page_bits: continue
		print ("      if (unlikely (u == 0x%04X)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
	print ("      break;")
	print ("")
print ("    default:")
print ("      break;")
print ("  }")
print ("  return _(x,x);")
print ("}")
print ("")
print ("#undef _")
for i in range (2):
	print ("")
	vv = sorted (values[i])
	for v in vv:
		print ("#undef %s_%s" % (what_short[i], short[i][v]))
print ("")
print ("/* == End of generated table == */")

# Maintain at least 30% occupancy in the table
if occupancy < 30:
	raise Exception ("Table too sparse, please investigate: ", occupancy)