gen-indic-table.py 6.6 KB
Newer Older
P
pssea 已提交
1
#!/usr/bin/env python3
2

P
pssea 已提交
3
"""usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
"""

import sys
B
Behdad Esfahbod 已提交
12

13
# Exactly three input files are required (argv[0] is the script itself);
# otherwise show the usage text from the module docstring and stop.
if len (sys.argv) != 4:
	sys.exit (__doc__)
B
Behdad Esfahbod 已提交
15

16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
# Code points that are kept individually, whatever block they belong to:
# U+00A0 NO-BREAK SPACE and U+25CC DOTTED CIRCLE.
ALLOWED_SINGLES = [
	0x00A0,
	0x25CC,
]

# Block names (matched against the values parsed from Blocks.txt) whose
# code points are kept in the generated table.  Order is not significant;
# the list is only used for membership tests.
ALLOWED_BLOCKS = [
	"Basic Latin",
	"Latin-1 Supplement",
	"Devanagari",
	"Bengali",
	"Gurmukhi",
	"Gujarati",
	"Oriya",
	"Tamil",
	"Telugu",
	"Kannada",
	"Malayalam",
	"Sinhala",
	"Myanmar",
	"Khmer",
	"Vedic Extensions",
	"General Punctuation",
	"Superscripts and Subscripts",
	"Devanagari Extended",
	"Myanmar Extended-B",
	"Myanmar Extended-A",
]
39

P
pssea 已提交
40
# Parse the three input files named on the command line, in argument order.
# For the i-th file:
#   headers[i]  its first two lines, echoed into the generated banner;
#   data[i]     code point -> value of the second ';'-separated field;
#   values[i]   value -> number of code points carrying that value.
headers = []
data = []
values = []
for path in sys.argv[1:]:
	props = {}
	counts = {}
	# The context manager closes each file as soon as it has been parsed,
	# including when a malformed line raises.
	with open (path, encoding='utf-8') as f:
		headers.append ([f.readline () for _ in range (2)])
		for line in f:
			# Drop everything from the first '#' on (trailing comment).
			line = line.partition ('#')[0]

			fields = [x.strip () for x in line.split (';')]
			if len (fields) == 1:
				# No ';' separator: blank or comment-only line.
				continue

			# First field is a single code point or a "start..end" range, in hex.
			uu = fields[0].split ('..')
			start = int (uu[0], 16)
			end = start if len (uu) == 1 else int (uu[1], 16)

			t = fields[1]

			for u in range (start, end + 1):
				props[u] = t
			counts[t] = counts.get (t, 0) + end - start + 1
	data.append (props)
	values.append (counts)
B
Behdad Esfahbod 已提交
69 70 71 72 73 74 75 76 77 78 79 80 81

# Merge data into one dict:
#   code point -> [value from file 0, value from file 1, value from file 2]
# Positions that a file does not define keep the matching entry of `defaults`.
defaults = ('Other', 'Not_Applicable', 'No_Block')
# Count each default once more; this also makes sure it is a known value
# even when the input never lists it explicitly.
for i, v in enumerate (defaults):
	values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i, d in enumerate (data):
	for u, v in d.items ():
		if u not in combined:
			# The third file (index 2) only annotates code points that
			# already have an entry; it never creates new ones.
			if i == 2:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v
# Keep the explicitly allowed single code points plus everything whose
# third value names an allowed block.
data = {k: v for k, v in combined.items () if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}
del combined

# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out of the main table.
# Raises KeyError if one of them is missing from the merged data.
singles = {}
for u in ALLOWED_SINGLES:
	singles[u] = data.pop (u)

92 93 94 95 96 97 98 99
# Banner of the generated file: how it was produced, the first two lines of
# every input file, then the C preamble.
print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
	for l in h:
		print (" * %s" % (l.strip()))
print (" */")
print ()
print ('#include "hb.hh"')
print ()
print ('#ifndef HB_NO_OT_SHAPE')
print ()
print ('#include "hb-ot-shape-complex-indic.hh"')
print ()
B
Behdad Esfahbod 已提交
111 112 113 114

# Shorten values
#
# Hand-picked short aliases, one dict per value kind: index 0 matches the
# first input file, index 1 the second.  Some of the values are listed here
# to make them more readable, and to avoid duplicates; values that are not
# listed get an alias derived from their name later in this script.
short = [{
	"Bindu": 'Bi',
	"Cantillation_Mark": 'Ca',
	"Joiner": 'ZWJ',
	"Non_Joiner": 'ZWNJ',
	"Number": 'Nd',
	"Visarga": 'Vs',
	"Vowel": 'Vo',
	"Vowel_Dependent": 'M',
	"Consonant_Prefixed": 'CPrf',
	"Other": 'x',
},{
	"Not_Applicable": 'x',
}]

# Reverse maps (alias -> value), seeded from the hand-picked aliases.
all_shorts = [{alias: value for value, alias in table.items ()} for table in short]
B
Behdad Esfahbod 已提交
135 136 137

# Emit one "#define <short macro> <long macro>" per known value of the first
# two input files, wrapped in a pragma that silences unused-macro warnings.
what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
# One tuple per value: (short macro, long macro, code point count as text, value).
cat_defs = []
for i in range (2):
	for v in sorted (values[i]):
		if v in short[i]:
			s = short[i][v]
		else:
			# No hand-picked alias: use the capital letters of the name,
			# after collapsing "_And_" so that "And" contributes nothing.
			v_no_and = v.replace ('_And_', '_')
			s = ''.join (c for c in v_no_and if 'A' <= c <= 'Z')
			if s in all_shorts[i]:
				raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
			all_shorts[i][s] = v
			short[i][v] = s
		cat_defs.append ((what_short[i] + '_' + s, what[i] + '_' + v.upper (), str (values[i][v]), v))

# Column widths so that the #define lines come out aligned.
maxlen_s = max (len (c[0]) for c in cat_defs)
maxlen_l = max (len (c[1]) for c in cat_defs)
maxlen_n = max (len (c[2]) for c in cat_defs)
for prefix in what_short:
	print ()
	for c in cat_defs:
		# Match the whole "PREFIX_" so that an alias which merely contains
		# the prefix text cannot end up in the wrong group.
		if not c[0].startswith (prefix + '_'):
			continue
		print ("#define %s %s /* %s chars; %s */" %
			(c[0].ljust (maxlen_s), c[1].ljust (maxlen_l), c[2].rjust (maxlen_n), c[3]))
print ()
print ('#pragma GCC diagnostic pop')
print ()
print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
print ()
print ()
B
Behdad Esfahbod 已提交
169

170 171
# Running statistics updated by print_block ().
total = 0  # table entries printed so far
used = 0  # printed entries that came from `data` rather than from `defaults`
last_block = None  # last non-empty block name passed to print_block ()
def print_block (block, start, end, data):
	"""Print the table entries for code points start..end (inclusive).

	Entries are printed eight per row, each row prefixed with a comment
	giving the code point of its first entry.  Code points missing from
	`data` are printed with the module-level `defaults`.

	block: name printed as a heading comment when it is non-empty and
		differs from the previously printed one; a falsy value (None)
		prints no heading and leaves `last_block` alone.
	start: first code point; must be a multiple of 8.
	end: last code point; end + 1 must be a multiple of 8.
	data: mapping code point -> sequence whose items 0 and 1 are keys of
		short[0] and short[1] respectively.

	Updates the globals `total`, `used` and `last_block`.
	"""
	global total, used, last_block
	if block and block != last_block:
		print ()
		print ()
		print ("  /* %s */" % block)
	num = 0
	# Rows are exactly eight entries wide, so the range must be aligned.
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for u in range (start, end+1):
		if u % 8 == 0:
			print ()
			print ("  /* %04X */" % u, end="")
		if u in data:
			num += 1
		d = data.get (u, defaults)
		print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="")

	total += end - start + 1
	used += num
	if block:
		last_block = block
B
Behdad Esfahbod 已提交
195

196
# Emit the body of indic_table[] as a sequence of contiguous ranges.  Each
# range gets an "indic_offset_0x....u" macro holding its position in the table.
uu = sorted (data)

last = -100000  # last code point printed so far; sentinel far below zero
offset = 0  # number of table entries in the ranges closed so far
starts = []  # first code point of every range
ends = []  # one past the last code point of every range
print ("static const uint16_t indic_table[] = {")
for u in uu:
	if u <= last:
		# Already covered by a previously printed block.
		continue
	block = data[u][2]

	# Extend the run while consecutive code points belong to the same block,
	# then round outwards to multiples of 8 as print_block () requires.
	# `data` has exactly the keys listed in `uu`, so testing the dict is
	# equivalent to scanning the list, without the linear search.
	start = u//8*8
	end = start+1
	while end in data and block == data[end][2]:
		end += 1
	end = (end-1)//8*8 + 7

	if start != last + 1:
		if start - last <= 1+16*3:
			# Gap of at most 48 code points: pad it with default entries
			# and stay in the current range.
			print_block (None, last+1, start-1, data)
		else:
			# Larger gap: close the previous range, if any, and open a new one.
			if last >= 0:
				ends.append (last + 1)
				offset += ends[-1] - starts[-1]
			print ()
			print ()
			print ("#define indic_offset_0x%04xu %d" % (start, offset))
			starts.append (start)

	print_block (block, start, end, data)
	last = end
# Close the final range.
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print ()
print ()
occupancy = used * 100. / total
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print ()
L
lancer 已提交
237
# Emit hb_indic_get_categories (): a switch on the page of the code point
# (u >> page_bits) with, per page, the single code points and table ranges
# that touch it.
print ("uint16_t")
print ("hb_indic_get_categories (hb_codepoint_t u)")
print ("{")
print ("  switch (u >> %d)" % page_bits)
print ("  {")
pages = {u >> page_bits for u in starts + ends + list (singles)}
for p in sorted (pages):
	print ("    case 0x%0Xu:" % p)
	for u, d in singles.items ():
		if p != u >> page_bits:
			continue
		print ("      if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
	for start, end in zip (starts, ends):
		# NOTE(review): only the pages of `start` and of the exclusive `end`
		# are considered; a range spanning three or more pages would be
		# missed on the pages in between.
		if p not in (start >> page_bits, end >> page_bits):
			continue
		offset_macro = "indic_offset_0x%04xu" % start
		print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset_macro))
	print ("      break;")
	print ("")
print ("    default:")
print ("      break;")
print ("  }")
print ("  return _(x,x);")
print ("}")
print ()
print ("#undef _")
B
Behdad Esfahbod 已提交
261
# Undefine every short macro again, one group per value kind, then close the
# generated file.
for i in range (2):
	print ()
	for v in sorted (values[i]):
		print ("#undef %s_%s" %
			(what_short[i], short[i][v]))
print ()
print ('#endif')
print ()
print ("/* == End of generated table == */")

# Maintain at least 30% occupancy in the table
if occupancy < 30:
	raise Exception ("Table too sparse, please investigate: ", occupancy)