#!/usr/bin/env python3

"""Generator of the function to prohibit certain vowel sequences.

B
Behdad Esfahbod 已提交
5
It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
6 7 8 9
circles into sequences prohibited by the USE script development spec.
This function should be used as the ``preprocess_text`` of an
``hb_ot_complex_shaper_t``.

P
pssea 已提交
10 11 12 13 14
usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt

Input file:
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
"""
15 16

import collections
P
pssea 已提交
17 18 19
def write (s):
	"""Write the string ``s`` to standard output as UTF-8 bytes.

	The text layer of ``sys.stdout`` is flushed first, so output already
	produced with ``print`` is handed over before these bytes are written
	directly to the underlying binary buffer.
	"""
	stdout = sys.stdout
	stdout.flush ()
	stdout.buffer.write (s.encode ('utf-8'))
20 21 22
import sys

# Exactly two arguments are required; otherwise exit with the usage text.
if len (sys.argv) != 3:
	sys.exit (__doc__)

# Read Scripts.txt (second argument):
#   scripts_header  its first two lines, kept for the generated banner
#   scripts         code point -> script name
#   script_order    script name -> start of the first range listed for it
with open (sys.argv[2], encoding='utf-8') as f:
	scripts_header = [f.readline () for _ in range (2)]
	scripts = {}
	script_order = {}
	for line in f:
		# Everything from the first `#` onwards is a comment.
		fields = [x.strip () for x in line.partition ('#')[0].split (';')]
		if len (fields) == 1:
			# No `;` on the line, so there is no record to read.
			continue
		# The first field is either `XXXX` or `XXXX..YYYY` (hexadecimal).
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		end = int (uu[1], 16) if len (uu) > 1 else start
		script = fields[1]
		scripts.update (dict.fromkeys (range (start, end + 1), script))
		script_order.setdefault (script, start)

class ConstraintSet (object):
	"""A set of prohibited code point sequences.

	Args:
		constraint (List[int]): A prohibited code point sequence.

	"""
	def __init__ (self, constraint):
		# Either a list or a dictionary. As a list of code points, it
		# represents a prohibited code point sequence. As a dictionary,
		# it represents a set of prohibited sequences, where each item
		# represents the set of prohibited sequences starting with the
		# key (a code point) concatenated with any of the values
		# (ConstraintSets).
		self._c = constraint

	def add (self, constraint):
		"""Add a constraint to this set.

		A sequence that is a prefix of a stored sequence replaces it, and a
		sequence that extends a stored sequence is ignored, because the
		shorter one already prohibits everything the longer one does. The
		outcome does not depend on the order in which sequences are added.

		Args:
			constraint (List[int]): A prohibited code point sequence. An
				empty list is ignored.
		"""
		if not constraint:
			return
		first = constraint[0]
		rest = constraint[1:]
		if isinstance (self._c, list):
			if constraint == self._c[:len (constraint)]:
				# The new sequence is a prefix of the stored one.
				self._c = constraint
			elif self._c != constraint[:len (self._c)]:
				# Neither is a prefix of the other: switch to the dictionary
				# form, then fall through to insert the new sequence.
				self._c = {self._c[0]: ConstraintSet (self._c[1:])}
		if isinstance (self._c, dict):
			if first not in self._c or not rest:
				# Either a new branch, or the sequence ends exactly at an
				# existing branch. In the latter case the branch is replaced
				# by the empty sequence, so that the shorter constraint
				# supersedes the longer ones stored below it (recursing with
				# an empty `rest` would silently do nothing).
				self._c[first] = ConstraintSet (rest)
			else:
				self._c[first].add (rest)

	@staticmethod
	def _indent (depth):
		"""Return the indentation for `depth` levels of two spaces each,
		with every run of eight spaces written as a tab."""
		return ('  ' * depth).replace ('        ', '\t')

	def __str__ (self, index=0, depth=4):
		"""Return the C source text that tests the buffer against this set.

		Args:
			index (int): Offset passed to ``buffer->cur ()`` for the code
				point examined at this level; 0 is written as no argument.
			depth (int): Indentation depth, in units of two spaces.

		Raises:
			AssertionError: If a sequence of length 0 or 1 occurs at an
				index for which no code can be generated.
		"""
		s = []
		indent = self._indent (depth)
		if isinstance (self._c, list):
			if len (self._c) == 0:
				assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
				s.append ('{}matched = true;\n'.format (indent))
			elif len (self._c) == 1:
				assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
				s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, self._c[0], index or ''))
			else:
				# Two or more code points left: emit one `if` comparing each
				# of them against consecutive buffer positions.
				s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
				if index:
					s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), index + 1))
				for i, cp in enumerate (self._c[1:], start=1):
					s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
						self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
				s.append ('{}{{\n'.format (indent))
				# One `next_glyph ()` call per level above this one.
				for i in range (index):
					s.append ('{}(void) buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
				s.append ('{}matched = true;\n'.format (self._indent (depth + 1)))
				s.append ('{}}}\n'.format (indent))
		else:
			s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
			s.append ('{}{{\n'.format (indent))
			# Group the code points whose generated bodies are identical so
			# that they share one run of `case` labels.
			cases = collections.defaultdict (set)
			for first, rest in sorted (self._c.items ()):
				cases[rest.__str__ (index + 1, depth + 2)].add (first)
			# Groups are ordered by their smallest label; labels are written
			# four to a line.
			for body, labels in sorted (cases.items (), key=lambda b_ls: min (b_ls[1])):
				for i, cp in enumerate (sorted (labels)):
					if i % 4 == 0:
						s.append (self._indent (depth + 1))
					else:
						s.append (' ')
					s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
				if len (labels) % 4 != 0:
					s.append ('\n')
				s.append (body)
				s.append ('{}break;\n'.format (self._indent (depth + 2)))
			s.append ('{}}}\n'.format (indent))
		return ''.join (s)

B
Behdad Esfahbod 已提交
127
# Read the prohibited sequences (first argument) into one ConstraintSet per
# script, keyed by the script of the first code point of each sequence.
constraints = {}
with open (sys.argv[1], encoding='utf-8') as f:
	# The header is every line before the first line that consists solely of
	# `#`. Iterating the file (instead of calling readline () in an endless
	# loop) makes this stop at end of file when that separator is missing.
	constraints_header = []
	for line in f:
		line = line.strip ()
		if line == '#':
			break
		constraints_header.append (line)
	for line in f:
		# Everything from the first `#` onwards is a comment.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]
		# Only the first `;`-separated field is read: a whitespace-separated
		# list of hexadecimal code points.
		constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
		if not constraint: continue
		assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
		script = scripts[constraint[0]]
		if script in constraints:
			constraints[script].add (constraint)
		else:
			constraints[script] = ConstraintSet (constraint)
# Checked once after the whole file has been read, so that an input without
# any sequence is actually reported.
assert constraints, 'No constraints found'
148 149 150 151 152

# Opening comment of the generated file: the command that produced it,
# followed by the header lines captured from both input files.
banner = [
	'/* == Start of generated functions == */',
	'/*',
	' * The following functions are generated by running:',
	' *',
	' *   %s ms-use/IndicShapingInvalidCluster.txt Scripts.txt' % sys.argv[0],
	' *',
	' * on files with these headers:',
	' *',
]
banner.extend (' * %s' % text.strip () for text in constraints_header)
banner.append (' *')
banner.extend (' * %s' % text.strip () for text in scripts_header)
banner.append (' */')
print ('\n'.join (banner))
163 164 165 166 167

# Includes, the opening `#ifndef HB_NO_OT_SHAPE`, and the two static helper
# functions of the generated file. Empty strings are blank output lines.
for text in (
	'',
	'#include "hb.hh"',
	'',
	'#ifndef HB_NO_OT_SHAPE',
	'',
	'#include "hb-ot-shape-complex-vowel-constraints.hh"',
	'',
	'static void',
	'_output_dotted_circle (hb_buffer_t *buffer)',
	'{',
	'  (void) buffer->output_glyph (0x25CCu);',
	'  _hb_glyph_info_reset_continuation (&buffer->prev());',
	'}',
	'',
	'static void',
	'_output_with_dotted_circle (hb_buffer_t *buffer)',
	'{',
	'  _output_dotted_circle (buffer);',
	'  (void) buffer->next_glyph ();',
	'}',
	'',
):
	print (text)

B
Behdad Esfahbod 已提交
186
# Signature and opening statements of the generated function, up to the
# opening brace of the `switch` over the buffer's script.
print ('\n'.join ([
	'void',
	'_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,',
	'\t\t\t\t       hb_buffer_t              *buffer,',
	'\t\t\t\t       hb_font_t                *font HB_UNUSED)',
	'{',
	'#ifdef HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS',
	'  return;',
	'#endif',
	'  if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)',
	'    return;',
	'',
	'  /* UGLY UGLY UGLY business of adding dotted-circle in the middle of',
	'   * vowel-sequences that look like another vowel.  Data for each script',
	'   * collected from the USE script development spec.',
	'   *',
	'   * https://github.com/harfbuzz/harfbuzz/issues/1019',
	'   */',
	'  buffer->clear_output ();',
	'  unsigned int count = buffer->len;',
	'  switch ((unsigned) buffer->props.script)',
	'  {',
]))

B
Behdad Esfahbod 已提交
208
# One `case` per script that has constraints, ordered by the value recorded
# for the script in `script_order`.
for script_name in sorted (constraints, key=script_order.__getitem__):
	constraint_set = constraints[script_name]
	print ('\n'.join ([
		'    case HB_SCRIPT_{}:'.format (script_name.upper ()),
		'      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)',
		'      {',
		'\tbool matched = false;',
	]))
	# The matching code comes from the ConstraintSet itself.
	write (str (constraint_set))
	print ('\n'.join ([
		'\t(void) buffer->next_glyph ();',
		'\tif (matched) _output_with_dotted_circle (buffer);',
		'      }',
		'      break;',
		'',
	]))

# Default case, end of the `switch` and of the generated function, then the
# closing `#endif` and end marker. Empty strings are blank output lines.
for text in (
	'    default:',
	'      break;',
	'  }',
	'  buffer->swap_buffers ();',
	'}',
	'',
	'',
	'#endif',
	'/* == End of generated functions == */',
):
	print (text)