import codecs


class Alphabet(object):
    """Two-way mapping between alphabet entries and integer labels.

    The alphabet is read from a UTF-8 text file holding one entry per line.
    Entries are numbered from 0 in file order. Lines starting with ``#`` are
    comments and are skipped; a line starting with ``\\#`` stands for the
    literal entry ``#``.
    """

    def __init__(self, config_file):
        """Load the alphabet from ``config_file``.

        Args:
            config_file: Path to the UTF-8 encoded alphabet file.
        """
        self._config_file = config_file
        self._label_to_str = []
        self._str_to_label = {}
        self._size = 0
        # NOTE(review): fixed value that does not depend on the loaded
        # alphabet; its meaning is defined by whoever reads it -- confirm
        # against the callers.
        self.blank_token = 1
        with codecs.open(config_file, 'r', 'utf-8') as fin:
            for line in fin:
                if line.startswith('\\#'):
                    # Escaped hash: the entry is a literal '#'.
                    entry = '#'
                elif line.startswith('#'):
                    # Comment line.
                    continue
                else:
                    # Strip only real line-ending characters. Blindly
                    # dropping the last character would eat the entry on a
                    # final line without a newline and would leave a stray
                    # '\r' behind for CRLF files. Other whitespace is kept
                    # because a single space is a legitimate entry.
                    entry = line.rstrip('\r\n')
                # append(), not +=: extending a list with a string adds one
                # item per character, which desynchronises the list from
                # _size for any entry that is not exactly one character.
                self._label_to_str.append(entry)
                self._str_to_label[entry] = self._size
                self._size += 1

    def string_from_label(self, label):
        """Return the alphabet entry stored at integer index ``label``."""
        return self._label_to_str[label]

    def label_from_string(self, string):
        """Return the integer label assigned to the entry ``string``.

        Raises:
            KeyError: If ``string`` is not an entry of the alphabet.
        """
        try:
            return self._str_to_label[string]
        except KeyError as e:
            raise KeyError(
                'ERROR: Your transcripts contain characters which do not '
                'occur in data/alphabet.txt! Use util/check_characters.py '
                'to see what characters are in your {train,dev,test}.csv '
                'transcripts, and then add all these to data/alphabet.txt.'
            ) from e

    def decode(self, labels):
        """Concatenate the entries for an iterable of integer labels."""
        return ''.join(self.string_from_label(label) for label in labels)

    def size(self):
        """Return the number of entries loaded from the alphabet file."""
        return self._size

    def config_file(self):
        """Return the path the alphabet was loaded from."""
        return self._config_file