utils.py 1.5 KB
Newer Older
P
peterzhang2029 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
import os


class AsciiDic(object):
    '''
    Character dictionary mapping single characters to integer ids.

    Id ``UNK_ID`` (0) is reserved for the ``'<unk>'`` entry; the characters
    ``chr(40)`` .. ``chr(170)`` receive the consecutive ids 1 .. 131.
    '''
    UNK_ID = 0

    def __init__(self):
        self.dic = {
            '<unk>': self.UNK_ID,
        }
        self.chars = [chr(i) for i in range(40, 171)]
        # Ids start at 1 because 0 is taken by '<unk>'.
        for char_id, c in enumerate(self.chars, 1):
            self.dic[c] = char_id

    def lookup(self, w):
        '''
        Return the id of a character, or ``UNK_ID`` if it is not known.

        :param w: The character to look up.
        :type w: str
        '''
        return self.dic.get(w, self.UNK_ID)

    def id2word(self):
        '''
        Return a reversed char dict (id -> character).

        A new dict is built and returned on every call. The result is
        deliberately not stored on ``self.id2word``: doing so would replace
        this method with a dict on the instance and make any later call
        fail with ``TypeError``.
        '''
        return {value: key for key, value in self.dic.items()}

    def word2ids(self, word):
        '''
        Transform a word to a list of ids.

        Characters that are not in the dictionary map to ``UNK_ID``.

        :param word: The word appears in image data.
        :type word: str
        '''
        return [self.lookup(c) for c in word]

    def size(self):
        '''
        Return the number of entries in the dictionary, including '<unk>'.
        '''
        return len(self.dic)


def get_file_list(image_file_list):
    '''
    Generate the file list for training and testing data.

    Each non-blank line of the list file is split at its first comma. The
    part before the comma is the image file name, resolved relative to the
    directory that contains the list file. The part after the comma holds
    the label; its first two characters and its last character are dropped
    (presumably a leading space plus surrounding quotes -- confirm against
    the data files). Blank lines are skipped.

    :param image_file_list: The path of the file which contains
                           path list of image files.
    :type image_file_list: str
    :return: A list of ``(image_path, label)`` tuples in file order.
    :rtype: list
    :raises IndexError: If a non-blank line contains no comma.
    '''
    dirname = os.path.dirname(image_file_list)
    path_list = []
    with open(image_file_list) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line carries no entry; without this guard the
                # split below yields a single field and indexing [1] fails.
                continue
            line_split = line.split(',', 1)
            filename = line_split[0].strip()
            path = os.path.join(dirname, filename)
            # Drop the two leading characters and the trailing character
            # that wrap the label text.
            label = line_split[1][2:-1]
            path_list.append((path, label))

    return path_list