reader.py 4.0 KB
Newer Older
C
caoying03 已提交
1 2
from utils import UNK, ModelType, TaskType, load_dic, \
        sent2ids, logger, ModelType
S
Superjom 已提交
3 4 5


class Dataset(object):
    '''
    Line-oriented reader for tab-separated datasets.

    `train`, `test` and `infer` are generators: each opens a file and yields
    one parsed record per line, using the record parser that `__init__`
    selects from the model type.
    '''

    def __init__(self, train_path, test_path, source_dic_path, target_dic_path,
                 model_type):
        '''
        @train_path: str
            path of the file read by `train` (and by `infer`).
        @test_path: str
            path of the file read by `test`.
        @source_dic_path: str
            path handed to `load_dic` to build `self.source_dic`.
        @target_dic_path: str
            path handed to `load_dic` to build `self.target_dic`.
        @model_type: ModelType
            must be a ModelType instance (asserted below); its `mode`
            attribute selects the record parser.
        '''
        self.train_path = train_path
        self.test_path = test_path
        self.source_dic_path = source_dic_path
        self.target_dic_path = target_dic_path
        # NOTE(review): `model_type` is asserted to already be a ModelType
        # further down, so this wraps an instance in another instance --
        # confirm that is what ModelType's constructor expects.
        self.model_type = ModelType(model_type)

        self.source_dic = load_dic(self.source_dic_path)
        self.target_dic = load_dic(self.target_dic_path)

        # Dispatch table: model mode -> method that parses one text line.
        readers_by_mode = {
            ModelType.CLASSIFICATION_MODE: self._read_classification_record,
            ModelType.REGRESSION_MODE: self._read_regression_record,
            ModelType.RANK_MODE: self._read_rank_record,
        }

        assert isinstance(model_type, ModelType)
        self.record_reader = readers_by_mode[model_type.mode]
        # While False the record parsers also parse and return the label.
        self.is_infer = False

    def train(self):
        '''
        Load trainset.

        Generator yielding one parsed record per line of `self.train_path`.
        '''
        logger.info("[reader] load trainset from %s" % self.train_path)
        with open(self.train_path) as f:
            for line in f:
                yield self.record_reader(line)

    def test(self):
        '''
        Load testset.

        Generator yielding one parsed record per line of `self.test_path`.
        '''
        with open(self.test_path) as f:
            for line in f:
                yield self.record_reader(line)

    def infer(self):
        '''
        Load records for inference.

        Generator yielding one parsed record per line. Once iteration starts
        it sets `self.is_infer = True`, so the record parsers return records
        without the label; the flag is not reset afterwards.

        NOTE(review): this reads `self.train_path`, not a dedicated inference
        path -- presumably callers pass the inference file as `train_path`;
        confirm against the caller.
        '''
        self.is_infer = True
        with open(self.train_path) as f:
            for line in f:
                yield self.record_reader(line)

    def _read_classification_record(self, line):
        '''
        data format:
            <source words> [TAB] <target words> [TAB] <label>

        @line: str
            a string line which represent a record.

        Returns (source, target, label) with an int label, or just
        (source, target) when `self.is_infer` is set. The line must have
        exactly three tab-separated fields in both cases.
        '''
        fields = line.strip().split('\t')
        assert len(fields) == 3, "wrong format for classification\n" + \
            "the format shoud be " +\
            "<source words> [TAB] <target words> [TAB] <label>'"
        source = sent2ids(fields[0], self.source_dic)
        target = sent2ids(fields[1], self.target_dic)
        if self.is_infer:
            return source, target
        return source, target, int(fields[2])

    def _read_regression_record(self, line):
        '''
        data format:
            <source words> [TAB] <target words> [TAB] <label>

        @line: str
            a string line which represent a record.

        Returns (source, target, [label]) with the float label wrapped in a
        one-element list, or just (source, target) when `self.is_infer` is
        set. The line must have exactly three tab-separated fields in both
        cases.
        '''
        fields = line.strip().split('\t')
        assert len(fields) == 3, "wrong format for regression\n" + \
            "the format shoud be " +\
            "<source words> [TAB] <target words> [TAB] <label>'"
        source = sent2ids(fields[0], self.source_dic)
        target = sent2ids(fields[1], self.target_dic)
        if self.is_infer:
            return source, target
        return source, target, [float(fields[2])]

    def _read_rank_record(self, line):
        '''
        data format:
            <source words> [TAB] <left_target words> [TAB] <right_target words> [TAB] <label>

        @line: str
            a string line which represent a record.

        Returns (source, left_target, right_target, label) with an int label,
        or (source, left_target, right_target) when `self.is_infer` is set.
        The line must have exactly four tab-separated fields in both cases.
        '''
        fields = line.strip().split('\t')
        assert len(fields) == 4, "wrong format for rank\n" + \
            "the format should be " +\
            "<source words> [TAB] <left_target words> [TAB] <right_target words> [TAB] <label>"

        source = sent2ids(fields[0], self.source_dic)
        # Both candidate targets are converted with the target dictionary.
        left_target = sent2ids(fields[1], self.target_dic)
        right_target = sent2ids(fields[2], self.target_dic)
        if self.is_infer:
            return source, left_target, right_target
        return source, left_target, right_target, int(fields[3])
S
Superjom 已提交
111 112 113 114 115 116 117


if __name__ == '__main__':
    # Manual smoke test: build a Dataset over the sample classification data
    # and print every training record.
    path = './data/classification/train.txt'
    test_path = './data/classification/test.txt'
    source_dic = './data/vocab.txt'
    # The same vocabulary file is used for both source and target.
    # NOTE(review): Dataset.__init__ asserts that `model_type` is a ModelType
    # instance and keys its dispatch table on ModelType.*_MODE constants --
    # confirm that ModelType.CLASSIFICATION is the right value to pass here.
    dataset = Dataset(path, test_path, source_dic, source_dic,
                      ModelType.CLASSIFICATION)

    for rcd in dataset.train():
        # Parenthesised single-argument form prints identically on Python 2
        # and is also valid on Python 3.
        print(rcd)