api_train_v2.py 9.0 KB
Newer Older
D
dangqingqing 已提交
1
import math
D
update  
dangqingqing 已提交
2
import numpy as np
C
caoying03 已提交
3 4
import gzip
import logging
D
dangqingqing 已提交
5
import paddle.v2.dataset.conll05 as conll05
C
caoying03 已提交
6 7
import paddle.v2.evaluator as evaluator
import paddle.v2 as paddle
D
dangqingqing 已提交
8

C
caoying03 已提交
9 10
# Module-wide logger used for training/testing progress messages.
logger = logging.getLogger('paddle')

C
caoying03 已提交
11 12 13 14
# Vocabulary and label mappings from the CoNLL-2005 SRL dataset:
# words, predicate verbs, and role labels, each mapped token -> id.
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(verb_dict)
D
dangqingqing 已提交
15

C
caoying03 已提交
16 17 18 19 20 21 22
# Network hyper-parameters.
mark_dict_len = 2  # predicate-mark feature is binary (0/1)
word_dim = 32  # word / context-word embedding size
mark_dim = 5  # mark-feature embedding size
hidden_dim = 512  # width of mixed and LSTM layers
depth = 8  # number of stacked LSTM levels
default_std = 1 / math.sqrt(hidden_dim) / 3.0  # init std for most parameters
mix_hidden_lr = 1e-3  # reduced learning rate for mixed-layer projections
D
dangqingqing 已提交
23 24


C
caoying03 已提交
25 26 27 28 29 30
def d_type(size):
    """Shorthand: integer-valued sequence data type with vocabulary *size*."""
    return paddle.data_type.integer_value_sequence(size)


def db_lstm():
    """Build the deep bidirectional LSTM network for semantic role labeling.

    Returns the final `feature_out` mixed layer (width = label_dict_len);
    callers attach CRF cost / decoding layers on top of it.

    NOTE(review): paddle.v2 registers layers into a global topology and
    shares parameters by name ('emb', 'vemb', ...), so the statement order
    here is significant — do not reorder layer creation.
    """
    #8 features
    word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
    predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))

    ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
    ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
    ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
    ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
    ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len))
    mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))

    # 'emb' is static: it is filled from the pre-trained table in train()
    # and never updated during training.
    emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True)
    std_0 = paddle.attr.Param(initial_std=0.)
    std_default = paddle.attr.Param(initial_std=default_std)

    predicate_embedding = paddle.layer.embedding(
        size=word_dim,
        input=predicate,
        param_attr=paddle.attr.Param(
            name='vemb', initial_std=default_std))
    mark_embedding = paddle.layer.embedding(
        size=mark_dim, input=mark, param_attr=std_0)

    # The word and the five context words all share the static 'emb' table.
    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
    emb_layers = [
        paddle.layer.embedding(
            size=word_dim, input=x, param_attr=emb_para) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)

    # First hidden layer: sum of full projections of all eight embeddings.
    hidden_0 = paddle.layer.mixed(
        size=hidden_dim,
        bias_attr=std_default,
        input=[
            paddle.layer.full_matrix_projection(
                input=emb, param_attr=std_default) for emb in emb_layers
        ])

    lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
    hidden_para_attr = paddle.attr.Param(
        initial_std=default_std, learning_rate=mix_hidden_lr)

    lstm_0 = paddle.layer.lstmemory(
        input=hidden_0,
        act=paddle.activation.Relu(),
        gate_act=paddle.activation.Sigmoid(),
        state_act=paddle.activation.Sigmoid(),
        bias_attr=std_0,
        param_attr=lstm_para_attr)

    #stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]

    for i in range(1, depth):
        # Each level mixes the previous level's hidden layer and LSTM output.
        mix_hidden = paddle.layer.mixed(
            size=hidden_dim,
            bias_attr=std_default,
            input=[
                paddle.layer.full_matrix_projection(
                    input=input_tmp[0], param_attr=hidden_para_attr),
                paddle.layer.full_matrix_projection(
                    input=input_tmp[1], param_attr=lstm_para_attr)
            ])

        # Alternate direction per level to build the bidirectional stack.
        lstm = paddle.layer.lstmemory(
            input=mix_hidden,
            act=paddle.activation.Relu(),
            gate_act=paddle.activation.Sigmoid(),
            state_act=paddle.activation.Sigmoid(),
            reverse=((i % 2) == 1),
            bias_attr=std_0,
            param_attr=lstm_para_attr)

        input_tmp = [mix_hidden, lstm]

    # Project the top hidden layer and LSTM output to label scores.
    feature_out = paddle.layer.mixed(
        size=label_dict_len,
        bias_attr=std_default,
        input=[
            paddle.layer.full_matrix_projection(
                input=input_tmp[0], param_attr=hidden_para_attr),
            paddle.layer.full_matrix_projection(
                input=input_tmp[1], param_attr=lstm_para_attr)
        ], )

    return feature_out


def load_parameter(file_name, h, w):
    """Read a pre-trained (h, w) float32 parameter matrix from *file_name*.

    The file layout is a 16-byte header followed by raw float32 values in
    row-major order; the header is discarded.
    """
    with open(file_name, 'rb') as param_file:
        param_file.seek(16)  # skip the fixed-size header
        flat = np.fromfile(param_file, dtype=np.float32)
    return flat.reshape(h, w)


C
caoying03 已提交
125
def train():
    """Train the SRL model for 10 passes, saving parameters after each pass.

    Side effects: initializes paddle, downloads/reads the CoNLL-05 data and
    embedding, writes 'params_pass_<N>.tar.gz' per pass, logs progress.
    """
    paddle.init(use_gpu=False, trainer_count=1)

    # define network topology
    feature_out = db_lstm()
    target = paddle.layer.data(name='target', type=d_type(label_dict_len))
    # CRF training cost; the 'crfw' parameter is shared with the decoding
    # layer below by name.
    crf_cost = paddle.layer.crf(size=label_dict_len,
                                input=feature_out,
                                label=target,
                                param_attr=paddle.attr.Param(
                                    name='crfw',
                                    initial_std=default_std,
                                    learning_rate=mix_hidden_lr))

    # Viterbi decoding layer (used for evaluation only), sharing 'crfw'.
    crf_dec = paddle.layer.crf_decoding(
        size=label_dict_len,
        input=feature_out,
        label=target,
        param_attr=paddle.attr.Param(name='crfw'))
    evaluator.sum(input=crf_dec)

    # create parameters
    parameters = paddle.parameters.create(crf_cost)
    # Fill the static 'emb' table with the bundled pre-trained 44068x32
    # embedding matrix (see db_lstm: is_static=True, so it stays frozen).
    parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))

    # create optimizer
    optimizer = paddle.optimizer.Momentum(
        momentum=0,
        learning_rate=2e-2,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        model_average=paddle.optimizer.ModelAverage(
            average_window=0.5, max_average_window=10000), )

    trainer = paddle.trainer.SGD(cost=crf_cost,
                                 parameters=parameters,
                                 update_equation=optimizer,
                                 extra_layers=crf_dec)

    # NOTE(review): this trains on the conll05 *test* reader — presumably a
    # demo shortcut; confirm before using for a real experiment.
    reader = paddle.batch(
        paddle.reader.shuffle(
            conll05.test(), buf_size=8192), batch_size=10)

    # Map each data-layer name to its column index in the reader's tuples.
    feeding = {
        'word_data': 0,
        'ctx_n2_data': 1,
        'ctx_n1_data': 2,
        'ctx_0_data': 3,
        'ctx_p1_data': 4,
        'ctx_p2_data': 5,
        'verb_data': 6,
        'mark_data': 7,
        'target': 8
    }

    def event_handler(event):
        # Log cost every 100 batches; evaluate every 1000 batches and at the
        # end of every pass (after saving a parameter snapshot).
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                logger.info("Pass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))
            if event.batch_id and event.batch_id % 1000 == 0:
                result = trainer.test(reader=reader, feeding=feeding)
                logger.info("\nTest with Pass %d, Batch %d, %s" %
                            (event.pass_id, event.batch_id, result.metrics))

        if isinstance(event, paddle.event.EndPass):
            # save parameters
            with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
                parameters.to_tar(f)

            result = trainer.test(reader=reader, feeding=feeding)
            logger.info("\nTest with Pass %d, %s" %
                        (event.pass_id, result.metrics))

    trainer.train(
        reader=reader,
        event_handler=event_handler,
        num_passes=10,
        feeding=feeding)
C
caoying03 已提交
203 204


C
caoying03 已提交
205 206 207
def infer_a_batch(inferer, test_data, word_dict, pred_dict, label_dict):
    """Decode one batch and print 'predicate<TAB>word[label] ...' per sample.

    `probs` is a single flat sequence of label ids covering every word of
    every sample in the batch, so the slicing offset must accumulate across
    samples.

    :param inferer: object with an ``infer(input=..., field='id')`` method.
    :param test_data: list of samples; sample[0] is the word-id list,
        sample[6] holds the predicate id.
    :param word_dict, pred_dict, label_dict: id -> string reverse maps.
    """
    probs = inferer.infer(input=test_data, field='id')
    assert len(probs) == sum(len(x[0]) for x in test_data)

    # BUGFIX: `start_id` used to be reset to 0 inside the loop, so every
    # sample after the first was decoded with the first sample's labels.
    start_id = 0
    for test_sample in test_data:
        pred_str = "%s\t" % (pred_dict[test_sample[6][0]])

        for w, tag in zip(test_sample[0],
                          probs[start_id:start_id + len(test_sample[0])]):
            pred_str += "%s[%s] " % (word_dict[w], label_dict[tag])
        print(pred_str.strip())
        start_id += len(test_sample[0])
C
caoying03 已提交
218

C
caoying03 已提交
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273

def infer():
    """Run SRL inference with the parameters saved from pass 0.

    Loads 'params_pass_0.tar.gz', rebuilds the network topology, and prints
    the predicted role label for every word of every test sample in batches
    of `test_batch_size`.
    """
    # The model emits integer ids; build id -> string maps for printing.
    # FIX: dict.items() replaces the Python-2-only dict.iteritems().
    label_dict_reverse = dict(
        (value, key) for key, value in label_dict.items())
    word_dict_reverse = dict(
        (value, key) for key, value in word_dict.items())
    pred_dict_reverse = dict(
        (value, key) for key, value in verb_dict.items())

    test_creator = paddle.dataset.conll05.test()

    paddle.init(use_gpu=False, trainer_count=1)

    # define network topology
    feature_out = db_lstm()
    predict = paddle.layer.crf_decoding(
        size=label_dict_len,
        input=feature_out,
        param_attr=paddle.attr.Param(name='crfw'))

    test_pass = 0
    with gzip.open('params_pass_%d.tar.gz' % (test_pass)) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)
        inferer = paddle.inference.Inference(
            output_layer=predict, parameters=parameters)

        # prepare test data
        test_data = []
        test_batch_size = 50

        for idx, item in enumerate(test_creator()):
            test_data.append(item[0:8])  # drop the gold-label field

            if idx and (not idx % test_batch_size):
                infer_a_batch(
                    inferer,
                    test_data,
                    word_dict_reverse,
                    pred_dict_reverse,
                    label_dict_reverse, )
                test_data = []
        # Flush the final partial batch; the guard avoids calling the
        # inferer with an empty input list.
        if test_data:
            infer_a_batch(
                inferer,
                test_data,
                word_dict_reverse,
                pred_dict_reverse,
                label_dict_reverse, )
        test_data = []


def main(is_inferring=False):
    """Entry point: run inference when *is_inferring* is set, else train."""
    action = infer if is_inferring else train
    action()
D
dangqingqing 已提交
274 275 276


if __name__ == '__main__':
    # Default to training; pass is_inferring=True to run inference instead.
    main(is_inferring=False)