api_train_v2.py 6.1 KB
Newer Older
D
dangqingqing 已提交
1 2
import sys
import math
D
update  
dangqingqing 已提交
3
import numpy as np
D
dangqingqing 已提交
4
import paddle.v2 as paddle
D
dangqingqing 已提交
5
import paddle.v2.dataset.conll05 as conll05
D
dangqingqing 已提交
6 7


D
dangqingqing 已提交
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
def db_lstm():
    """Assemble the stacked LSTM + CRF network used for training.

    Eight integer-sequence inputs (the word, five ``ctx_*`` inputs, the
    predicate and the mark) are embedded and projected into one hidden
    layer. ``depth`` LSTM layers are then stacked on top of it; each layer
    after the first is fed by a mixed layer that combines the previous mixed
    layer and the previous LSTM, and odd-numbered layers run in reverse.
    The last pair is projected to the label space and handed to a CRF cost
    layer and a CRF decoding layer, both of which name their parameter
    'crfw'.

    Returns:
        A ``(crf_cost, crf_dec)`` tuple of Paddle layers.
    """
    word_dict, verb_dict, label_dict = conll05.get_dict()
    vocab_size = len(word_dict)
    num_labels = len(label_dict)
    num_predicates = len(verb_dict)

    num_marks = 2
    word_dim = 32
    mark_dim = 5
    hidden_dim = 512
    depth = 8

    def seq_input(name, size):
        # Every input is declared as an integer sequence over `size` values.
        return paddle.layer.data(
            name=name, type=paddle.data_type.integer_value_sequence(size))

    # 8 features
    word = seq_input('word_data', vocab_size)
    predicate = seq_input('verb_data', num_predicates)

    contexts = [
        seq_input(ctx_name, vocab_size)
        for ctx_name in ('ctx_n2_data', 'ctx_n1_data', 'ctx_0_data',
                         'ctx_p1_data', 'ctx_p2_data')
    ]
    mark = seq_input('mark_data', num_marks)

    target = seq_input('target', num_labels)

    default_std = 1 / math.sqrt(hidden_dim) / 3.0

    # The word embedding table 'emb' is created with learning_rate=0.
    emb_attr = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.)
    zero_std_attr = paddle.attr.Param(initial_std=0.)
    default_std_attr = paddle.attr.Param(initial_std=default_std)

    predicate_emb = paddle.layer.embedding(
        size=word_dim,
        input=predicate,
        param_attr=paddle.attr.Param(
            name='vemb', initial_std=default_std))
    mark_emb = paddle.layer.embedding(
        size=mark_dim, input=mark, param_attr=zero_std_attr)

    # The word and the context inputs all use the 'emb' parameter attribute.
    embeddings = []
    for source in [word] + contexts:
        embeddings.append(
            paddle.layer.embedding(
                size=word_dim, input=source, param_attr=emb_attr))
    embeddings += [predicate_emb, mark_emb]

    input_projections = [
        paddle.layer.full_matrix_projection(
            input=emb, param_attr=default_std_attr) for emb in embeddings
    ]
    hidden_0 = paddle.layer.mixed(
        size=hidden_dim, bias_attr=default_std_attr, input=input_projections)

    mix_hidden_lr = 1e-3
    lstm_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
    hidden_attr = paddle.attr.Param(
        initial_std=default_std, learning_rate=mix_hidden_lr)

    def lstm_over(source, **direction):
        # One LSTM layer over `source`; `direction` optionally carries
        # the `reverse` flag.
        return paddle.layer.lstmemory(
            input=source,
            act=paddle.activation.Relu(),
            gate_act=paddle.activation.Sigmoid(),
            state_act=paddle.activation.Sigmoid(),
            bias_attr=zero_std_attr,
            param_attr=lstm_attr,
            **direction)

    def merge(size, hidden, lstm):
        # Mixed layer combining a hidden layer and an LSTM layer through
        # two full-matrix projections.
        return paddle.layer.mixed(
            size=size,
            bias_attr=default_std_attr,
            input=[
                paddle.layer.full_matrix_projection(
                    input=hidden, param_attr=hidden_attr),
                paddle.layer.full_matrix_projection(
                    input=lstm, param_attr=lstm_attr),
            ])

    prev_hidden = hidden_0
    prev_lstm = lstm_over(hidden_0)

    # stack L-LSTM and R-LSTM with direct edges
    for level in range(1, depth):
        mixed = merge(hidden_dim, prev_hidden, prev_lstm)
        # Odd levels read the sequence in reverse.
        prev_lstm = lstm_over(mixed, reverse=((level % 2) == 1))
        prev_hidden = mixed

    feature_out = merge(num_labels, prev_hidden, prev_lstm)

    crf_cost = paddle.layer.crf(
        size=num_labels,
        input=feature_out,
        label=target,
        param_attr=paddle.attr.Param(
            name='crfw',
            initial_std=default_std,
            learning_rate=mix_hidden_lr))

    crf_dec = paddle.layer.crf_decoding(
        name='crf_dec_l',
        size=num_labels,
        input=feature_out,
        label=target,
        param_attr=paddle.attr.Param(name='crfw'))

    return crf_cost, crf_dec
D
dangqingqing 已提交
130 131


D
update  
dangqingqing 已提交
132 133
def load_parameter(file_name, h, w):
    """Load a float32 matrix from a binary parameter file.

    The first 16 bytes of the file are skipped as a header; everything
    after them is read as float32 values and reshaped to ``(h, w)``.

    Args:
        file_name: Path of the binary file to read.
        h: Number of rows of the returned matrix.
        w: Number of columns of the returned matrix.

    Returns:
        A ``numpy.ndarray`` of dtype float32 with shape ``(h, w)``.

    Raises:
        ValueError: If the number of values after the header is not
            ``h * w`` (raised by ``reshape``).
    """
    header_size = 16
    with open(file_name, 'rb') as f:
        f.read(header_size)  # skip header.
        return np.fromfile(f, dtype=np.float32).reshape(h, w)


D
dangqingqing 已提交
138 139 140 141
def main():
    """Build the db_lstm network and train it with SGD on the CoNLL-05 reader.

    Initialises Paddle on CPU with a single trainer, creates the parameters
    for the CRF cost and decoding layers, overwrites the 'emb' parameter
    with the embedding file from ``conll05.get_embedding()``, and then runs
    ``trainer.train`` for 10000 passes, printing the cost and metrics every
    100 batches.
    """
    paddle.init(use_gpu=False, trainer_count=1)

    # define network topology
    crf_cost, crf_dec = db_lstm()

    # create parameters
    parameters = paddle.parameters.create([crf_cost, crf_dec])

    # create optimizer
    optimizer = paddle.optimizer.Momentum(
        momentum=0,
        learning_rate=2e-2,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        model_average=paddle.optimizer.ModelAverage(
            average_window=0.5, max_average_window=10000), )

    def event_handler(event):
        # Report progress on every 100th batch.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                # Parenthesised single-argument form prints the same text on
                # Python 2 and is also valid syntax on Python 3.
                print("Pass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))

    trainer = paddle.trainer.SGD(cost=crf_cost,
                                 parameters=parameters,
                                 update_equation=optimizer)

    # NOTE(review): 44068 x 32 is hard-coded; presumably the word dictionary
    # size and db_lstm's word_dim -- confirm they stay in sync.
    parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))

    # NOTE(review): the training reader is built from conll05.test() --
    # confirm this is intentional.
    trn_reader = paddle.batch(
        paddle.reader.shuffle(
            conll05.test(), buf_size=8192), batch_size=10)

    # Maps each data layer name to an index (presumably the position of that
    # field in a reader sample -- verify against the conll05 reader).
    feeding = {
        'word_data': 0,
        'ctx_n2_data': 1,
        'ctx_n1_data': 2,
        'ctx_0_data': 3,
        'ctx_p1_data': 4,
        'ctx_p2_data': 5,
        'verb_data': 6,
        'mark_data': 7,
        'target': 8
    }

    trainer.train(
        reader=trn_reader,
        event_handler=event_handler,
        num_passes=10000,
        feeding=feeding)
D
dangqingqing 已提交
187 188 189 190


# Run training only when this file is executed as a script.
if __name__ == '__main__':
    main()