train.py 8.2 KB
Newer Older
Q
qiaolongfei 已提交
1
import sys
C
caoying03 已提交
2
import numpy as np
L
Luo Tao 已提交
3

Q
qiaolongfei 已提交
4 5 6
import paddle.v2 as paddle


C
caoying03 已提交
7
def save_model(parameters, save_path):
    """Serialize ``parameters`` as a tar archive at ``save_path``.

    Args:
        parameters: object exposing ``to_tar(file_obj)``; here the value
            returned by ``paddle.parameters.create``.
        save_path: destination file path; an existing file is overwritten.
    """
    # A tar archive is binary data: text mode ('w') would translate newlines
    # on Windows and rejects bytes on Python 3, so open in binary mode.
    with open(save_path, 'wb') as f:
        parameters.to_tar(f)


def seq_to_seq_net(source_dict_dim,
                   target_dict_dim,
                   is_generating,
                   beam_size=3,
                   max_length=250):
    """Build an attention-based GRU encoder-decoder network.

    The encoder embeds the source word ids and runs a forward and a reversed
    ``simple_gru`` over them; their outputs are concatenated. The decoder is
    a GRU step with ``simple_attention`` over the encoded sequence, followed
    by a softmax layer of size ``target_dict_dim``.

    Args:
        source_dict_dim: size passed to ``integer_value_sequence`` for the
            ``'source_language_word'`` data layer.
        target_dict_dim: size of the target vocabulary; used for the target
            data layers and as the width of the softmax output.
        is_generating: if false, build the training network; otherwise build
            the beam-search generation network.
        beam_size: forwarded to ``paddle.layer.beam_search`` (generation
            only).
        max_length: forwarded to ``paddle.layer.beam_search`` (generation
            only).

    Returns:
        The ``classification_cost`` layer when ``is_generating`` is false,
        otherwise the ``beam_search`` layer.
    """
    # NOTE: layers are created in a fixed order on purpose; do not reorder
    # the constructor calls below (auto-generated layer/parameter names may
    # depend on creation order -- TODO confirm against Paddle v2 docs).

    ### Network Architecture
    word_vector_dim = 512  # dimension of word vector
    decoder_size = 512  # dimension of hidden unit of GRU decoder
    encoder_size = 512  # dimension of hidden unit of GRU encoder

    #### Encoder
    src_word_id = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    src_embedding = paddle.layer.embedding(
        input=src_word_id, size=word_vector_dim)
    src_forward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size)
    src_backward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size, reverse=True)
    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])

    #### Decoder
    # Linear projection of the encoded sequence, consumed by the attention.
    encoded_proj = paddle.layer.fc(
        act=paddle.activation.Linear(),
        size=decoder_size,
        bias_attr=False,
        input=encoded_vector)

    # The first element of the reversed-GRU output seeds the decoder state.
    backward_first = paddle.layer.first_seq(input=src_backward)

    decoder_boot = paddle.layer.fc(
        size=decoder_size,
        act=paddle.activation.Tanh(),
        bias_attr=False,
        input=backward_first)

    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        """One decoder step: attention context + current word -> word probs."""

        # The memory shares its name with the gru_step layer below and is
        # booted from decoder_boot (presumably it then carries that layer's
        # previous-step output -- confirm against Paddle v2 docs).
        decoder_mem = paddle.layer.memory(
            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

        context = paddle.networks.simple_attention(
            encoded_sequence=enc_vec,
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

        # NOTE(review): the 3x width presumably matches what gru_step expects
        # as input (gates + candidate) -- confirm against Paddle v2 docs.
        decoder_inputs = paddle.layer.fc(
            act=paddle.activation.Linear(),
            size=decoder_size * 3,
            bias_attr=False,
            input=[context, current_word],
            layer_attr=paddle.attr.ExtraLayerAttribute(
                error_clipping_threshold=100.0))

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
            input=decoder_inputs,
            output_mem=decoder_mem,
            size=decoder_size)

        out = paddle.layer.fc(
            size=target_dict_dim,
            bias_attr=True,
            act=paddle.activation.Softmax(),
            input=gru_step)
        return out

    decoder_group_name = 'decoder_group'
    group_input1 = paddle.layer.StaticInput(input=encoded_vector)
    group_input2 = paddle.layer.StaticInput(input=encoded_proj)
    group_inputs = [group_input1, group_input2]

    if not is_generating:
        # The embedding parameter is named explicitly so that the generation
        # network below can refer to it through ``embedding_name``.
        trg_embedding = paddle.layer.embedding(
            input=paddle.layer.data(
                name='target_language_word',
                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
            size=word_vector_dim,
            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
        group_inputs.append(trg_embedding)

        # For a decoder equipped with an attention mechanism, in training,
        # the target embedding (the ground truth) is the data input,
        # while the encoded source sequence is accessed as an unbounded
        # memory. Here, the StaticInput defines a read-only memory
        # for the recurrent_group.
        decoder = paddle.layer.recurrent_group(
            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs)

        lbl = paddle.layer.data(
            name='target_language_next_word',
            type=paddle.data_type.integer_value_sequence(target_dict_dim))
        cost = paddle.layer.classification_cost(input=decoder, label=lbl)

        return cost

    # In generation, the decoder predicts the next target word based on
    # the encoded source sequence and the previously generated target word.

    # The encoded source sequence (encoder's output) must be specified by
    # StaticInput, which is a read-only memory.
    # The embedding of the previously generated word is automatically
    # retrieved by GeneratedInput, initialized by a start mark <s>.

    trg_embedding = paddle.layer.GeneratedInput(
        size=target_dict_dim,
        embedding_name='_target_language_embedding',
        embedding_size=word_vector_dim)
    group_inputs.append(trg_embedding)

    beam_gen = paddle.layer.beam_search(
        name=decoder_group_name,
        step=gru_decoder_with_attention,
        input=group_inputs,
        bos_id=0,
        eos_id=1,
        beam_size=beam_size,
        max_length=max_length)

    return beam_gen
Q
qiaolongfei 已提交
135 136 137


def main():
    """Train the seq2seq network on WMT14, or generate translations.

    The mode is selected by the local ``is_generating`` flag. In training
    mode, parameters are written to ``params_pass_*_batch_*.tar`` every 10
    batches and to ``params_pass_*.tar`` at the end of each pass. In
    generation mode, the first 3 samples of the WMT14 generation set are
    translated with beam search and printed with their scores.
    """
    paddle.init(use_gpu=False, trainer_count=1)
    is_generating = False

    # source and target dict dim.
    dict_size = 30000
    source_dict_dim = target_dict_dim = dict_size

    # train the network
    if not is_generating:
        # define optimize method and trainer
        optimizer = paddle.optimizer.Adam(
            learning_rate=5e-5,
            regularization=paddle.optimizer.L2Regularization(rate=8e-4))

        cost = seq_to_seq_net(source_dict_dim, target_dict_dim, is_generating)
        parameters = paddle.parameters.create(cost)

        trainer = paddle.trainer.SGD(
            cost=cost, parameters=parameters, update_equation=optimizer)
        # define data reader
        wmt14_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
            batch_size=4)

        # define event_handler callback
        def event_handler(event):
            if isinstance(event, paddle.event.EndIteration):
                if event.batch_id % 10 == 0:
                    # Every 10th batch: report progress, then checkpoint.
                    print("\nPass %d, Batch %d, Cost %f, %s" %
                          (event.pass_id, event.batch_id, event.cost,
                           event.metrics))
                    save_path = 'params_pass_%05d_batch_%05d.tar' % (
                        event.pass_id, event.batch_id)
                    save_model(parameters, save_path)
                else:
                    sys.stdout.write('.')
                    sys.stdout.flush()

            if isinstance(event, paddle.event.EndPass):
                # save parameters
                save_path = 'params_pass_%05d.tar' % (event.pass_id)
                save_model(parameters, save_path)

        # start to train
        trainer.train(
            reader=wmt14_reader, event_handler=event_handler, num_passes=2)

    else:
        # translate English sequences to French
        # use the first 3 samples for generation
        gen_data = []
        gen_num = 3
        for item in paddle.dataset.wmt14.gen(dict_size)():
            gen_data.append([item[0]])
            if len(gen_data) == gen_num:
                break

        beam_size = 3
        beam_gen = seq_to_seq_net(source_dict_dim, target_dict_dim,
                                  is_generating, beam_size)

        # get the trained model, whose bleu = 26.92
        parameters = paddle.dataset.wmt14.model()

        # prob is the prediction probabilities, and id is the prediction word.
        beam_result = paddle.infer(
            output_layer=beam_gen,
            parameters=parameters,
            input=gen_data,
            field=['prob', 'id'])

        # load the dictionary
        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

        gen_sen_idx = np.where(beam_result[1] == -1)[0]
        assert len(gen_sen_idx) == len(gen_data) * beam_size

        # -1 is the delimiter of generated sequences.
        # the first element of each generated sequence is its length.
        start_pos, end_pos = 1, 0
        for i, sample in enumerate(gen_data):
            # skip the start and ending mark when printing the source sentence
            print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
            # range (not the Python-2-only xrange) keeps this loop portable.
            for j in range(beam_size):
                end_pos = gen_sen_idx[i * beam_size + j]
                print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
                    trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
                # step over the -1 delimiter and the next sequence's length
                start_pos = end_pos + 2
            print("\n")
Q
qiaolongfei 已提交
231 232 233 234


# Script entry point: run main() only when executed directly, not on import.
if '__main__' == __name__:
    main()