train.py 8.3 KB
Newer Older
D
dzhwinter 已提交
1
import sys, os
C
caoying03 已提交
2
import numpy as np
Q
qiaolongfei 已提交
3 4
import paddle.v2 as paddle

D
dzhwinter 已提交
5
# GPU use is opt-in through the environment: any WITH_GPU value other than
# the string '0' turns it on, and an unset variable is treated as '0'.
with_gpu = os.environ.get('WITH_GPU', '0') != '0'
Q
qiaolongfei 已提交
6

H
Helin Wang 已提交
7

C
caoying03 已提交
8
def save_model(parameters, save_path):
    """Serialize ``parameters`` into a tar archive at ``save_path``.

    Args:
        parameters: Object exposing ``to_tar(file_obj)``; in this script it
            is the value returned by ``paddle.parameters.create``.
        save_path: Destination file path. An existing file is overwritten.
    """
    # A tar archive is binary data, so the file has to be opened in binary
    # mode: text mode ('w') rejects bytes on Python 3 and may translate
    # newline bytes on Windows, which corrupts the archive.
    with open(save_path, 'wb') as f:
        parameters.to_tar(f)


def seq_to_seq_net(source_dict_dim,
                   target_dict_dim,
                   is_generating,
                   beam_size=3,
                   max_length=250):
    """Build the attention-based GRU encoder-decoder network.

    Args:
        source_dict_dim: Size of the source-language dictionary.
        target_dict_dim: Size of the target-language dictionary.
        is_generating: When false, the training graph is built and its
            classification cost is returned; when true, the beam-search
            generator is built instead.
        beam_size: Passed to ``paddle.layer.beam_search`` (generation only).
        max_length: Passed to ``paddle.layer.beam_search`` (generation only).

    Returns:
        The cost layer in training mode, otherwise the beam-search layer.
    """
    # NOTE(review): layers are deliberately created in a fixed order
    # (source data, target data, label data); presumably data feeding and
    # auto-generated layer names depend on it -- confirm before reordering.

    ### Network Architecture
    embedding_dim = 512  # dimension of word vector
    decoder_dim = 512  # dimension of hidden unit of GRU decoder
    encoder_dim = 512  # dimension of hidden unit of GRU encoder

    #### Encoder
    source_words = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    source_emb = paddle.layer.embedding(
        input=source_words, size=embedding_dim)
    forward_states = paddle.networks.simple_gru(
        input=source_emb, size=encoder_dim)
    backward_states = paddle.networks.simple_gru(
        input=source_emb, size=encoder_dim, reverse=True)
    # Forward and backward GRU outputs are concatenated into one sequence.
    encoder_states = paddle.layer.concat(
        input=[forward_states, backward_states])

    #### Decoder
    encoder_projection = paddle.layer.fc(
        input=encoder_states,
        size=decoder_dim,
        act=paddle.activation.Linear(),
        bias_attr=False)

    # The decoder's initial state is derived from the first element of the
    # backward encoder sequence.
    backward_head = paddle.layer.first_seq(input=backward_states)
    initial_state = paddle.layer.fc(
        input=backward_head,
        size=decoder_dim,
        act=paddle.activation.Tanh(),
        bias_attr=False)

    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        """Run one decoder step and return the softmax over target words.

        The memory layer and the ``gru_step`` layer share the name
        'gru_decoder'; the memory is booted from ``initial_state``.
        """
        previous_state = paddle.layer.memory(
            name='gru_decoder', size=decoder_dim, boot_layer=initial_state)

        attention_context = paddle.networks.simple_attention(
            encoded_sequence=enc_vec,
            encoded_proj=enc_proj,
            decoder_state=previous_state)

        step_input = paddle.layer.fc(
            input=[attention_context, current_word],
            size=decoder_dim * 3,
            act=paddle.activation.Linear(),
            bias_attr=False,
            layer_attr=paddle.attr.ExtraLayerAttribute(
                error_clipping_threshold=100.0))

        current_state = paddle.layer.gru_step(
            name='gru_decoder',
            input=step_input,
            output_mem=previous_state,
            size=decoder_dim)

        return paddle.layer.fc(
            input=current_state,
            size=target_dict_dim,
            act=paddle.activation.Softmax(),
            bias_attr=True)

    group_name = 'decoder_group'
    # The encoded source sequence and its projection are handed to the
    # decoder as StaticInput, i.e. a read-only memory.
    step_inputs = [
        paddle.layer.StaticInput(input=encoder_states),
        paddle.layer.StaticInput(input=encoder_projection),
    ]

    if is_generating:
        # In generation, the decoder predicts the next target word from the
        # encoded source sequence and the previously generated target word.
        # The embedding of the previously generated word is retrieved
        # automatically by GeneratedInput, initialized by a start mark <s>.
        previous_word_emb = paddle.layer.GeneratedInput(
            size=target_dict_dim,
            embedding_name='_target_language_embedding',
            embedding_size=embedding_dim)
        step_inputs.append(previous_word_emb)

        return paddle.layer.beam_search(
            name=group_name,
            step=gru_decoder_with_attention,
            input=step_inputs,
            bos_id=0,
            eos_id=1,
            beam_size=beam_size,
            max_length=max_length)

    # In training, the target embedding (the ground truth) is the data
    # input, while the encoded source sequence is accessed as an unbounded,
    # read-only memory of the recurrent_group.
    target_words = paddle.layer.data(
        name='target_language_word',
        type=paddle.data_type.integer_value_sequence(target_dict_dim))
    target_emb = paddle.layer.embedding(
        input=target_words,
        size=embedding_dim,
        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
    step_inputs.append(target_emb)

    decoder_output = paddle.layer.recurrent_group(
        name=group_name,
        step=gru_decoder_with_attention,
        input=step_inputs)

    next_words = paddle.layer.data(
        name='target_language_next_word',
        type=paddle.data_type.integer_value_sequence(target_dict_dim))
    return paddle.layer.classification_cost(
        input=decoder_output, label=next_words)
Q
qiaolongfei 已提交
136 137 138


def _train(dict_size, source_dict_dim, target_dict_dim):
    """Train the translation network on WMT14 and checkpoint its parameters.

    Args:
        dict_size: Dictionary size passed to the WMT14 training reader.
        source_dict_dim: Source dictionary size passed to ``seq_to_seq_net``.
        target_dict_dim: Target dictionary size passed to ``seq_to_seq_net``.
    """
    # define optimize method and trainer
    optimizer = paddle.optimizer.Adam(
        learning_rate=5e-5,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4))

    cost = seq_to_seq_net(source_dict_dim, target_dict_dim, False)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # define data reader
    wmt14_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=8192),
        batch_size=4)

    # define event_handler callback
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                # Every 10th batch: report progress, then checkpoint.
                print("\nPass %d, Batch %d, Cost %f, %s" %
                      (event.pass_id, event.batch_id, event.cost,
                       event.metrics))
                save_path = 'params_pass_%05d_batch_%05d.tar' % (
                    event.pass_id, event.batch_id)
                save_model(parameters, save_path)
            else:
                sys.stdout.write('.')
                sys.stdout.flush()

        if isinstance(event, paddle.event.EndPass):
            # save parameters at the end of every pass
            save_path = 'params_pass_%05d.tar' % (event.pass_id)
            save_model(parameters, save_path)

    # start to train
    trainer.train(
        reader=wmt14_reader, event_handler=event_handler, num_passes=2)


def _generate(dict_size, source_dict_dim, target_dict_dim):
    """Translate a few WMT14 samples with beam search and print the results.

    Args:
        dict_size: Dictionary size passed to the WMT14 reader and dictionary
            loader.
        source_dict_dim: Source dictionary size passed to ``seq_to_seq_net``.
        target_dict_dim: Target dictionary size passed to ``seq_to_seq_net``.
    """
    # use the first 3 samples for generation
    gen_data = []
    gen_num = 3
    for item in paddle.dataset.wmt14.gen(dict_size)():
        gen_data.append([item[0]])
        if len(gen_data) == gen_num:
            break

    beam_size = 3
    beam_gen = seq_to_seq_net(source_dict_dim, target_dict_dim, True,
                              beam_size)

    # get the trained model, whose bleu = 26.92
    parameters = paddle.dataset.wmt14.model()

    # prob is the prediction probabilities, and id is the prediction word.
    beam_result = paddle.infer(
        output_layer=beam_gen,
        parameters=parameters,
        input=gen_data,
        field=['prob', 'id'])

    # load the dictionary
    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

    gen_sen_idx = np.where(beam_result[1] == -1)[0]
    assert len(gen_sen_idx) == len(gen_data) * beam_size

    # -1 is the delimiter of generated sequences.
    # The first element of each generated sequence is its length.
    start_pos, end_pos = 1, 0
    for i, sample in enumerate(gen_data):
        # skip the start and ending mark when printing the source sentence
        print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
        # `range` instead of `xrange` so the loop runs on Python 2 and 3.
        for j in range(beam_size):
            end_pos = gen_sen_idx[i * beam_size + j]
            print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
                trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
            # Step over the -1 delimiter and the next sequence's length.
            start_pos = end_pos + 2
        print("\n")


def main():
    """Initialize paddle, then train the model or generate translations.

    The mode is selected by the local ``is_generating`` flag, which is
    hard-coded to ``False`` (training).
    """
    paddle.init(use_gpu=with_gpu, trainer_count=1)
    is_generating = False

    # source and target dict dim.
    dict_size = 30000
    source_dict_dim = target_dict_dim = dict_size

    if is_generating:
        # generate French translations of English source sentences
        _generate(dict_size, source_dict_dim, target_dict_dim)
    else:
        # train the network
        _train(dict_size, source_dict_dim, target_dict_dim)
Q
qiaolongfei 已提交
232 233 234 235


# Only run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()