# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor

# Use a shared 30k-word vocabulary for the source and target languages,
# loaded from the WMT'14 dataset.
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
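
# Model dimensions: small hidden and embedding sizes keep this test fast.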
hidden_dim = 32
word_dim = 16
IS_SPARSE = True
batch_size = 10
# Decoding hyper-parameters (reserved for generation; unused in this
# training-only test).
max_length = 50
topk_size = 50
trg_dic_size = 10000

decoder_size = hidden_dim


def encoder_decoder():
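    """Build a sequence-to-sequence network: an LSTM encoder over the source
    sentence feeding a DynamicRNN decoder that predicts the next target word.
    """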
    # encoder
    src_word_id = layers.data(
        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
    src_embedding = layers.embedding(
        input=src_word_id,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='vemb'))

    # dynamic_lstm expects its input width (and size) to be 4 * hidden_dim,
    # one slice per LSTM gate.
    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
    # Take the last step's hidden state as a fixed-length summary of the
    # source sentence; it seeds the decoder's memory.
    encoder_out = layers.sequence_last_step(input=lstm_hidden0)

    # decoder
    trg_language_word = layers.data(
        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
    trg_embedding = layers.embedding(
        input=trg_language_word,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='vemb'))

    # The decoder is a DynamicRNN that consumes one target word per step,
    # with its memory initialized from the encoder's final state.
    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        current_word = rnn.step_input(trg_embedding)
        mem = rnn.memory(init=encoder_out)
        fc1 = fluid.layers.fc(input=[current_word, mem],
                              size=decoder_size,
                              act='tanh')
        # Project the new hidden state onto the target vocabulary.
        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
        rnn.update_memory(mem, fc1)
        rnn.output(out)

    return rnn()


def to_lodtensor(data, place):
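    """Pack a list of variable-length sequences into a single LoDTensor.

    The sequences are flattened into one [total_len, 1] int64 array and the
    cumulative sequence lengths are recorded as LoD (level-of-detail) offsets
    so Fluid can recover the sequence boundaries.
    """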
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = core.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


def main():
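    """Build the training program and run a few mini-batches as a smoke test."""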
    rnn_out = encoder_decoder()
    label = layers.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    # Cross-entropy between the decoder's softmax output and the next target
    # word, averaged over the batch.
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimizer.minimize(avg_cost)

    # Shuffled mini-batch reader over the WMT'14 training set.
    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

    place = core.CPUPlace()
    exe = Executor(place)

    # Run the startup program once to initialize parameters.
    exe.run(framework.default_startup_program())

    batch_id = 0
    for pass_id in range(2):
        for data in train_data():
            # Each sample is a (source, target, next-target) word-id triple.
            word_data = to_lodtensor([x[0] for x in data], place)
            trg_word = to_lodtensor([x[1] for x in data], place)
            trg_word_next = to_lodtensor([x[2] for x in data], place)
            outs = exe.run(framework.default_main_program(),
                           feed={
                               'src_word_id': word_data,
                               'target_language_word': trg_word,
                               'target_language_next_word': trg_word_next
                           },
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])
            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                  " avg_cost=" + str(avg_cost_val))
            # Smoke test: stop after a few batches rather than training to
            # convergence.
            if batch_id > 3:
                exit(0)
            batch_id += 1


if __name__ == '__main__':
    main()