#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor

# Model and data hyperparameters for the WMT'14 translation task.
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 32
word_dim = 16
IS_SPARSE = True
batch_size = 10
max_length = 50
topk_size = 50
trg_dic_size = 10000

decoder_size = hidden_dim


def encoder_decoder():
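    """Build the training graph: an LSTM encoder and a DynamicRNN decoder."""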
    # encoder
    src_word_id = layers.data(
        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
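    # Look up embeddings for the source words (parameter table 'vemb').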
    src_embedding = layers.embedding(
        input=src_word_id,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='vemb'))

    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
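    # dynamic_lstm takes input pre-projected to 4 * hidden_dim (one slice per gate).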
    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
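    # The LSTM's last hidden step serves as the fixed-size sentence encoding.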
    encoder_out = layers.sequence_last_step(input=lstm_hidden0)

    # decoder
    trg_language_word = layers.data(
        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
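    # Target words are embedded with the same 'vemb' table as the source.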
    trg_embedding = layers.embedding(
        input=trg_language_word,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
        param_attr=fluid.ParamAttr(name='vemb'))

    rnn = fluid.layers.DynamicRNN()
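    # DynamicRNN unrolls over the variable-length target sequence.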
    with rnn.block():
        current_word = rnn.step_input(trg_embedding)
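        # Decoder state, initialized from the encoder's sentence encoding.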
        mem = rnn.memory(init=encoder_out)
        fc1 = fluid.layers.fc(input=[current_word, mem],
                              size=decoder_size,
                              act='tanh')
        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
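        # fc1 is carried forward as the recurrent state for the next step.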
        rnn.update_memory(mem, fc1)
        rnn.output(out)

    return rnn()


def to_lodtensor(data, place):
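    """Pack a batch of variable-length sequences into a single LoDTensor.

    The LoD (level-of-detail) info stores cumulative offsets marking where
    each sequence begins in the flattened data.
    """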
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = core.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


def main():
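    """Train the encoder-decoder model on WMT'14 for a few mini-batches."""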
    rnn_out = encoder_decoder()
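    # Ground truth: the target sentence shifted by one word.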
    label = layers.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimizer.minimize(avg_cost)

    # Shuffled, batched reader over the WMT'14 training set.
    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

    place = core.CPUPlace()
    exe = Executor(place)

    # Run the startup program once to initialize parameters.
    exe.run(framework.default_startup_program())

    batch_id = 0
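    # Two passes over the data; the loop below exits after a few batches.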
    for pass_id in range(2):
        for data in train_data():
            # Each sample is a (src_ids, trg_ids, trg_next_ids) triple.
            word_data = to_lodtensor([x[0] for x in data], place)
            trg_word = to_lodtensor([x[1] for x in data], place)
            trg_word_next = to_lodtensor([x[2] for x in data], place)
            outs = exe.run(framework.default_main_program(),
                           feed={
                               'src_word_id': word_data,
                               'target_language_word': trg_word,
                               'target_language_next_word': trg_word_next
                           },
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])
            print('pass_id=%d batch=%d avg_cost=%s' %
                  (pass_id, batch_id, avg_cost_val))
            # Smoke test: stop after a handful of batches.
            if batch_id > 3:
                exit(0)
            batch_id += 1


if __name__ == '__main__':
    main()