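'''
A work-in-progress machine translation example on the WMT-14 dataset: it
builds a bidirectional LSTM encoder over embedded source words and runs a
few forward-only batches; the decoder is still a stub.
'''
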
import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.wmt14  # makes paddle.dataset.wmt14 resolvable via the alias
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor

dict_size = 30000  # shared source/target vocabulary size
source_dict_dim = target_dict_dim = dict_size
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 512  # LSTM hidden size
word_dim = 512  # word embedding size
IS_SPARSE = True
batch_size = 50
# Decoder settings, reserved for the still-unimplemented decoder below.
max_length = 50
topk_size = 50
trg_dict_size = 10000

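# Source words arrive as variable-length sequences of int64 word ids (one id
# per time step) and are looked up in a dict_size x word_dim embedding table.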
src_word_id = layers.data(name="src_word_id", shape=[1], dtype='int64')
src_embedding = layers.embedding(
    input=src_word_id,
    size=[dict_size, word_dim],
    dtype='float32',
    is_sparse=IS_SPARSE,
    param_attr=fluid.ParamAttr(name='vemb'))


def encoder():
    # fluid's dynamic_lstm expects its input to already contain the four
    # gate projections, i.e. a width of 4 * hidden_dim, so project the
    # embeddings with an fc layer first.
    fc_forward = layers.fc(input=src_embedding, size=hidden_dim * 4)
    lstm_hidden0, _ = layers.dynamic_lstm(
        input=fc_forward,
        size=hidden_dim,
        candidate_activation='sigmoid',
        cell_activation='sigmoid')

    fc_backward = layers.fc(input=src_embedding, size=hidden_dim * 4)
    lstm_hidden1, _ = layers.dynamic_lstm(
        input=fc_backward,
        size=hidden_dim,
        candidate_activation='sigmoid',
        cell_activation='sigmoid',
        is_reverse=True)

    # Join the forward and backward hidden states along the feature axis
    # (axis=1), giving a [T, 2 * hidden_dim] sequence; axis=0 would wrongly
    # stack the two directions along the time axis.
    bidirect_lstm_out = layers.concat([lstm_hidden0, lstm_hidden1], axis=1)

    return bidirect_lstm_out


def decoder_trainer(context):
    '''
    Decoder used during training. Not implemented yet; `context` is the
    encoder output it is expected to consume (see the TODO in main()).
    '''
    pass


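# fluid represents a mini-batch of variable-length sequences as a LoDTensor:
# all sequences are flattened into one tensor, and a level-of-detail (LoD)
# table of cumulative offsets records where each sequence starts and ends.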
def to_lodtensor(data, place):
    '''
    Pack a list of variable-length id sequences into a single fluid
    LoDTensor placed on `place`.
    '''
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    # Flatten every sequence into one (total_words, 1) int64 column vector.
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = core.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])  # level-0 offsets mark each sequence's boundaries
    return res
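
# Illustrative usage: two sequences [1, 2, 3] and [4, 5] pack into a (5, 1)
# int64 tensor with lod [[0, 3, 5]]:
#   to_lodtensor([[1, 2, 3], [4, 5]], core.CPUPlace())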


def main():
    encoder_out = encoder()
    # TODO(jacquesqiao) call here
    decoder_trainer(encoder_out)

    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(8000), buf_size=1000),
        batch_size=batch_size)

    place = core.CPUPlace()
    exe = Executor(place)

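    # Run the startup program once to initialize the embedding, fc, and LSTM
    # parameters before any forward passes.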
    exe.run(framework.default_startup_program())

    batch_id = 0
    for pass_id in xrange(2):
        print 'pass_id', pass_id
        for data in train_data():
            print 'batch', batch_id
            batch_id += 1
            if batch_id > 10:  # keep this smoke test short
                break
            # Each wmt14 sample is a tuple whose first field is the list of
            # source word ids; only the source side is fed for now.
            word_data = to_lodtensor(map(lambda x: x[0], data), place)
            # Forward-only run: there is no cost or optimizer yet, so we just
            # fetch the encoder output to exercise the graph.
            outs = exe.run(framework.default_main_program(),
                           feed={'src_word_id': word_data},
                           fetch_list=[encoder_out])


if __name__ == '__main__':
    main()