From 36fcc95cabdd74a9508f88666b51de8c29dc753f Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Dec 2017 13:11:11 +0800 Subject: [PATCH] Nmt decoder train (#6367) * init decoder_trainer * can run * fix lod * add sharelod to cross_entropy_grad_op * add avg_cost to fetch list * modify learning rate * can run * optimie code * add early exit * fix print * revert test_understand_sentiment_conv.py * add act to fc --- paddle/framework/op_desc.cc | 2 +- paddle/operators/concat_op.cc | 12 +- paddle/operators/cross_entropy_op.cc | 1 + python/paddle/v2/fluid/layers.py | 3 +- .../tests/book/test_machine_translation.py | 120 ++++++++++-------- 5 files changed, 80 insertions(+), 58 deletions(-) diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 2281d93df90..cde3f1ac2e4 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -59,7 +59,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); if (in_var->GetType() != VarDesc::LOD_TENSOR) { - VLOG(3) << "input " << in << "is not LodTensor"; + VLOG(3) << "input " << in << " is not LodTensor"; return; } PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc index 6134ac78b14..cf522d6921e 100644 --- a/paddle/operators/concat_op.cc +++ b/paddle/operators/concat_op.cc @@ -41,14 +41,18 @@ class ConcatOp : public framework::OperatorWithKernel { for (size_t j = 0; j < in_zero_dims_size; j++) { if (j == axis) { out_dims[axis] += ins[i][j]; - continue; + } else { + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); } - PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], - "Input tensors should have the same " - "elements except the specify axis."); } } + if (out_dims[axis] < 0) { + out_dims[axis] = -1; + } ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 1e82742eaf8..2b06012b690 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -95,6 +95,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "Input(Label) should be 1."); } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); } protected: diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index fb444f2d869..b4426bad149 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -430,7 +430,8 @@ def _create_op_func_(op_type): dtype = each.dtype elif dtype != each.dtype: raise ValueError( - "operator {0} must input same dtype".format(op_type)) + "operator {0} must input same dtype. {1} vs {2}".format( + op_type, dtype, each.dtype)) return dtype diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py index 5bc7e1b59d9..80ffc5a544c 100644 --- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -1,59 +1,62 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor, g_scope -from paddle.v2.fluid.optimizer import SGDOptimizer -import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as pd +from paddle.v2.fluid.executor import Executor dict_size = 30000 source_dict_dim = target_dict_dim = dict_size src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) -hidden_dim = 512 -word_dim = 512 +hidden_dim = 32 +word_dim = 16 IS_SPARSE = True -batch_size = 50 +batch_size = 10 max_length = 50 topk_size = 50 trg_dic_size = 10000 -src_word_id = layers.data(name="src_word_id", shape=[1], dtype='int64') -src_embedding = layers.embedding( - input=src_word_id, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr=fluid.ParamAttr(name='vemb')) - - -def encoder(): - - lstm_hidden0, lstm_0 = layers.dynamic_lstm( - input=src_embedding, - size=hidden_dim, - candidate_activation='sigmoid', - cell_activation='sigmoid') - - lstm_hidden1, lstm_1 = layers.dynamic_lstm( - input=src_embedding, - size=hidden_dim, - candidate_activation='sigmoid', - cell_activation='sigmoid', - is_reverse=True) - - bidirect_lstm_out = layers.concat([lstm_hidden0, lstm_hidden1], axis=0) - - return bidirect_lstm_out - - -def decoder_trainer(context): - ''' - decoder with trainer - ''' - pass +decoder_size = hidden_dim + + +def encoder_decoder(): + # encoder + src_word_id = layers.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last") + + # decoder + trg_language_word = layers.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + mem = rnn.memory(init=encoder_out) + fc1 = fluid.layers.fc(input=[current_word, mem], + size=decoder_size, + act='tanh') + out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') + rnn.update_memory(mem, fc1) + rnn.output(out) + + return rnn() def to_lodtensor(data, place): @@ -72,13 +75,18 @@ def to_lodtensor(data, place): def main(): - encoder_out = encoder() - # TODO(jacquesqiao) call here - decoder_trainer(encoder_out) + rnn_out = encoder_decoder() + label = layers.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = fluid.layers.mean(x=cost) + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) + optimizer.minimize(avg_cost) train_data = paddle.batch( paddle.reader.shuffle( - paddle.dataset.wmt14.train(8000), buf_size=1000), + paddle.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=batch_size) place = core.CPUPlace() @@ -88,15 +96,23 @@ def main(): batch_id = 0 for pass_id in xrange(2): - print 'pass_id', pass_id for data in train_data(): - print 'batch', batch_id - batch_id += 1 - if batch_id > 10: break word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) outs = exe.run(framework.default_main_program(), - feed={'src_word_id': word_data, }, - fetch_list=[encoder_out]) + feed={ + 'src_word_id': word_data, + 'target_language_word': trg_word, + 'target_language_next_word': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + exit(0) + batch_id += 1 if __name__ == '__main__': -- GitLab