Unverified commit 36fcc95c, authored by Qiao Longfei, committed by GitHub

Nmt decoder train (#6367)

* init decoder_trainer

* can run

* fix lod

* add sharelod to cross_entropy_grad_op

* add avg_cost to fetch list

* modify learning rate

* can run

* optimize code

* add early exit

* fix print

* revert test_understand_sentiment_conv.py

* add act to fc
Parent 7d85b6d3
...
@@ -59,7 +59,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != VarDesc::LOD_TENSOR) {
-      VLOG(3) << "input " << in << "is not LodTensor";
+      VLOG(3) << "input " << in << " is not LodTensor";
       return;
     }
     PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
...
...
@@ -41,14 +41,18 @@ class ConcatOp : public framework::OperatorWithKernel {
       for (size_t j = 0; j < in_zero_dims_size; j++) {
         if (j == axis) {
           out_dims[axis] += ins[i][j];
-          continue;
+        } else {
+          PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
+                            "Input tensors should have the same "
+                            "elements except the specify axis.");
         }
-        PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
-                          "Input tensors should have the same "
-                          "elements except the specify axis.");
       }
     }
+    if (out_dims[axis] < 0) {
+      out_dims[axis] = -1;
+    }
     ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
...
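As a side note (not part of the diff), the shape-inference rule this ConcatOp change implements can be sketched in a few lines of plain Python: dimensions along the concatenation axis are summed, every other dimension must match across inputs, and if any input is unknown (-1) along the axis, the new `if (out_dims[axis] < 0)` guard keeps the output axis unknown instead of leaking a meaningless negative sum. The helper below is illustrative only.

def concat_infer_shape(in_dims, axis):
    # in_dims: one shape list per input, e.g. [[-1, 3], [4, 3]]; -1 means unknown at compile time
    out_dims = list(in_dims[0])
    for dims in in_dims[1:]:
        for j, d in enumerate(dims):
            if j == axis:
                out_dims[axis] += d
            else:
                assert out_dims[j] == d, "non-concat dims must match"
    if out_dims[axis] < 0:   # some input was unknown along the axis
        out_dims[axis] = -1  # keep the output axis unknown too
    return out_dims

print(concat_infer_shape([[-1, 3], [4, 3]], axis=0))  # [-1, 3]
print(concat_infer_shape([[2, 3], [4, 3]], axis=0))   # [6, 3]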
...
@@ -95,6 +95,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                       "Input(Label) should be 1.");
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
   }

  protected:
...
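For context (an illustration with made-up numbers, not Paddle API), the ShareLoD call matters because LoD is what lets a packed batch of variable-length sequences be split back apart; the gradient w.r.t. X is row-aligned with X, so it must carry exactly the same offsets:

import numpy as np

# Level-1 LoD: three sequences of lengths 2, 3 and 1 packed into one tensor;
# the LoD holds cumulative offsets into the packed rows.
seq_lens = [2, 3, 1]
lod = np.cumsum([0] + seq_lens).tolist()   # [0, 2, 5, 6]
x = np.random.rand(lod[-1], 4)             # one row per time step

# Slicing the gradient back into per-sequence pieces only works with the same
# offsets as the forward input -- which is what
# ctx->ShareLoD("X", framework::GradVarName("X")) propagates at compile time.
x_grad = np.ones_like(x)
per_seq = [x_grad[lod[i]:lod[i + 1]] for i in range(len(seq_lens))]
print([g.shape for g in per_seq])          # [(2, 4), (3, 4), (1, 4)]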
...
@@ -430,7 +430,8 @@ def _create_op_func_(op_type):
                 dtype = each.dtype
             elif dtype != each.dtype:
                 raise ValueError(
-                    "operator {0} must input same dtype".format(op_type))
+                    "operator {0} must input same dtype. {1} vs {2}".format(
+                        op_type, dtype, each.dtype))
         return dtype
...
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.executor import Executor, g_scope
-from paddle.v2.fluid.optimizer import SGDOptimizer
-import paddle.v2.fluid as fluid
-import paddle.v2.fluid.layers as pd
+from paddle.v2.fluid.executor import Executor

 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
 src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-hidden_dim = 512
-word_dim = 512
+hidden_dim = 32
+word_dim = 16
 IS_SPARSE = True
-batch_size = 50
+batch_size = 10
 max_length = 50
 topk_size = 50
 trg_dic_size = 10000
+decoder_size = hidden_dim

-src_word_id = layers.data(name="src_word_id", shape=[1], dtype='int64')
-src_embedding = layers.embedding(
-    input=src_word_id,
-    size=[dict_size, word_dim],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr=fluid.ParamAttr(name='vemb'))
-
-
-def encoder():
-
-    lstm_hidden0, lstm_0 = layers.dynamic_lstm(
-        input=src_embedding,
-        size=hidden_dim,
-        candidate_activation='sigmoid',
-        cell_activation='sigmoid')
-
-    lstm_hidden1, lstm_1 = layers.dynamic_lstm(
-        input=src_embedding,
-        size=hidden_dim,
-        candidate_activation='sigmoid',
-        cell_activation='sigmoid',
-        is_reverse=True)
-
-    bidirect_lstm_out = layers.concat([lstm_hidden0, lstm_hidden1], axis=0)
-
-    return bidirect_lstm_out
-
-
-def decoder_trainer(context):
-    '''
-    decoder with trainer
-    '''
-    pass
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last")
+
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)
+
+    return rnn()


 def to_lodtensor(data, place):
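To make the DynamicRNN block above easier to follow, here is a plain-numpy sketch of the recurrence it builds for a single target sequence. The weight matrices W, b, V, c are hypothetical stand-ins for the two fc layers; this is a conceptual illustration, not the Fluid execution model.

import numpy as np

def decode_one_sequence(trg_word_embs, encoder_out, W, b, V, c):
    # trg_word_embs: [T, word_dim] embeddings of the target words for one sequence
    # encoder_out:   [decoder_size] last encoder hidden state (the memory init)
    mem = encoder_out
    outputs = []
    for emb in trg_word_embs:                          # rnn.step_input(trg_embedding)
        step_in = np.concatenate([emb, mem])           # fc over [current_word, mem]
        hidden = np.tanh(step_in @ W + b)              # size=decoder_size, act='tanh'
        logits = hidden @ V + c
        probs = np.exp(logits) / np.exp(logits).sum()  # size=target_dict_dim, act='softmax'
        outputs.append(probs)                          # rnn.output(out)
        mem = hidden                                   # rnn.update_memory(mem, fc1)
    return np.stack(outputs)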
...
@@ -72,13 +75,18 @@ def to_lodtensor(data, place):

 def main():
-    encoder_out = encoder()
-    # TODO(jacquesqiao) call here
-    decoder_trainer(encoder_out)
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)

     train_data = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(8000), buf_size=1000),
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)

     place = core.CPUPlace()
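The objective wired up above is average per-token cross entropy over the decoder's softmax outputs; a quick numeric check with made-up numbers (5-word vocabulary, 3 target steps):

import numpy as np

probs = np.array([[0.1, 0.6, 0.1, 0.1, 0.1],    # decoder softmax outputs per step
                  [0.2, 0.2, 0.5, 0.05, 0.05],
                  [0.7, 0.1, 0.1, 0.05, 0.05]])
labels = np.array([1, 2, 0])                    # next-word ids for each step

token_costs = -np.log(probs[np.arange(len(labels)), labels])
avg_cost = token_costs.mean()   # what layers.cross_entropy + fluid.layers.mean compute
print(token_costs, avg_cost)    # [0.51 0.69 0.36] -> about 0.52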
...
@@ -88,15 +96,23 @@ def main():

     batch_id = 0
     for pass_id in xrange(2):
-        print 'pass_id', pass_id
         for data in train_data():
-            print 'batch', batch_id
-            batch_id += 1
-            if batch_id > 10: break
             word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
             outs = exe.run(framework.default_main_program(),
-                           feed={'src_word_id': word_data, },
-                           fetch_list=[encoder_out])
+                           feed={
+                               'src_word_id': word_data,
+                               'target_language_word': trg_word,
+                               'target_language_next_word': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 3:
+                exit(0)
+            batch_id += 1


 if __name__ == '__main__':
...