使用 Executor 的 fetch_list 参数获取模型执行的中间结果时出错
Created by: Akeepers
Paddle version: 1.6.3
Python version: 2.7
错误信息:
"The following exception is not an EOF exception.")
Traceback (most recent call last):
File "run_seq2seq.py", line 236, in <module>
main(args)
File "run_seq2seq.py", line 207, in main
train(args)
File "run_seq2seq.py", line 106, in train
preds_res, loss_val = exe.run(program=prog, feed=data, fetch_list=[preds_res, loss])
File "/home/yangpan/anaconda3/envs/paddle-py2.7/lib/python2.7/site-packages/paddle/fluid/executor.py", line 780, in run
six.reraise(*sys.exc_info())
File "/home/yangpan/anaconda3/envs/paddle-py2.7/lib/python2.7/site-packages/paddle/fluid/executor.py", line 775, in run
use_program_cache=use_program_cache)
File "/home/yangpan/anaconda3/envs/paddle-py2.7/lib/python2.7/site-packages/paddle/fluid/executor.py", line 834, in _run_impl
return_numpy=return_numpy)
File "/home/yangpan/anaconda3/envs/paddle-py2.7/lib/python2.7/site-packages/paddle/fluid/executor.py", line 674, in _run_parallel
tensors = exe.run(fetch_var_names)._move_to_list()
paddle.fluid.core_avx.EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString<std::string const&>(std::string const&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::string const&, char const*, int)
2 paddle::framework::LoDTensor::MergeLoDTensor(std::vector<paddle::framework::LoDTensor const*, std::allocator<paddle::framework::LoDTensor const*> > const&, paddle::platform::Place)
3 paddle::framework::details::FetchOpHandle::WaitAndMergeCPUTensors() const
4 paddle::framework::details::FetchOpHandle::RunImpl()
5 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOpSync(paddle::framework::details::OpHandleBase*)
6 paddle::framework::details::FastThreadedSSAGraphExecutor::RunOp(paddle::framework::details::OpHandleBase*, std::shared_ptr<paddle::framework::BlockingQueue<unsigned long> > const&, unsigned long*)
7 std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<void>, std::__future_base::_Result_base::_Deleter>, void> >::_M_invoke(std::_Any_data const&)
8 std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&)
9 ThreadPool::ThreadPool(unsigned long)::{lambda()#1}::operator()() const
----------------------
Error Message Summary:
----------------------
Error: Paddle internal Check failed. (Please help us create a new issue, here we need to find the developer to add a user friendly error message)
[Hint: Expected framework::product(new_dim) / new_dim[0] == framework::product(t->dims()) / t->dims()[0], but received framework::product(new_dim) / new_dim[0]:50 != framework::product(t->dims()) / t->dims()[0]:63.] at (/paddle/paddle/fluid/framework/lod_tensor.cc:363)
报错部分的 Paddle 代码如下。出错的语句是:
preds_res, loss_val = exe.run(program=prog, feed=data, fetch_list=[preds_res, loss])
注意:如果将 fetch_list 中的 preds_res 去除,代码可以正常运行。
# Training loop: run one step per batch, log the mean loss, and periodically
# save the persistable variables of the training program.
for current_epoch in range(args.epoch):
    step = 0
    for data in loader():
        step += 1
        # Fetch into NEW names. Rebinding `preds_res` to the fetched result
        # would replace the graph variable, so the next iteration would pass
        # the fetched array (not the variable) in `fetch_list`.
        # NOTE(review): the reported traceback fails in `_run_parallel` ->
        # `MergeLoDTensor`, whose check compares the non-leading dimensions of
        # the fetched tensors (50 vs 63). Presumably `preds_res` has a
        # different padded sequence length on each device -- confirm; padding
        # every batch to one fixed length, or running on a single device,
        # would make the shapes mergeable.
        preds_val, loss_val = exe.run(
            program=prog, feed=data, fetch_list=[preds_res, loss])
        loss_val = np.mean(loss_val)
        np_preds_res = np.array(preds_val)
        # Keep the reader's epoch in its own name so the `for` loop variable
        # is not overwritten; the logged value is still the reader's.
        _, progress_epoch = reader.get_train_progress()
        print('epoch: %d, batch_id: %d/%d, loss: %f' %
              (progress_epoch, step, num_examples, loss_val))
        if step % args.save_steps == 0:
            save_path = os.path.join(args.checkpoints,
                                     "step_" + str(step))
            fluid.io.save_persistables(exe, save_path, train_prog)
            print("step: %d, save model in %s" % (step, save_path))
program定义:
# Build the training graph inside its own main/startup program pair so the
# ops and variables created below are recorded in `train_prog`.
train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog), fluid.unique_name.guard():
    # For training:
    inputs, loader = data_func(args, is_train=True)
    logits = model_func(args, inputs, ernie_config, is_train=True)
    # The last input is passed as the label and the second-to-last as the
    # target sequence length, matching loss_func's parameter order.
    label = inputs[-1]
    trg_sequence_length = inputs[-2]
    loss, preds_res = loss_func(logits, label, trg_sequence_length)
    optimizer = optimizer_func()
    optimizer.minimize(loss)
loss func定义:
def loss_func(logits, label, trg_sequence_length):
    """Build the masked average cross-entropy loss and arg-max predictions.

    Args:
        logits: Unnormalized scores; softmax is applied to them and the size
            of dimension 1 is used as the mask length.
            (presumably shaped (batch, seq_len, vocab) -- TODO confirm)
        label: Target labels passed to ``layers.cross_entropy``.
        trg_sequence_length: Target lengths used to build the sequence mask.

    Returns:
        A tuple ``(avg_cost, res)``: the sum of the masked loss divided by
        the sum of the mask, and the arg-max of the softmax output over the
        last axis.
    """
    probabilities = layers.softmax(logits)
    predictions = layers.argmax(probabilities, axis=-1)
    token_loss = layers.cross_entropy(input=probabilities, label=label)

    # Mask built from the target lengths, as long as dimension 1 of `logits`.
    max_len = layers.shape(logits)[1]
    mask = layers.sequence_mask(
        trg_sequence_length, maxlen=max_len, dtype="float32")

    # Average only over positions the mask keeps.
    masked_total = layers.reduce_sum(token_loss * mask)
    mask_total = layers.reduce_sum(mask)
    avg_cost = masked_total / mask_total
    return avg_cost, predictions