Error when saving an inference model with parallel_do
Created by: gmcather
I want to save the prediction output
in an inference model. The program runs fine when parallel_do
is not used; however, the error below occurs as soon as it is combined with parallel_do.
How can I solve this problem?
# no errors
train(train_reader, word_dict, bow_net, use_cuda=False,
parallel=False, save_dirname="bow_model", lr=0.002,
pass_num=30, batch_size=128)
# error happens
train(train_reader, word_dict, bow_net, use_cuda=False,
parallel=True, save_dirname="bow_model", lr=0.002,
pass_num=30, batch_size=128)
Traceback (most recent call last):
File "train.py", line 116, in <module>
train_net()
File "train.py", line 98, in train_net
pass_num=30, batch_size=128)
File "train.py", line 73, in train
fetch_list=[cost, acc])
File "/root/.jumbo/lib/python2.7/site-packages/paddle/fluid/executor.py", line 336, in run
self.executor.run(program.desc, scope, 0, True, True)
paddle.fluid.core.EnforceNotMet: Cannot find variable fc_2.tmp_2@GRAD in the parent scope at [/paddle/paddle/fluid/operators/detail/safe_ref.h:28]
PaddlePaddle Call Stacks:
0 0x7fa43a6ae606p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
1 0x7fa43afec81ap
2 0x7fa43aff545fp paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 431
3 0x7fa43a744e20p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool) + 336
4 0x7fa43a745a34p paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 100
5 0x7fa43a6c479bp _ZZN8pybind1112cpp_function10initializeIZNS0_C1IvN6paddle9framework8ExecutorEIRKNS4_11ProgramDescEPNS4_5ScopeEibbEINS_4nameENS_9is_methodENS_7siblingEEEEMT0_FT_DpT1_EDpRKT2_EUlPS5_S8_SA_ibbE_vISO_S8_SA_ibbEISB_SC_SD_EEEvOSF_PFSE_SH_ESN_ENUlRNS_6detail13function_callEE1_4_FUNESV_ + 555
6 0x7fa43a6be164p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 2596
7 0x7fa474d8f3d4p PyEval_EvalFrameEx + 25956
8 0x7fa474d90120p PyEval_EvalCodeEx + 2240
9 0x7fa474d8e491p PyEval_EvalFrameEx + 22049
10 0x7fa474d90120p PyEval_EvalCodeEx + 2240
11 0x7fa474d8e491p PyEval_EvalFrameEx + 22049
12 0x7fa474d8ec46p PyEval_EvalFrameEx + 24022
13 0x7fa474d90120p PyEval_EvalCodeEx + 2240
14 0x7fa474d90232p PyEval_EvalCode + 50
15 0x7fa474daa61cp
16 0x7fa474daa6f0p PyRun_FileExFlags + 144
17 0x7fa474dabbfcp PyRun_SimpleFileExFlags + 220
18 0x7fa474dbd4bcp Py_Main + 3164
19 0x7fa474090d1dp __libc_start_main + 253
20 0x400659p
Here is my code:
"""
For http://wiki.baidu.com/display/LegoNet/Text+Classification
"""
import paddle.fluid as fluid
import paddle.v2 as paddle
import numpy as np
import sys
import time
import unittest
import contextlib
import utils
from nets import bow_net
from nets import cnn_net
from nets import lstm_net
from nets import gru_net
def train(train_reader,
          word_dict,
          network,
          use_cuda,
          parallel,
          save_dirname,
          lr=0.2,
          batch_size=128,
          pass_num=30):
    """
    Train ``network`` and save an inference model after every pass.

    Args:
        train_reader: callable returning an iterable of mini-batches; each
            mini-batch is handed to ``DataFeeder.feed`` for the "words" and
            "label" inputs.
        word_dict: vocabulary; only ``len(word_dict)`` is used, as the third
            argument to ``network``.
        network: callable ``network(data, label, dict_dim)`` returning
            ``(cost, acc, prediction)``.
        use_cuda: run on ``fluid.CUDAPlace(0)`` when true, else on
            ``fluid.CPUPlace()``.
        parallel: when true, build the network inside a ``ParallelDo`` block
            over two places.
        save_dirname: directory under which one ``epoch<pass_id>`` inference
            model is written per pass.
        lr: learning rate given to the Adagrad optimizer.
        batch_size: not used by this function (the reader already yields
            batches); kept so existing callers keep working.
        pass_num: number of passes over ``train_reader``.

    Raises:
        ValueError: if a pass yields no samples, so no averages can be
            computed.
    """
    data = fluid.layers.data(
        name="words",
        shape=[1],
        dtype="int64",
        lod_level=1)
    label = fluid.layers.data(
        name="label",
        shape=[1],
        dtype="int64")

    if not parallel:
        cost, acc, prediction = network(data, label, len(word_dict))
    else:
        # NOTE(review): the reported "Cannot find variable ...@GRAD" failure
        # is raised by ParallelDoGradOp, i.e. only on this branch. Presumably
        # `prediction` is written as a ParallelDo output without contributing
        # to `cost`, so no gradient exists for it -- TODO confirm.
        places = fluid.layers.get_places(device_count=2)
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            cost, acc, prediction = network(
                pd.read_input(data),
                pd.read_input(label),
                len(word_dict))
            pd.write_output(cost)
            pd.write_output(acc)
            pd.write_output(prediction)
        cost, acc, prediction = pd()
        # Reduce the per-place outputs to a single value each.
        cost = fluid.layers.mean(cost)
        acc = fluid.layers.mean(acc)

    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    sgd_optimizer.minimize(cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)

    exe.run(fluid.default_startup_program())

    # `range` works on both Python 2 and Python 3 (`xrange` is Python 2 only).
    for pass_id in range(pass_num):
        data_count, total_acc, total_cost = 0, 0.0, 0.0
        # The loop variable is named `batch` so it does not shadow the
        # `data` input layer defined above.
        for batch in train_reader():
            avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
                                              feed=feeder.feed(batch),
                                              fetch_list=[cost, acc])
            batch_len = len(batch)
            # Weight the per-batch averages by the batch length so the pass
            # averages are per-sample.
            total_acc += batch_len * avg_acc_np
            total_cost += batch_len * avg_cost_np
            data_count += batch_len

        if data_count == 0:
            # Fail with a clear message instead of a ZeroDivisionError below.
            raise ValueError(
                "train_reader produced no samples in pass %d" % pass_id)

        avg_cost = total_cost / data_count
        avg_acc = total_acc / data_count
        print("pass_id: %d, avg_acc: %f, avg_cost: %f" % (pass_id, avg_acc, avg_cost))

        epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
        fluid.io.save_inference_model(
            epoch_model,
            ["words"],
            prediction, exe)
def train_net():
    """
    Prepare the "tiny_imdb" data and train the bag-of-words network.

    Training runs on CPU with ParallelDo enabled, and the per-pass inference
    models are written under "bow_model".
    """
    prepared = utils.prepare_data(
        "tiny_imdb",
        self_dict=False,
        batch_size=128,
        buf_size=50000)
    # The test reader is returned by prepare_data but not used here.
    word_dict, train_reader, test_reader = prepared

    train(
        train_reader,
        word_dict,
        bow_net,
        use_cuda=False,
        parallel=True,
        save_dirname="bow_model",
        lr=0.002,
        pass_num=30,
        batch_size=128)
# Run the training entry point only when this file is executed as a script.
if __name__ == "__main__":
    train_net()