OCR 模型增加异步模式后,pserver 运行失败
Created by: ccmeteorljh
模型代码库地址: https://github.com/Yancey1989/paddle_fluid_benchmark/tree/master/ocr_recognition
修改如下: 在 ctc_train.py 中加入参数 async_mode,并在调用 transpile 时据此设置 sync_mode:
add_arg('async_mode', bool, False, "Whether start pserver in async mode to support ASGD")
t.transpile(
trainer_id,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=not args.async_mode)
启动 pserver 的脚本如下:
export FLAGS_fraction_of_gpu_memory_to_use=0.0
export TRAINING_ROLE=PSERVER
export PADDLE_PORT=5002
export PADDLE_TRAINERS_NUM=$2
export POD_IP=$(hostname -i)
export GLOG_logtostderr=1
export GLOG_v=3
PADDLE_PSERVERS=
python ctc_train.py --local 0 --use_gpu 0 --batch_size 256 --async_mode 1
同步模式下可以正常训练;异步模式下启动 pserver 时失败,报错如下:
Traceback (most recent call last):
File "ctc_train.py", line 206, in <module>
main()
File "ctc_train.py", line 202, in main
train(args, data_reader=ctc_reader)
File "ctc_train.py", line 184, in train
exe.run(pserver_program)
File "/usr/local/lib/python2.7/dist-packages/paddle/fluid/executor.py", line 441, in run
self.executor.run(program.desc, scope, 0, True, True)
paddle.fluid.core.EnforceNotMet: enforce grad_to_block_id.count(pieces[0]) == 0 failed, 1 != 0
at [/paddle/paddle/fluid/operators/listen_and_serv_op.cc:180]
PaddlePaddle Call Stacks:
0 0x7f2e88f1b656p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
1 0x7f2e89b8fc14p paddle::operators::ListenAndServOp::RunAsyncLoop(paddle::framework::Executor*, paddle::framework::ProgramDesc*) const + 3060
2 0x7f2e89b9260fp paddle::operators::ListenAndServOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 3727
3 0x7f2e89d4c56dp paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 205
4 0x7f2e88fb5f4fp paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool) + 255
5 0x7f2e88fb6fa0p paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 128
6 0x7f2e88f3362bp void pybind11::cpp_function::initialize<pybind11::cpp_function::initialize<void, paddle::framework::Executor, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::Executor::*)(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool)#1}, void, paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(pybind11::cpp_function::initialize<void, paddle::framework::Executor, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, pybind11::name, pybind11::is_method, pybind11::sibling>(void (paddle::framework::Executor::*)(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool)#1}&&, void (*)(paddle::framework::Executor*, paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN(pybind11::detail::function_call) + 555
7 0x7f2e88f2b804p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 2596
8 0x4c37edp PyEval_EvalFrameEx + 31165
9 0x4b9ab6p PyEval_EvalCodeEx + 774
10 0x4c16e7p PyEval_EvalFrameEx + 22711
11 0x4b9ab6p PyEval_EvalCodeEx + 774
12 0x4c16e7p PyEval_EvalFrameEx + 22711
13 0x4c136fp PyEval_EvalFrameEx + 21823
14 0x4b9ab6p PyEval_EvalCodeEx + 774
15 0x4eb30fp
16 0x4e5422p PyRun_FileExFlags + 130
17 0x4e3cd6p PyRun_SimpleFileExFlags + 390
18 0x493ae2p Py_Main + 1554
19 0x7f2ee2611830p __libc_start_main + 240
20 0x4933e9p _start + 41