已训练好的模型在再次加载并进行增量训练时报错
Created by: sitongchen
报错日志如下: psever.log: get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call. Traceback (most recent call last): File "train.py", line 670, in use_parallel_executor=bool(args.use_parallel_exe) File "train.py", line 611, in train fluid.io.load_persistables(exe, "./init_model", pserver_prog) File "/home/disk1/normandy/maybach/app-user-20190719142438-10326/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/io.py", line 697, in load_persistables executor, dirname=dirname, main_program=main_program) File "/home/disk1/normandy/maybach/app-user-20190719142438-10326/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/io.py", line 824, in load_distributed_persistables load_persistable_vars(executor, dirname, need_load_vars) File "/home/disk1/normandy/maybach/app-user-20190719142438-10326/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/io.py", line 807, in load_persistable_vars executor.run(load_prog) File "/home/disk1/normandy/maybach/app-user-20190719142438-10326/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/executor.py", line 565, in run use_program_cache=use_program_cache) File "/home/disk1/normandy/maybach/app-user-20190719142438-10326/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/executor.py", line 642, in run exe.run(program.desc, scope, 0, True, True, fetch_var_name) paddle.fluid.core.EnforceNotMet: Invoke operator load error. 
Python Callstacks: File "/home/disk1/normandy/maybach/app-user-20190719142438-10326/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/framework.py", line 1654, in append_op attrs=kwargs.get("attrs", None)) File "/home/disk1/normandy/maybach/app-user-20190719142438-10326/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/io.py", line 800, in load_persistable_vars 'file_path': os.path.join(dirname, origin_var.name) File "/home/disk1/normandy/maybach/app-user-20190719142438-10326/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/io.py", line 824, in load_distributed_persistables load_persistable_vars(executor, dirname, need_load_vars) File "/home/disk1/normandy/maybach/app-user-20190719142438-10326/workspace/python27-gcc482/lib/python2.7/site-packages/paddle/fluid/io.py", line 697, in load_persistables executor, dirname=dirname, main_program=main_program) File "train.py", line 611, in train fluid.io.load_persistables(exe, "./init_model", pserver_prog) File "train.py", line 670, in use_parallel_executor=bool(args.use_parallel_exe) C++ Callstacks: Cannot open file ./init_model/AttentionDislike_fc.b_0_beta1_pow_acc_0 for load op at [/paddle/paddle/fluid/operators/load_op.h:37] PaddlePaddle Call Stacks: 0 0x7f4ea954b090p void paddle::platform::EnforceNotMet::Init<char const*>(char const*, char const*, int) + 352 1 0x7f4ea954b409p paddle::platform::EnforceNotMet::EnforceNotMet(std::exception_ptr::exception_ptr, char const*, int) + 137 2 0x7f4ea9bf62f6p paddle::operators::LoadOpKernel<paddle::platform::CPUDeviceContext, float>::Compute(paddle::framework::ExecutionContext const&) const + 774 3 0x7f4ea9bf6563p std::Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor<paddle::platform::CPUPlace, false, 0ul, paddle::operators::LoadOpKernel<paddle::platform::CPUDeviceContext, float>, paddle::operators::LoadOpKernel<paddle::platform::CPUDeviceContext, double>, 
paddle::operators::LoadOpKernel<paddle::platform::CPUDeviceContext, int>, paddle::operators::LoadOpKernel<paddle::platform::CPUDeviceContext, signed char>, paddle::operators::LoadOpKernel<paddle::platform::CPUDeviceContext, long> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1 (closed)}>::M_invoke(std::Any_data const&, paddle::framework::ExecutionContext const&) + 35 4 0x7f4eaa6fa176p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void, boost::detail::variant::void_> const&, paddle::framework::RuntimeContext*) const + 662 5 0x7f4eaa6fac74p paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 292 6 0x7f4eaa6f8e4bp 
paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) + 267 7 0x7f4ea96b2a0ep paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool) + 158 8 0x7f4ea96b59efp paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector<std::string, std::allocatorstd::string > const&, bool) + 143 9 0x7f4ea953be3ep 10 0x7f4ea957ae5ep 11 0x7f4f503f9010p PyEval_EvalFrameEx + 16384 12 0x7f4f503fab80p PyEval_EvalCodeEx + 2128 13 0x7f4f503f908ep PyEval_EvalFrameEx + 16510 14 0x7f4f503fab80p PyEval_EvalCodeEx + 2128 15 0x7f4f503f908ep PyEval_EvalFrameEx + 16510 16 0x7f4f503fab80p PyEval_EvalCodeEx + 2128 17 0x7f4f503f908ep PyEval_EvalFrameEx + 16510 18 0x7f4f503fab80p PyEval_EvalCodeEx + 2128 19 0x7f4f503f908ep PyEval_EvalFrameEx + 16510 20 0x7f4f503fab80p PyEval_EvalCodeEx + 2128 21 0x7f4f503f908ep PyEval_EvalFrameEx + 16510 22 0x7f4f503fab80p PyEval_EvalCodeEx + 2128 23 0x7f4f503f908ep PyEval_EvalFrameEx + 16510 24 0x7f4f503fab80p PyEval_EvalCodeEx + 2128 25 0x7f4f503fac82p PyEval_EvalCode + 50 26 0x7f4f5041360fp 27 0x7f4f5041467ep PyRun_FileExFlags + 126 28 0x7f4f504157d7p PyRun_SimpleFileExFlags + 199 29 0x7f4f50425d9dp Py_Main + 3133 30 0x7f4f4f66bbd5p __libc_start_main + 245 31 0x4007c1p
train.py代码如下:

```python
if is_local:
    with open("local_main.proto", "w") as f:
        f.write(str(fluid.default_main_program()))
    with open("local_startup.proto", "w") as f:
        f.write(str(fluid.default_startup_program()))
    train_loop(fluid.default_main_program())
else:
    print_log("dist train")
    config = fluid.DistributeTranspilerConfig()
    #config.slice_var_up = False
    t = fluid.DistributeTranspiler(config=config)
    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainer_num, sync_mode=False, current_endpoint=current_endpoint)
    if training_role == "PSERVER":
        print_log("run pserver")
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
        with open("pserver_startup.proto", "w") as f:
            f.write(str(pserver_startup))
        with open("pserver_main.proto", "w") as f:
            f.write(str(pserver_prog))
        exe.run(pserver_startup)
        if os.path.exists('./init_model'):
            #load_persistables_for_increment("./init_model", exe, pserver_prog, "dis_emb", "./init_model/lookup_table/dis_emb_" + str(pserver_id))
            fluid.io.load_persistables(exe, "./init_model", pserver_prog)
            #load_persistables_for_increment("./init_model", exe, pserver_prog, "dis_emb", "./init_model/lookup_table/dis_emb_" + str(pserver_id))
            print_log("finish loading init_model")
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        print_log("run trianer")
        main_program = t.get_trainer_program()
        with open("trainer_main.proto", "w") as f:
            f.write(str(main_program))
        train_loop(main_program, trainer_id)
```