分布式训练时提示CUBLAS: not initialized
Created by: imistyrain
使用dist_train.py进行分布式训练时,出现如下错误提示:
Traceback (most recent call last):
File "dist_train.py", line 23, in <module>
import paddle.fluid as fluid
File "/home/ar/.jumbo/lib/python2.7/site-packages/paddle/fluid/__init__.py", line 142, in <module>
__bootstrap__()
File "/home/ar/.jumbo/lib/python2.7/site-packages/paddle/fluid/__init__.py", line 136, in __bootstrap__
core.init_devices(not in_test)
paddle.fluid.core.EnforceNotMet: CUBLAS: not initialized, at [/paddle/paddle/fluid/platform/device_context.cc:218]
PaddlePaddle Call Stacks:
0 0x7f0eb82fad06p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
1 0x7f0eb9917d7fp paddle::platform::CUDADeviceContext::CUDADeviceContext(paddle::platform::CUDAPlace) + 3135
2 0x7f0eb9918a08p paddle::platform::DeviceContextPool::DeviceContextPool(std::vector<boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_>, std::allocator<boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, paddle::platform::CUDAPinnedPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> > > const&) + 552
3 0x7f0eb991b2d8p paddle::framework::InitDevices(bool, std::vector<int, std::allocator<int> >) + 696
4 0x7f0eb991b50dp paddle::framework::InitDevices(bool) + 285
5 0x7f0eb82d9a0ap
6 0x7f0eb8311414p pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 2596
7 0x7f0f199c8679p PyEval_EvalFrameEx + 22473
8 0x7f0f199ca160p PyEval_EvalCodeEx + 2240
9 0x7f0f199c84d1p PyEval_EvalFrameEx + 22049
10 0x7f0f199ca160p PyEval_EvalCodeEx + 2240
11 0x7f0f199ca272p PyEval_EvalCode + 50
12 0x7f0f199d9792p PyImport_ExecCodeModuleEx + 194
13 0x7f0f199d9c36p
14 0x7f0f199da9cdp
15 0x7f0f199db0bfp
16 0x7f0f199db354p
17 0x7f0f199db9c8p
18 0x7f0f199dc6f4p PyImport_ImportModuleLevel + 68
19 0x7f0f199c1cdfp
20 0x7f0f1992e123p PyObject_Call + 83
21 0x7f0f199c21c3p PyEval_CallObjectWithKeywords + 67
22 0x7f0f199c4cb3p PyEval_EvalFrameEx + 7683
23 0x7f0f199ca160p PyEval_EvalCodeEx + 2240
24 0x7f0f199ca272p PyEval_EvalCode + 50
25 0x7f0f199e465cp
26 0x7f0f199e4730p PyRun_FileExFlags + 144
27 0x7f0f199e5c3cp PyRun_SimpleFileExFlags + 220
28 0x7f0f199f74fcp Py_Main + 3164
29 0x38bfc21b45p __libc_start_main + 245
30 0x400699p
使用的命令为:
PADDLE_TRAINING_ROLE=PSERVER \
PADDLE_TRAINERS=2 \
PADDLE_PSERVER_IPS=127.0.0.1 \
PADDLE_CURRENT_IP=127.0.0.1 \
PADDLE_PSERVER_PORT=7164 \
python dist_train.py \
--model=DistResnet \
--batch_size=32 \
--update_method=pserver \
--device=CPU \
--data_dir=../data/ILSVRC2012
而在相同的环境变量下,使用单机多卡GPU版本进行训练时,并未提示需要cublas。