动态图NCCL数据并行报invalid data type
Created by: Gaffey
-
版本、环境信息: 1)PaddlePaddle版本:1.5 post97 2)GPU:CUDA 9.0,cuDNN 7.1
-
训练信息: 1)单机多卡 2)显存:22g
-
问题描述:
Traceback (most recent call last):
File "trainfast_aiflow_elem_cls_oneshot.py", line 237, in <module>
main()
File "trainfast_aiflow_elem_cls_oneshot.py", line 233, in main
train_async(args)
File "trainfast_aiflow_elem_cls_oneshot.py", line 145, in train_async
prediction, acc1, acc5 = model.forward(img, label)
File "/ssd1/wenshuo/anaconda2/lib/python2.7/site-packages/paddle/fluid/dygraph/parallel.py", line 148, in forward
return self._layers(*inputs, **kwargs)
File "/ssd1/wenshuo/anaconda2/lib/python2.7/site-packages/paddle/fluid/dygraph/layers.py", line 162, in __call__
outputs = self.forward(*inputs)
File "/ssd1/wenshuo/imgnet_multi_gpu/thirdparty/paddlemodels/metric_learning/models/resnet_embedding_oneshot.py", line 118, in forward
x = self.conv1(inputs)
File "/ssd1/wenshuo/anaconda2/lib/python2.7/site-packages/paddle/fluid/dygraph/layers.py", line 162, in __call__
outputs = self.forward(*inputs)
File "/ssd1/wenshuo/imgnet_multi_gpu/thirdparty/paddlemodels/metric_learning/models/resnet_embedding_oneshot.py", line 39, in forward
x = self.conv(inputs)
File "/ssd1/wenshuo/anaconda2/lib/python2.7/site-packages/paddle/fluid/dygraph/layers.py", line 160, in __call__
parallel_helper._broadcast_parameters(self._parameters.values())
File "/ssd1/wenshuo/anaconda2/lib/python2.7/site-packages/paddle/fluid/dygraph/parallel_helper.py", line 43, in _broadcast_parameters
collective._broadcast(param, 0, sync_mode=True)
File "/ssd1/wenshuo/anaconda2/lib/python2.7/site-packages/paddle/fluid/layers/collective.py", line 59, in _broadcast
"root": root})
File "/ssd1/wenshuo/anaconda2/lib/python2.7/site-packages/paddle/fluid/layer_helper.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "/ssd1/wenshuo/anaconda2/lib/python2.7/site-packages/paddle/fluid/framework.py", line 1742, in append_op
kwargs.get("stop_gradient", False))
File "/ssd1/wenshuo/anaconda2/lib/python2.7/site-packages/paddle/fluid/dygraph/tracer.py", line 59, in trace_op
framework._current_expected_place(), stop_gradient)
paddle.fluid.core_avx.EnforceNotMet: invalid data type at [/paddle/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc:60]
PaddlePaddle Call Stacks:
0 0x7f25b37d96a0p void paddle::platform::EnforceNotMet::Init<char const*>(char const*, char const*, int) + 352
1 0x7f25b37d9a19p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 137
2 0x7f25b4639871p paddle::operators::NCCLBroadcastOpKernel<float>::Compute(paddle::framework::ExecutionContext const&) const + 1953
3 0x7f25b4639ac3p std::_Function_handler<void (paddle::framework::ExecutionContext const&), paddle::framework::OpKernelRegistrarFunctor
<paddle::platform::CUDAPlace, false, 0ul, paddle::operators::NCCLBroadcastOpKernel<float>, paddle::operators::NCCLBroadcastOpKernel<double>, paddle::operators::NCCLBroadcastOpKernel<int>, paddle::operators::NCCLBroadcastOpKernel<long>, paddle::operators::NCCLBroadcastOpKernel<paddle::platform::float16> >::operator()(char const*, char const*, int) const::{lambda(paddle::framework::ExecutionContext const&)#1}>::_M_invoke(std::_Any_data const&, paddle::framework::ExecutionContext const&) + 35
4 0x7f25b39dd400p
5 0x7f25b38c7a45p
6 0x7f25b380c5c6p
7 0x7f25f1a4ffc7p PyEval_EvalFrameEx + 28695
8 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
9 0x7f25f1a4f9b8p PyEval_EvalFrameEx + 27144
10 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
11 0x7f25f19db377p
12 0x7f25f19b67a3p PyObject_Call + 67
13 0x7f25f1a4b4bep PyEval_EvalFrameEx + 9486
14 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
15 0x7f25f1a4f9b8p PyEval_EvalFrameEx + 27144
16 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
17 0x7f25f1a4f9b8p PyEval_EvalFrameEx + 27144
18 0x7f25f1a50f9ep PyEval_EvalFrameEx + 32750
19 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
20 0x7f25f19db28ap
21 0x7f25f19b67a3p PyObject_Call + 67
22 0x7f25f19c563dp
23 0x7f25f19b67a3p PyObject_Call + 67
24 0x7f25f1a0f8a4p
25 0x7f25f19b67a3p PyObject_Call + 67
26 0x7f25f1a4cb69p PyEval_EvalFrameEx + 15289
27 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
28 0x7f25f19db28ap
29 0x7f25f19b67a3p PyObject_Call + 67
30 0x7f25f1a4b4bep PyEval_EvalFrameEx + 9486
31 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
32 0x7f25f19db28ap
33 0x7f25f19b67a3p PyObject_Call + 67
34 0x7f25f19c563dp
35 0x7f25f19b67a3p PyObject_Call + 67
36 0x7f25f1a0f8a4p
37 0x7f25f19b67a3p PyObject_Call + 67
38 0x7f25f1a4cb69p PyEval_EvalFrameEx + 15289
39 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
40 0x7f25f19db28ap
41 0x7f25f19b67a3p PyObject_Call + 67
42 0x7f25f1a4b4bep PyEval_EvalFrameEx + 9486
43 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
44 0x7f25f19db377p
45 0x7f25f19b67a3p PyObject_Call + 67
46 0x7f25f19c563dp
47 0x7f25f19b67a3p PyObject_Call + 67
48 0x7f25f1a0f8a4p
49 0x7f25f19b67a3p PyObject_Call + 67
50 0x7f25f1a4b4bep PyEval_EvalFrameEx + 9486
51 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
52 0x7f25f1a4f9b8p PyEval_EvalFrameEx + 27144
53 0x7f25f1a50f9ep PyEval_EvalFrameEx + 32750
54 0x7f25f1a50f9ep PyEval_EvalFrameEx + 32750
55 0x7f25f1a524e9p PyEval_EvalCodeEx + 2025
56 0x7f25f1a5270ap PyEval_EvalCode + 26
57 0x7f25f1a6b9cdp
58 0x7f25f1a6cb48p PyRun_FileExFlags + 120
59 0x7f25f1a6dd68p PyRun_SimpleFileExFlags + 232
60 0x7f25f1a7ff8cp Py_Main + 2988
61 0x7f25f0ca9b45p __libc_start_main + 245
62 0x7f25f1f6f8bfp
部分相关代码
class ConvBNLayer(fluid.dygraph.Layer):
    """Bias-free 2-D convolution followed by batch normalization.

    The convolution itself has no activation; the optional activation
    ``act`` is handed to the batch-norm layer instead.
    """

    def __init__(self, namescope,
                 num_filters,
                 filter_size,
                 stride=1,
                 groups=1,
                 act=None):
        """Create the convolution and batch-norm sub-layers.

        Args:
            namescope: Name scope forwarded to ``fluid.dygraph.Layer``.
            num_filters: Number of convolution output channels; also passed
                to ``BatchNorm`` as its channel count.
            filter_size: Convolution kernel size (an integer, since the
                padding is computed arithmetically from it).
            stride: Convolution stride.
            groups: Number of convolution groups.
            act: Activation passed to ``BatchNorm``; ``None`` for no
                activation.
        """
        super(ConvBNLayer, self).__init__(namescope)
        self.conv = fluid.dygraph.Conv2D(
            self.full_name(),
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            # Floor division keeps the padding an int on Python 3 as well;
            # plain `/` is integer division only on Python 2.
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            bias_attr=False)
        self.bn = fluid.dygraph.BatchNorm(self.full_name(), num_filters, act=act)

    def forward(self, inputs):
        """Apply the convolution, then batch normalization, to ``inputs``."""
        # Line 39 of resnet_embedding_oneshot.py in the traceback: the
        # parameter broadcast triggered by this call is where the error is raised.
        x = self.conv(inputs)
        x = self.bn(x)
        return x
===========================
# Excerpt of the dygraph training routine; indentation was lost when it was pasted.
# NOTE(review): the excerpt appears to stop mid-loop -- `iter_no` is never
# advanced in the visible lines, so the rest of the loop body is not shown here.
with fluid.dygraph.guard(place):
logging.debug('enter train')
model_name = args.model
checkpoint = args.checkpoint
pretrained_model = args.pretrained_model
model_save_dir = args.model_save_dir
# Look the model class up by name in the `models` module and instantiate it.
model = models.__dict__[args.model]()
params = model.params
# Multi-GPU: prepare the parallel context and wrap the model in DataParallel.
if args.multi_gpu:
strategy = fluid.dygraph.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
# Override the learning-rate settings from the command-line arguments
# before the optimizer is built from `params`.
params["lr"] = args.lr
params["lr_list"] = args.lr_list
params["learning_strategy"]["lr_steps"] = args.lr_steps
params["learning_strategy"]["name"] = args.lr_strategy
optimizer = optimizer_setting(params, args)
# The following bare string literal holds disabled code; it is not executed.
"""
if args.with_mem_opt:
fluid.memory_optimize(train_prog, skip_opt_set=set(train_fetch_list))
"""
logging.debug('after run startup program')
# Optionally restore parameters from a checkpoint.
if checkpoint is not None:
load_params = fluid.dygraph.load_persistables(checkpoint)
model.load_dict(load_params[0])
# Split the global training batch evenly over `devicenum`
# (presumably the number of GPUs in use -- confirm against getgpunum()).
devicenum = getgpunum()
assert (args.train_batch_size % devicenum) == 0
# NOTE(review): `/` is integer division only on Python 2 (the traceback shows
# python2.7); on Python 3 this would produce a float batch size.
train_batch_size = args.train_batch_size / devicenum
test_batch_size = args.test_batch_size
train_reader = paddle.batch(reader.train(args), batch_size=train_batch_size, drop_last=True)
# Multi-GPU: wrap the reader with distributed_batch_reader
# (presumably so each trainer reads its own share of batches -- confirm).
if args.multi_gpu:
train_reader = fluid.contrib.reader.distributed_batch_reader(train_reader)
test_reader = paddle.batch(reader.val(args), batch_size=test_batch_size, drop_last=False)
totalruntime = 0
iter_no = args.start_step
train_info = [0, 0, 0, 0]
# Parse the comma-separated image shape argument into a list of ints.
image_shape = [int(m) for m in args.image_shape.split(",")]
while iter_no <= args.total_iter_num:
for batch_id, data in enumerate(train_reader()):
# Build a float32 image batch and an int64 label column from the samples.
dy_x_data = np.array([x[0].reshape(image_shape)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(-1, 1)
img = fluid.dygraph.to_variable(dy_x_data)
label = fluid.dygraph.to_variable(y_data)
label.stop_gradient = True
t1 = time.time()
model.train()
# This is the call at line 145 of the traceback, from which the failing
# parameter broadcast is reached.
prediction, acc1, acc5 = model.forward(img, label)
loss = fluid.layers.cross_entropy(prediction, label)
avg_loss = fluid.layers.mean(loss)
# Multi-GPU: scale the loss, back-propagate, then call apply_collective_grads();
# single-GPU: plain backward pass.
if args.multi_gpu:
avg_loss = model.scale_loss(avg_loss)
avg_loss.backward()
model.apply_collective_grads()
else:
avg_loss.backward()
optimizer.minimize(avg_loss)