diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
index ef7e298aaf6a3cbe1f9b9d1464a3be882ac0003f..f1640d2f4a3f539babd065b8658334744026425f 100644
--- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
@@ -265,7 +265,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor {
     auto map = distributed::ProcessGroupMapFromGid::getInstance();
     distributed::ProcessGroup* pg = map->get(rid);
     distributed::AllreduceOptions opts;
-    opts.reduce_op = distributed::ReduceOp::SUM;
+    opts.reduce_op = distributed::ReduceOp::MAX;
 
     // allocate memory on device.
     softmax->mutable_data<T>(place);
@@ -348,6 +348,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor {
 
     in_out.clear();
     in_out.push_back(predicted_logits);
+    opts.reduce_op = distributed::ReduceOp::SUM;
     pg->AllReduce(in_out, in_out, opts)->Synchronize();
 
     // step 4, obtain exp(logit)
@@ -364,6 +365,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor {
 
     in_out.clear();
     in_out.push_back(sum_exp_logits);
+    opts.reduce_op = distributed::ReduceOp::SUM;
     pg->AllReduce(in_out, in_out, opts)->Synchronize();
 
     auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
index 3246866f225ce023b3ad3bab1fa153114efc7e66..e7bd434b94fd32c19daa99defefe979058e99355 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -106,13 +106,26 @@ def _broadcast_data_help(data, shape, dtype, hcg):
                                  group=model_parallel_group,
                                  sync_op=True)
 
+    if mp_rank != 0:
+        if in_dygraph_mode():
+            data._clear_data()
+            input_data._share_buffer_to(data)
+        else:
+            data.value().get_tensor()._clear()
+            data.value().get_tensor()._share_data_with(
+                input_data.value().get_tensor())
+
 
 def broadcast_input_data(hcg, *inputs, **kwargs):
     cur_device = paddle.get_device()
     for v in inputs:
         if isinstance(v, (core.VarBase, core.eager.Tensor)):
             with framework.no_grad():
-                v = v.cuda() if "gpu" in cur_device else v
+                if "gpu" in cur_device and in_dygraph_mode() \
+                        and not v.place.is_gpu_place():
+                    v_gpu = v.cuda(int(cur_device.split(":")[1]))
+                    v._clear_data()
+                    v_gpu._share_buffer_to(v)
                 _broadcast_data_help(v, v.shape, v.dtype, hcg)
         else:
             logger.error("it doesn't support data type {}".format(type(v)))
@@ -120,7 +133,11 @@ def broadcast_input_data(hcg, *inputs, **kwargs):
     for k, v in kwargs.items():
         if isinstance(v, (core.VarBase, core.eager.Tensor)):
             with framework.no_grad():
-                v = v.cuda() if "gpu" in cur_device else v
+                if "gpu" in cur_device and in_dygraph_mode() \
+                        and not v.place.is_gpu_place():
+                    v_gpu = v.cuda(int(cur_device.split(":")[1]))
+                    v._clear_data()
+                    v_gpu._share_buffer_to(v)
                 _broadcast_data_help(v, v.shape, v.dtype, hcg)
                 kwargs[k] = v
         else:
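
For context, the CUDA change follows the standard numerically stable softmax cross entropy under model parallelism: the class dimension is sharded across ranks, so the global maximum of the logits must be obtained with a MAX allreduce (the changed line above), while the target logit and the sum of exponentials are later combined with SUM allreduces. The sketch below is a single-process NumPy illustration of that collective pattern, not Paddle's kernel; the helper name is made up, and plain np.max/np.sum over the shards stand in for the MAX and SUM allreduces.

```python
import numpy as np

def stable_parallel_softmax_xent(logit_shards, labels):
    """Single-process sketch of the collective pattern in the patch.

    logit_shards: list of (N, C_i) arrays, one per model-parallel rank
                  (the class dimension is split across ranks).
    labels:       (N,) global class indices.
    Returns the per-sample loss; np.max/np.sum over shards stand in for
    the MAX and SUM allreduces.
    """
    # step 1: local max per rank, then "allreduce" with MAX (the fix above).
    global_max = np.max(np.stack([s.max(axis=1) for s in logit_shards]), axis=0)

    # step 2: take the (max-shifted) logit of the target class from whichever
    # shard owns it, then "allreduce" with SUM (only one rank is non-zero).
    offsets = np.cumsum([0] + [s.shape[1] for s in logit_shards])
    predicted = np.zeros(labels.shape[0])
    for rank, shard in enumerate(logit_shards):
        start, end = offsets[rank], offsets[rank + 1]
        idx = np.where((labels >= start) & (labels < end))[0]
        predicted[idx] += shard[idx, labels[idx] - start] - global_max[idx]

    # step 3: per-rank sum of exp(logit - max), then "allreduce" with SUM.
    sum_exp = np.sum(
        np.stack([np.exp(s - global_max[:, None]).sum(axis=1)
                  for s in logit_shards]),
        axis=0)

    # loss = log(sum_j exp(x_j - m)) - (x_label - m)
    return np.log(sum_exp) - predicted

# usage: two "ranks", 4 classes split 2/2; large logits would overflow a
# naive exp without the max subtraction.
rng = np.random.default_rng(0)
logits = rng.normal(size=(3, 4)) * 1000.0
labels = np.array([0, 2, 3])
loss = stable_parallel_softmax_xent([logits[:, :2], logits[:, 2:]], labels)
print(loss)
```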