[Dygraph] Fix bugs of mp in eager mode (#46303)

* fix bugs of mp
* fix bugs of mp
* update
* update
* fix bug

[Dygraph] Fix bugs of mp in eager mode (#46303)
* fix bugs of mp
* fix bugs of mp
* update
* update
* fix bug
11002430 · Haohongxiang · GitHub · 8bed3192 · 11002430 · 11002430
2 changed files
--- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
@@ -265,7 +265,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
    auto map = distributed::ProcessGroupMapFromGid::getInstance();
    distributed::ProcessGroup* pg = map->get(rid);
    distributed::AllreduceOptions opts;
-    opts.reduce_op = distributed::ReduceOp::SUM;
+    opts.reduce_op = distributed::ReduceOp::MAX;
    // allocate memory on device.
    softmax->mutable_data<T>(place);
@@ -348,6 +348,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
    in_out.clear();
    in_out.push_back(predicted_logits);
+    opts.reduce_op = distributed::ReduceOp::SUM;
    pg->AllReduce(in_out, in_out, opts)->Synchronize();
    // step 4, obtain exp(logit)
@@ -364,6 +365,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
    in_out.clear();
    in_out.push_back(sum_exp_logits);
+    opts.reduce_op = distributed::ReduceOp::SUM;
    pg->AllReduce(in_out, in_out, opts)->Synchronize();
    auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);

--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -106,13 +106,26 @@ def _broadcast_data_help(data, shape, dtype, hcg):
                                 group=model_parallel_group,
                                 sync_op=True)
+    if mp_rank != 0:
+        if in_dygraph_mode():
+            data._clear_data()
+            input_data._share_buffer_to(data)
+        else:
+            data.value().get_tensor()._clear()
+            data.value().get_tensor()._share_data_with(
+                input_data.value().get_tensor())
 def broadcast_input_data(hcg, *inputs, **kwargs):
    cur_device = paddle.get_device()
    for v in inputs:
        if isinstance(v, (core.VarBase, core.eager.Tensor)):
            with framework.no_grad():
-                v = v.cuda() if "gpu" in cur_device else v
+                if "gpu" in cur_device and in_dygraph_mode() \
+                    and not v.place.is_gpu_place():
+                    v_gpu = v.cuda(int(cur_device.split(":")[1]))
+                    v._clear_data()
+                    v_gpu._share_buffer_to(v)
                _broadcast_data_help(v, v.shape, v.dtype, hcg)
        else:
            logger.error("it doesn't support data type {}".format(type(v)))
@@ -120,7 +133,11 @@ def broadcast_input_data(hcg, *inputs, **kwargs):
    for k, v in kwargs.items():
        if isinstance(v, (core.VarBase, core.eager.Tensor)):
            with framework.no_grad():
-                v = v.cuda() if "gpu" in cur_device else v
+                if "gpu" in cur_device and in_dygraph_mode() \
+                    and not v.place.is_gpu_place():
+                    v_gpu = v.cuda(int(cur_device.split(":")[1]))
+                    v._clear_data()
+                    v_gpu._share_buffer_to(v)
                _broadcast_data_help(v, v.shape, v.dtype, hcg)
            kwargs[k] = v
        else: