diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
index ef7e298aaf6a3cbe1f9b9d1464a3be882ac0003f..f1640d2f4a3f539babd065b8658334744026425f 100644
--- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
@@ -265,7 +265,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor {
     auto map = distributed::ProcessGroupMapFromGid::getInstance();
     distributed::ProcessGroup* pg = map->get(rid);
     distributed::AllreduceOptions opts;
-    opts.reduce_op = distributed::ReduceOp::SUM;
+    opts.reduce_op = distributed::ReduceOp::MAX;
 
     // allocate memory on device.
     softmax->mutable_data<T>(place);
@@ -348,6 +348,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor {
 
     in_out.clear();
     in_out.push_back(predicted_logits);
+    opts.reduce_op = distributed::ReduceOp::SUM;
     pg->AllReduce(in_out, in_out, opts)->Synchronize();
 
     // step 4, obtain exp(logit)
@@ -364,6 +365,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor {
 
     in_out.clear();
     in_out.push_back(sum_exp_logits);
+    opts.reduce_op = distributed::ReduceOp::SUM;
     pg->AllReduce(in_out, in_out, opts)->Synchronize();
 
     auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
index 3246866f225ce023b3ad3bab1fa153114efc7e66..e7bd434b94fd32c19daa99defefe979058e99355 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
@@ -106,13 +106,26 @@ def _broadcast_data_help(data, shape, dtype, hcg):
                                  group=model_parallel_group,
                                  sync_op=True)
 
+    if mp_rank != 0:
+        if in_dygraph_mode():
+            data._clear_data()
+            input_data._share_buffer_to(data)
+        else:
+            data.value().get_tensor()._clear()
+            data.value().get_tensor()._share_data_with(
+                input_data.value().get_tensor())
+
 
 def broadcast_input_data(hcg, *inputs, **kwargs):
     cur_device = paddle.get_device()
     for v in inputs:
         if isinstance(v, (core.VarBase, core.eager.Tensor)):
             with framework.no_grad():
-                v = v.cuda() if "gpu" in cur_device else v
+                if "gpu" in cur_device and in_dygraph_mode() \
+                        and not v.place.is_gpu_place():
+                    v_gpu = v.cuda(int(cur_device.split(":")[1]))
+                    v._clear_data()
+                    v_gpu._share_buffer_to(v)
                 _broadcast_data_help(v, v.shape, v.dtype, hcg)
         else:
             logger.error("it doesn't support data type {}".format(type(v)))
@@ -120,7 +133,11 @@ def broadcast_input_data(hcg, *inputs, **kwargs):
     for k, v in kwargs.items():
         if isinstance(v, (core.VarBase, core.eager.Tensor)):
             with framework.no_grad():
-                v = v.cuda() if "gpu" in cur_device else v
+                if "gpu" in cur_device and in_dygraph_mode() \
+                        and not v.place.is_gpu_place():
+                    v_gpu = v.cuda(int(cur_device.split(":")[1]))
+                    v._clear_data()
+                    v_gpu._share_buffer_to(v)
                 _broadcast_data_help(v, v.shape, v.dtype, hcg)
                 kwargs[k] = v
         else:
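
For context, the CUDA change follows the standard numerically stable softmax cross entropy under model parallelism: the class dimension is sharded across ranks, so the global maximum of the logits must be obtained with a MAX allreduce (the changed line above), while the target logit and the sum of exponentials are later combined with SUM allreduces. The sketch below is a single-process NumPy illustration of that collective pattern, not Paddle's kernel; the helper name is made up, and plain np.max/np.sum over the shards stand in for the MAX and SUM allreduces.

```python
import numpy as np

def stable_parallel_softmax_xent(logit_shards, labels):
    """Single-process sketch of the collective pattern in the patch.

    logit_shards: list of (N, C_i) arrays, one per model-parallel rank
                  (the class dimension is split across ranks).
    labels:       (N,) global class indices.
    Returns the per-sample loss; np.max/np.sum over shards stand in for
    the MAX and SUM allreduces.
    """
    # step 1: local max per rank, then "allreduce" with MAX (the fix above).
    global_max = np.max(np.stack([s.max(axis=1) for s in logit_shards]), axis=0)

    # step 2: take the (max-shifted) logit of the target class from whichever
    # shard owns it, then "allreduce" with SUM (only one rank is non-zero).
    offsets = np.cumsum([0] + [s.shape[1] for s in logit_shards])
    predicted = np.zeros(labels.shape[0])
    for rank, shard in enumerate(logit_shards):
        start, end = offsets[rank], offsets[rank + 1]
        idx = np.where((labels >= start) & (labels < end))[0]
        predicted[idx] += shard[idx, labels[idx] - start] - global_max[idx]

    # step 3: per-rank sum of exp(logit - max), then "allreduce" with SUM.
    sum_exp = np.sum(
        np.stack([np.exp(s - global_max[:, None]).sum(axis=1)
                  for s in logit_shards]),
        axis=0)

    # loss = log(sum_j exp(x_j - m)) - (x_label - m)
    return np.log(sum_exp) - predicted

# usage: two "ranks", 4 classes split 2/2; large logits would overflow a
# naive exp without the max subtraction.
rng = np.random.default_rng(0)
logits = rng.normal(size=(3, 4)) * 1000.0
labels = np.array([0, 2, 3])
loss = stable_parallel_softmax_xent([logits[:, :2], logits[:, 2:]], labels)
print(loss)
```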