Commit 1e1b6622 authored by Yancey1989

update by comment

Parent b084dfab
@@ -46,11 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 void AllReduceOpHandle::RunImpl() {
-  if (dev_ctxes_.size() > 0UL) {
-    platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
-  } else {
-    platform::RecordEvent record_event(Name(), nullptr);
-  }
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
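This hunk and the BroadcastOpHandle hunk below make the same simplification: the nullptr fallback is dropped and platform::RecordEvent is constructed directly from the first device context, on the assumption that dev_ctxes_ is never empty when RunImpl runs. A minimal self-contained sketch of the RAII pattern at these call sites, where ScopedEvent is a hypothetical stand-in for platform::RecordEvent rather than Paddle's actual class:

#include <chrono>
#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for platform::RecordEvent: opens a named profiler
// event on construction and closes it on destruction, so the event exactly
// covers the scope of RunImpl().
class ScopedEvent {
 public:
  ScopedEvent(const std::string &name, const void *dev_ctx)
      : name_(name), start_(std::chrono::steady_clock::now()) {
    // dev_ctx selects which device's profiler stream records the event;
    // the simplified call sites assume dev_ctxes_ has at least one entry.
    (void)dev_ctx;
  }
  ~ScopedEvent() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << ": " << us << " us\n";  // event closes with scope
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  std::map<int, const void *> dev_ctxes{{0, nullptr}};
  // Mirrors the new call site: no size() guard before begin()->second.
  ScopedEvent record_event("all_reduce", dev_ctxes.begin()->second);
  // ... the body of RunImpl would execute here, covered by the event ...
}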
@@ -22,11 +22,7 @@ namespace framework {
 namespace details {
 void BroadcastOpHandle::RunImpl() {
-  if (dev_ctxes_.size() > 0UL) {
-    platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
-  } else {
-    platform::RecordEvent record_event(Name(), nullptr);
-  }
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
   if (places_.size() == 1) return;
@@ -87,12 +87,6 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
 }
 void DataBalanceOpHandle::RunImpl() {
-  if (dev_ctxes_.size() > 0UL) {
-    platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
-  } else {
-    platform::RecordEvent record_event(Name(), nullptr);
-  }
   PADDLE_ENFORCE_GT(places_.size(), 1,
                     "Data balance can only be enabled when the number of "
                     "places to run larger than 1.");
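For DataBalanceOpHandle the event is removed outright, leaving PADDLE_ENFORCE_GT as the first statement of RunImpl. A hedged sketch of what such a greater-than enforcement does; ENFORCE_GT here is a simplified illustration, not Paddle's implementation:

#include <cstddef>
#include <iostream>
#include <sstream>
#include <stdexcept>

// Simplified illustration of a PADDLE_ENFORCE_GT-style check: fail fast
// with a readable message instead of running the op on invalid input.
#define ENFORCE_GT(lhs, rhs, msg)                                    \
  do {                                                               \
    if (!((lhs) > (rhs))) {                                          \
      std::ostringstream os;                                         \
      os << "Enforce failed: " #lhs " > " #rhs " (" << (lhs)         \
         << " vs " << (rhs) << "). " << (msg);                       \
      throw std::runtime_error(os.str());                            \
    }                                                                \
  } while (0)

int main() {
  std::size_t num_places = 1;
  try {
    // Fails, as in a single-place run: data balance needs > 1 place.
    ENFORCE_GT(num_places, std::size_t{1},
               "Data balance can only be enabled when the number of "
               "places to run is larger than 1.");
  } catch (const std::runtime_error &e) {
    std::cerr << e.what() << "\n";
  }
  return 0;
}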
@@ -431,10 +431,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
         CreateReduceOp(&result, g_name, cur_device_id);
         graph->Get<ShardedVarDevice>(kShardedVarDevice)
             .emplace(g_name, cur_device_id);
-        if (!is_dist_train) {
-          // will send gradients directly when distributed training
-          bcast_var_name_set[cur_device_id].emplace(p_name);
-        }
+        bcast_var_name_set[cur_device_id].emplace(p_name);
         break;
       case BuildStrategy::ReduceStrategy::kAllReduce:
         if (IsSparseGradient(g_name)) {
@@ -461,9 +458,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
-  if ((use_gpu &&
-       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
-      is_dist_train) {
+  if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce &&
+      !is_dist_train) {
     // Insert BCast Ops
     for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
       auto &to_bcast_set = bcast_var_name_set[dev_id];
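Taken together, the last two hunks always record the parameter in bcast_var_name_set but skip inserting the broadcast ops as a whole during distributed training, where gradients are sent directly (per the removed comment) rather than broadcast across local devices. A sketch of the resulting predicate; names mirror the diff, and the enum is pared down to the two strategies shown:

#include <iostream>

enum class ReduceStrategy { kAllReduce, kReduce };

// Mirrors the post-commit condition: broadcast parameters across local
// devices only for GPU builds using kReduce that are NOT distributed
// training; distributed runs send gradients through send ops instead.
bool ShouldInsertBcastOps(bool use_gpu, ReduceStrategy reduce,
                          bool is_dist_train) {
  return use_gpu && reduce == ReduceStrategy::kReduce && !is_dist_train;
}

int main() {
  // Before this commit the condition was
  //   (use_gpu && reduce == kReduce) || is_dist_train,
  // so distributed runs also reached the broadcast-insertion loop.
  std::cout
      << ShouldInsertBcastOps(true, ReduceStrategy::kReduce, false)     // 1
      << ShouldInsertBcastOps(true, ReduceStrategy::kReduce, true)      // 0
      << ShouldInsertBcastOps(true, ReduceStrategy::kAllReduce, false)  // 0
      << "\n";
}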