diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 75143b9a1a0c85a24de337ad02afeea1112ca85c..afd0b70c29b99505e04a5cefb9fce5a546c1d0ed 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -126,6 +126,9 @@ void BroadcastOpHandle::BroadcastOneVar(
             &VariableVisitor::GetMutableTensor(out_var));
       }
     });
+    for (auto &p : places_) {
+      nccl_ctxs_->DevCtx(p)->Wait();
+    }
 #else
     PADDLE_THROW("CUDA is not enabled.");
 #endif
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index d14ed36e28a7907a0b9255ed46e55ac72896cd12..216fb66c034a0980b68641004077e4d50b19983c 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -278,12 +278,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
 #else
                                 const bool use_cuda) const {
 #endif
-  VLOG(3) << "apply all passes";
+  VLOG(1) << "apply all passes";
   // Create a default one if not finalized by user.
   CreatePassesFromStrategy(false);
 
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
-    VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type();
+    VLOG(1) << "BuildStrategy::Apply pass:" << pass->Type();
     if (IsMultiDevPass(pass->Type())) {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
@@ -349,11 +349,11 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
         continue;
       }
     }
-    VLOG(3) << "Start Apply Pass " << pass->Type();
+    VLOG(1) << "Start Apply Pass " << pass->Type();
     graph = pass->Apply(graph);
-    VLOG(3) << "Finish Apply Pass " << pass->Type();
+    VLOG(1) << "Finish Apply Pass " << pass->Type();
   }
-  VLOG(3) << "All Passes Applied";
+  VLOG(1) << "All Passes Applied";
   return graph;
 }
 
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 929cb51b8454b0220c5dbe6a7b82af6af06c1d53..47409b89bcfe81b814af4fc59c65668aa4f3804a 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -98,7 +98,7 @@ struct BuildStrategy {
   // faster. Because fusing broadcast OP equals delaying the execution of all
   // broadcast Ops, in this case, all nccl streams are used only for reduce
   // operations for a period of time.
-  bool fuse_broadcast_ops_{false};
+  bool fuse_broadcast_ops_{true};
 
   // replace batch_norm with sync_batch_norm.
   bool sync_batch_norm_{false};
diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc
index 457de41c8f6a84cf81798c71b2366fb1d989b9de..8355764aa6c983ace203906190e6cc6d86b500dd 100644
--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ b/paddle/fluid/framework/ir/pass_builder.cc
@@ -21,7 +21,7 @@ namespace framework {
 namespace ir {
 
 std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
-  VLOG(3) << "Append " << pass_type;
+  VLOG(1) << "Append " << pass_type;
   auto pass = ir::PassRegistry::Instance().Get(pass_type);
   passes_.emplace_back(pass.release());
   return passes_.back();
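
The `broadcast_op_handle.cc` hunk makes the handle block until every device's context has drained before anything downstream can observe the broadcast result. Below is a minimal standalone sketch of that pattern using plain CUDA runtime streams, not Paddle's `NCCLContextMap`/`DeviceContext` API; the per-device buffer and the async memset are hypothetical stand-ins for the asynchronous broadcast.

```cpp
// Sketch only: queue async work on one stream per device, then wait on every
// stream -- the same effect BroadcastOpHandle gets from DevCtx(p)->Wait().
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);
  std::vector<cudaStream_t> streams(ndev);
  std::vector<float*> bufs(ndev);

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaStreamCreate(&streams[i]);
    cudaMalloc((void**)&bufs[i], 1 << 20);
    // Stand-in for the asynchronous broadcast: an async memset on the stream.
    cudaMemsetAsync(bufs[i], 0, 1 << 20, streams[i]);
  }

  // The pattern added by the diff: block until every device has finished, so
  // later ops cannot read the broadcast result before it is complete.
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
  }

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaFree(bufs[i]);
    cudaStreamDestroy(streams[i]);
  }
  printf("all %d streams drained\n", ndev);
  return 0;
}
```

The trade-off is the same in both settings: synchronizing every stream is safe but serializes the tail of the broadcast against whatever is queued next, giving up some overlap.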
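The remaining hunks lower the pass-related logging from `VLOG(3)` to `VLOG(1)`. With glog (which Paddle uses), a `VLOG(n)` message is emitted only when the runtime verbosity (`--v` flag or `GLOG_v` environment variable) is at least `n`, so moving these messages to level 1 surfaces them at a lower verbosity setting. A minimal standalone sketch, assuming only glog:

```cpp
// Sketch of glog's VLOG gating: a message prints only when its level is
// <= the runtime verbosity (--v flag or GLOG_v environment variable).
#include <glog/logging.h>

int main(int argc, char* argv[]) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;  // VLOG is INFO severity; route it to stderr

  VLOG(1) << "visible with GLOG_v=1 or higher";      // new level of the pass logs
  VLOG(3) << "visible only with GLOG_v=3 or higher"; // old level
  return 0;
}
```

Running with `GLOG_v=1` prints only the first message; `GLOG_v=3` prints both.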