diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 75143b9a1a0c85a24de337ad02afeea1112ca85c..afd0b70c29b99505e04a5cefb9fce5a546c1d0ed 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -126,6 +126,9 @@ void BroadcastOpHandle::BroadcastOneVar(
             &VariableVisitor::GetMutableTensor(out_var));
       }
     });
+    for (auto &p : places_) {
+      nccl_ctxs_->DevCtx(p)->Wait();
+    }
 #else
     PADDLE_THROW("CUDA is not enabled.");
 #endif
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index d14ed36e28a7907a0b9255ed46e55ac72896cd12..216fb66c034a0980b68641004077e4d50b19983c 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -278,12 +278,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
 #else
                                 const bool use_cuda) const {
 #endif
-  VLOG(3) << "apply all passes";
+  VLOG(1) << "apply all passes";
   // Create a default one if not finalized by user.
   CreatePassesFromStrategy(false);
 
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
-    VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type();
+    VLOG(1) << "BuildStrategy::Apply pass:" << pass->Type();
     if (IsMultiDevPass(pass->Type())) {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
@@ -349,11 +349,11 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
         continue;
       }
     }
-    VLOG(3) << "Start Apply Pass " << pass->Type();
+    VLOG(1) << "Start Apply Pass " << pass->Type();
     graph = pass->Apply(graph);
-    VLOG(3) << "Finish Apply Pass " << pass->Type();
+    VLOG(1) << "Finish Apply Pass " << pass->Type();
   }
-  VLOG(3) << "All Passes Applied";
+  VLOG(1) << "All Passes Applied";
   return graph;
 }
 
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 929cb51b8454b0220c5dbe6a7b82af6af06c1d53..47409b89bcfe81b814af4fc59c65668aa4f3804a 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -98,7 +98,7 @@ struct BuildStrategy {
   // faster. Because fusing broadcast OP equals delaying the execution of all
   // broadcast Ops, in this case, all nccl streams are used only for reduce
   // operations for a period of time.
-  bool fuse_broadcast_ops_{false};
+  bool fuse_broadcast_ops_{true};
 
   // replace batch_norm with sync_batch_norm.
   bool sync_batch_norm_{false};
diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc
index 457de41c8f6a84cf81798c71b2366fb1d989b9de..8355764aa6c983ace203906190e6cc6d86b500dd 100644
--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ b/paddle/fluid/framework/ir/pass_builder.cc
@@ -21,7 +21,7 @@ namespace framework {
 namespace ir {
 
 std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
-  VLOG(3) << "Append " << pass_type;
+  VLOG(1) << "Append " << pass_type;
   auto pass = ir::PassRegistry::Instance().Get(pass_type);
   passes_.emplace_back(pass.release());
   return passes_.back();
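
The `broadcast_op_handle.cc` hunk makes the handle block until every device's context has drained before anything downstream can observe the broadcast result. Below is a minimal standalone sketch of that pattern using plain CUDA runtime streams, not Paddle's `NCCLContextMap`/`DeviceContext` API; the per-device buffer and the async memset are hypothetical stand-ins for the asynchronous broadcast.

```cpp
// Sketch only: queue async work on one stream per device, then wait on every
// stream -- the same effect BroadcastOpHandle gets from DevCtx(p)->Wait().
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);
  std::vector<cudaStream_t> streams(ndev);
  std::vector<float*> bufs(ndev);

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaStreamCreate(&streams[i]);
    cudaMalloc((void**)&bufs[i], 1 << 20);
    // Stand-in for the asynchronous broadcast: an async memset on the stream.
    cudaMemsetAsync(bufs[i], 0, 1 << 20, streams[i]);
  }

  // The pattern added by the diff: block until every device has finished, so
  // later ops cannot read the broadcast result before it is complete.
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
  }

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaFree(bufs[i]);
    cudaStreamDestroy(streams[i]);
  }
  printf("all %d streams drained\n", ndev);
  return 0;
}
```

The trade-off is the same in both settings: synchronizing every stream is safe but serializes the tail of the broadcast against whatever is queued next, giving up some overlap.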
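The remaining hunks lower the pass-related logging from `VLOG(3)` to `VLOG(1)`. With glog (which Paddle uses), a `VLOG(n)` message is emitted only when the runtime verbosity (`--v` flag or `GLOG_v` environment variable) is at least `n`, so moving these messages to level 1 surfaces them at a lower verbosity setting. A minimal standalone sketch, assuming only glog:

```cpp
// Sketch of glog's VLOG gating: a message prints only when its level is
// <= the runtime verbosity (--v flag or GLOG_v environment variable).
#include <glog/logging.h>

int main(int argc, char* argv[]) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;  // VLOG is INFO severity; route it to stderr

  VLOG(1) << "visible with GLOG_v=1 or higher";      // new level of the pass logs
  VLOG(3) << "visible only with GLOG_v=3 or higher"; // old level
  return 0;
}
```

Running with `GLOG_v=1` prints only the first message; `GLOG_v=3` prints both.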