diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 5811693b7ce6d6f2aebc9a8896960226295bd3e5..cc48c51e924039d93b2e1e18bea752611e7bef92 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -91,7 +91,11 @@ struct BuildStrategy { bool enable_sequential_execution_{false}; - bool fuse_broadcast_op_{false}; + // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program + // faster: fusing the broadcast ops is equivalent to delaying the execution + // of all broadcast ops, so for a period of time all NCCL streams are used + // only for reduce operations. + bool fuse_broadcast_ops_{false}; // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 81e200c0dae4484a46afa16e69db68ff746484c6..6c8b8937ebe646042f71cb58cfbc2d32426a4e3c 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -63,7 +63,8 @@ void FetchOpHandle::RunImpl() { auto &t = var->Get(); if (platform::is_gpu_place(t.place())) { #ifdef PADDLE_WITH_CUDA - TensorCopySync(t, cpu, &tensors_[i]); + TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]); + dev_ctxes_.at(t.place())->Wait(); #endif } else { tensors_[i].ShareDataWith(t); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index e22bd3917895be8d84a83b3986c6919564b2ddab..f213e07b555ca9fc4b73a2f91412063f4e7f47d4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -658,7 +658,7 @@ bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, void 
ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { if (UseGPU()) { - if (strategy_.fuse_broadcast_op_) { + if (strategy_.fuse_broadcast_ops_) { CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { @@ -1021,7 +1021,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { return; } - if (strategy_.fuse_broadcast_op_) { + if (strategy_.fuse_broadcast_ops_) { CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5b79b759d555934bbc40a03da316d138f4e81a99..044677fb756e0368c65b84f15fdf2540abbd14b8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1299,7 +1299,20 @@ All parameter, weight, gradient are variables in Paddle. to fuse relu and depthwise_conv2d, it will save GPU memory and may make the execution faster. This options is only available in GPU devices. - Default False)DOC") + Default False.)DOC") + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { return self.fuse_broadcast_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finalized."); + self.fuse_broadcast_ops_ = b; + }, + R"DOC(The type is BOOL, fuse_broadcast_ops indicates whether + to fuse the broadcast ops. Note that, in Reduce mode, + fusing broadcast ops may make the program faster. Because + fusing broadcast OP equals delaying the execution of all + broadcast Ops, in this case, all nccl streams are used only + for NCCLReduce operations for a period of time. Default False.)DOC") .def_property("fuse_all_optimizer_ops", [](const BuildStrategy &self) { return self.fuse_all_optimizer_ops_;