Add Stream for fetch op handle (#16600)

* expose fuse broadcast ops

Add Stream for fetch op handle (#16600)
* expose fuse broadcast ops
b75a69ba · chengduo · GitHub · 1342e2ea · b75a69ba · b75a69ba
4 changed file
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -91,7 +91,11 @@ struct BuildStrategy {
  bool enable_sequential_execution_{false};
-  bool fuse_broadcast_op_{false};
+  // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
+  // faster. Because fusing broadcast OP equals delaying the execution of all
+  // broadcast Ops, in this case, all nccl streams are used only for reduce
+  // operations for a period of time.
+  bool fuse_broadcast_ops_{false};
  // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
  // num_trainers is 1, so the current fields of build_strategy doesn't tell if

--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -63,7 +63,8 @@ void FetchOpHandle::RunImpl() {
    auto &t = var->Get<framework::LoDTensor>();
    if (platform::is_gpu_place(t.place())) {
 #ifdef PADDLE_WITH_CUDA
-      TensorCopySync(t, cpu, &tensors_[i]);
+      TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]);
+      dev_ctxes_.at(t.place())->Wait();
 #endif
    } else {
      tensors_[i].ShareDataWith(t);

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -658,7 +658,7 @@ bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
 void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
  if (UseGPU()) {
-    if (strategy_.fuse_broadcast_op_) {
+    if (strategy_.fuse_broadcast_ops_) {
      CreateFusedBroadcastOp(result, bcast_var_name_set_);
    } else {
      for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
@@ -1021,7 +1021,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
        strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
      return;
    }
-    if (strategy_.fuse_broadcast_op_) {
+    if (strategy_.fuse_broadcast_ops_) {
      CreateFusedBroadcastOp(result, bcast_var_name_set_);
    } else {
      for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1299,7 +1299,20 @@ All parameter, weight, gradient are variables in Paddle.
                      to fuse relu and depthwise_conv2d,
                      it will save GPU memory and may make the execution faster.
                      This options is only available in GPU devices.
-                      Default False)DOC")
+                      Default False.)DOC")
+      .def_property(
+          "fuse_broadcast_ops",
+          [](const BuildStrategy &self) { return self.fuse_broadcast_ops_; },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
+            self.fuse_broadcast_ops_ = b;
+          },
+          R"DOC(The type is BOOL, fuse_broadcast_op indicates whether
+                      to fuse the broadcast ops. Note that, in Reduce mode,
+                      fusing broadcast ops may make the program faster. Because
+                      fusing broadcast OP equals delaying the execution of all
+                      broadcast Ops, in this case, all nccl streams are used only
+                      for NCCLReduce operations for a period of time. Default False.)DOC")
      .def_property("fuse_all_optimizer_ops",
                    [](const BuildStrategy &self) {
                      return self.fuse_all_optimizer_ops_;