Commit 79be0604 authored by Yu Yang

Support CPU/GPU mixture for ParallelExecutor

Parent 7083c2a6
......@@ -116,6 +116,19 @@ void NCCLAllReduceOpHandle::RunImpl() {
      // Reduce All Tensor to trg in CPU
      ReduceLoDTensor func(lod_tensors, &trg);
      VisitDataType(ToDataType(lod_tensors[0].type()), func);

      // Scatter the CPU-reduced result back to every participating place.
      for (size_t i = 0; i < local_scopes_.size(); ++i) {
        auto &scope = local_scopes_[i];
        auto &p = places_[i];
        auto *var = scope->FindVar(var_name);
        auto *dev_ctx = dev_ctxes_[p];

        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
          auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
          auto &tensor_cpu = trg;
          TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
        });
      }
    }
  }
}
......
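In the hunk above the gradient tensors live on the CPU, so they are reduced once into the host tensor trg and the result is then copied out to every participating place through the new per-place RunAndRecordEvent overload. The sketch below is a framework-agnostic illustration of that reduce-then-scatter pattern, not Paddle code: Device and run_and_record are hypothetical stand-ins for Paddle's device contexts and event-recording helper.

#include <functional>
#include <iostream>
#include <vector>

// Hypothetical stand-in for a per-device context; a real implementation
// would enqueue a host-to-device memcpy on the device's stream.
struct Device {
  int id;
  void CopyFromHost(const std::vector<float> &src,
                    std::vector<float> *dst) const {
    *dst = src;  // simulate the host-to-device copy
  }
};

int main() {
  // Per-device partial gradients, already gathered on the host.
  std::vector<std::vector<float>> partials = {{1, 2}, {3, 4}, {5, 6}};
  std::vector<Device> devices = {{0}, {1}, {2}};
  std::vector<std::vector<float>> dev_buffers(devices.size());

  // Step 1: reduce all partials into one CPU buffer ("trg" in the diff).
  std::vector<float> trg(partials[0].size(), 0.f);
  for (auto &t : partials)
    for (size_t j = 0; j < t.size(); ++j) trg[j] += t[j];

  // Step 2: scatter the reduced result back, one task per place.
  for (size_t i = 0; i < devices.size(); ++i) {
    auto run_and_record = [](const std::function<void()> &cb) { cb(); };
    run_and_record([&, i] { devices[i].CopyFromHost(trg, &dev_buffers[i]); });
  }

  std::cout << "reduced[0] on device 0: " << dev_buffers[0][0] << "\n";  // 9
  return 0;
}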
......@@ -107,6 +107,22 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
#endif
}

void OpHandleBase::RunAndRecordEvent(platform::Place p,
                                     const std::function<void()> &callback) {
  if (platform::is_cpu_place(p) || events_.empty()) {
    // CPU place, or no events to manage: run the callback synchronously.
    callback();
  } else {
#ifdef PADDLE_WITH_CUDA
    // GPU place: run the callback on that place's CUDA context and record
    // the per-device event so downstream op handles can wait on it.
    auto *ctx = dev_ctxes_.at(p);
    auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
    cuda_ctx->RecordEvent(events_.at(boost::get<platform::CUDAPlace>(p).device),
                          callback);
#else
    PADDLE_THROW("Not implemented");
#endif
  }
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
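The new overload dispatches on the place: for a CPU place (or when the handle manages no events) the callback simply runs inline, while for a GPU place it runs through that place's CUDADeviceContext, which also records the per-device event so later handles can synchronize on it. Below is a minimal, compilable sketch of that dispatch rule only; Place, FakeEvent and RunAndRecordEventSketch are hypothetical stand-ins, not the real Paddle types.

#include <functional>
#include <iostream>
#include <map>

struct Place { bool is_gpu; int device; };     // stand-in for platform::Place
struct FakeEvent { bool recorded = false; };   // stand-in for a CUDA event

void RunAndRecordEventSketch(const Place &p, std::map<int, FakeEvent> &events,
                             const std::function<void()> &callback) {
  if (!p.is_gpu || events.empty()) {
    callback();  // CPU path: just run, nothing to record.
  } else {
    callback();                           // GPU path: run the work…
    events.at(p.device).recorded = true;  // …then record that device's event.
  }
}

int main() {
  std::map<int, FakeEvent> events;
  events[0] = FakeEvent{};
  events[1] = FakeEvent{};
  RunAndRecordEventSketch({false, -1}, events, [] { std::cout << "cpu op\n"; });
  RunAndRecordEventSketch({true, 1}, events, [] { std::cout << "gpu op\n"; });
  std::cout << std::boolalpha << events.at(1).recorded << "\n";  // true
  return 0;
}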
......@@ -64,6 +64,9 @@ class OpHandleBase {
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);

  void RunAndRecordEvent(platform::Place p,
                         const std::function<void()> &callback);

  virtual void RunImpl() = 0;
};
......
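The header hunk exposes the per-place overload as a protected helper, so any op handle subclass can wrap its device-specific work in it from RunImpl(). A hedged usage sketch follows, built on simplified stand-in types; OpHandleBaseSketch, BroadcastHandleSketch and Place are hypothetical, not the real Paddle classes.

#include <functional>
#include <iostream>
#include <utility>
#include <vector>

struct Place { bool is_gpu; int device; };

// Stand-in for OpHandleBase, keeping only the pieces relevant here.
class OpHandleBaseSketch {
 public:
  virtual ~OpHandleBaseSketch() = default;
  void Run() { RunImpl(); }

 protected:
  // Simplified: the real overload also records a CUDA event on GPU places.
  void RunAndRecordEvent(const Place & /*p*/,
                         const std::function<void()> &cb) { cb(); }
  virtual void RunImpl() = 0;
};

// Hypothetical derived handle: RunImpl() wraps per-place work in the helper.
class BroadcastHandleSketch : public OpHandleBaseSketch {
 public:
  explicit BroadcastHandleSketch(std::vector<Place> places)
      : places_(std::move(places)) {}

 protected:
  void RunImpl() override {
    for (auto &p : places_) {
      RunAndRecordEvent(p, [&p] {
        std::cout << (p.is_gpu ? "GPU place " : "CPU place ") << p.device
                  << "\n";
      });
    }
  }

 private:
  std::vector<Place> places_;
};

int main() {
  BroadcastHandleSketch h({{false, -1}, {true, 0}, {true, 1}});
  h.Run();
  return 0;
}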