Add No Mutex

c99fca5f · chengduoZH · 13de7238 · c99fca5f · c99fca5f · c99fca5f
5 changed files
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -103,6 +103,13 @@ void BroadcastOpHandle::RunImpl() {
          });
    }
+    // FIXME(zcd): a temporary fix for some language models that have sparse
+    // parameters.
+    bool use_mutex = true;
+    if (in_var->IsType<paddle::framework::SelectedRows>()) {
+      use_mutex = false;
+    }
+    if (use_mutex) {
      this->RunAndRecordEvent([&] {
        {
          platform::NCCLGroupGuard guard;
@@ -120,6 +127,26 @@ void BroadcastOpHandle::RunImpl() {
              &VariableVisitor::GetMutableTensor(out_var));
        }
      });
+    } else {
+      this->RunAndRecordEventNoMutex([&] {
+        {
+          platform::NCCLGroupGuard guard;
+          for (auto &call : broadcast_calls) {
+            call();
+          }
+        }
+        if (!out_handle->IsTheSameVar(*in_var_handle)) {
+          auto out_var = var_scopes.at(in_var_handle->scope_idx_)
+                             ->FindVar(out_var_handles[0]->name_);
+          paddle::framework::TensorCopy(
+              in_tensor, in_var_handle->place_,
+              *(dev_ctxes_.at(in_var_handle->place_)),
+              &VariableVisitor::GetMutableTensor(out_var));
+        }
+      });
+    }
 #else
    PADDLE_THROW("CUDA is not enabled.");
 #endif

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -139,6 +139,29 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 #endif
 }
+void OpHandleBase::RunAndRecordEventNoMutex(
+    const std::function<void()> &callback) {  // Runs callback, then records events.
+#ifdef PADDLE_WITH_CUDA
+  if (!events_.empty()) {  // Events exist: wrap callback once per device context.
+    std::function<void()> method = callback;  // innermost call is callback itself
+    for (auto &p : dev_ctxes_) {
+      method = [method, p, this]() {  // copies the previous wrapper and p by value
+        static_cast<platform::CUDADeviceContext *>(p.second)
+            ->RecordEventNoMutex(
+                events_.at(boost::get<platform::CUDAPlace>(p.first).device),
+                method);
+      };
+    }
+    method();  // outermost wrapper belongs to the last context iterated
+  } else {
+#endif
+    callback();  // no events (or non-CUDA build): just run the callback
+#ifdef PADDLE_WITH_CUDA
+  }
+#endif
+}
 void OpHandleBase::RunAndRecordEvent(platform::Place p,
                                     const std::function<void()> &callback) {
 #ifdef PADDLE_WITH_CUDA

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -85,6 +85,10 @@ class OpHandleBase {
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);
+  // FIXME(zcd): Temporary fix for language models that have sparse parameters.
+  // Runs callback; if CUDA events exist, records them via RecordEventNoMutex.
+  void RunAndRecordEventNoMutex(const std::function<void()> &callback);
  void RunAndRecordEvent(platform::Place p,
                         const std::function<void()> &callback);

--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -80,7 +80,9 @@ void ReduceOpHandle::RunImpl() {
  }
  if (pre_in_var->IsType<framework::SelectedRows>()) {
-    this->RunAndRecordEvent([&] {
+    // FIXME(zcd): A temporary fix for some language models that have sparse
+    // parameters.
+    this->RunAndRecordEventNoMutex([&] {
      std::vector<const SelectedRows *> in_selected_rows =
          GetInputValues<SelectedRows>(in_var_handles, var_scopes);
      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -106,6 +106,14 @@ class CUDADeviceContext : public DeviceContext {
    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
  }
+  // FIXME(zcd): Temporary fix for language models that have sparse parameters.
+  // Invokes callback, then records ev on stream_; takes no lock of its own.
+  template <typename Callback>
+  void RecordEventNoMutex(cudaEvent_t ev, Callback callback) {
+    callback();  // run the caller's work first
+    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));  // then record ev on stream_
+  }
 private:
  CUDAPlace place_;