Fix Transformer Hang Problem

6b20b355 · Yu Yang · 5a4d9328 · 6b20b355 · 6b20b355 · 6b20b355
9 changed files
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -35,7 +35,9 @@ void ComputationOpHandle::RunImpl() {
    }
  }
-  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  this->RunAndRecordEvent([this] {
+    op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  });
 }
 std::string ComputationOpHandle::Name() const { return op_->Type(); }

--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -69,10 +69,12 @@ void NCCLAllReduceOpHandle::RunImpl() {
      });
    }
-    platform::NCCLGroupGuard guard;
+    this->RunAndRecordEvent([&] {
-    for (auto &call : all_reduce_calls) {
+      platform::NCCLGroupGuard guard;
-      call();
+      for (auto &call : all_reduce_calls) {
-    }
+        call();
+      }
+    });
  }
 }

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -54,17 +54,6 @@ void OpHandleBase::Run(bool use_event) {
 #endif
  RunImpl();
-#ifdef PADDLE_WITH_CUDA
-  if (use_event) {
-    for (auto &p : dev_ctxes_) {
-      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
-      auto stream =
-          static_cast<platform::CUDADeviceContext *>(p.second)->stream();
-      PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
-    }
-  }
-#endif
 }
 void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
@@ -97,6 +86,27 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
  out->generated_op_ = this;
 }
+void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {  // Runs callback; if events_ is non-empty, each entry of dev_ctxes_ records its event after it.
+#ifdef PADDLE_WITH_CUDA
+  if (!events_.empty()) {  // Event mode: wrap callback so every device context records its event.
+    std::function<void()> method = callback;  // innermost step of the chain is the caller's callback
+    for (auto &p : dev_ctxes_) {
+      method = [method, p, this]() {  // captures the previous step and p by copy, adding one RecordEvent layer per context
+        static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
+            events_.at(boost::get<platform::CUDAPlace>(p.first).device),
+            method);
+      };
+    }
+    method();  // invoke the outermost layer; the nested calls reach callback exactly once
+  } else {
+#endif
+    callback();  // no events (or a build without PADDLE_WITH_CUDA): just run the callback
+#ifdef PADDLE_WITH_CUDA
+  }
+#endif
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -62,6 +62,8 @@ class OpHandleBase {
  virtual bool IsMultiDeviceTransfer() { return false; }
 protected:
+  void RunAndRecordEvent(const std::function<void()> &callback);
  virtual void RunImpl() = 0;
 };

--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include <string>
 namespace paddle {
 namespace framework {
 namespace details {
@@ -37,11 +39,13 @@ void ScaleLossGradOpHandle::RunImpl() {
    *tmp = coeff_;
  } else {
 #ifdef PADDLE_WITH_CUDA
-    auto stream =
+    this->RunAndRecordEvent([&] {
-        static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
+      auto stream =
-            ->stream();
+          static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
-    memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
+              ->stream();
-                 platform::CPUPlace(), &coeff_, sizeof(float), stream);
+      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
+                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
+    });
 #endif
  }
 }

--- a/paddle/fluid/framework/details/send_op_handle.cc
+++ b/paddle/fluid/framework/details/send_op_handle.cc
@@ -34,7 +34,7 @@ void SendOpHandle::RunImpl() {
    }
    in->generated_op_->Wait(dev_ctxes_[p]);
  }
-  op_->Run(*local_scope_, place_);
+  this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); });
 }
 std::string SendOpHandle::Name() const { return "send"; }

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -196,10 +196,12 @@ void ThreadedSSAGraphExecutor::RunOp(
    BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
  auto op_run = [ready_var_q, op, this] {
    try {
-      VLOG(10) << op->Name() << " : " << op->DebugString();
+      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
      op->Run(use_event_);
+      VLOG(10) << op << " " << op->Name() << " Done ";
      running_ops_--;
      ready_var_q->Extend(op->outputs_);
+      VLOG(10) << op << " " << op->Name() << "Signal posted";
    } catch (platform::EnforceNotMet ex) {
      exception_.reset(new platform::EnforceNotMet(ex));
    } catch (...) {

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -175,7 +175,7 @@ CUDADeviceContext::~CUDADeviceContext() {
 Place CUDADeviceContext::GetPlace() const { return place_; }
 void CUDADeviceContext::Wait() const {
-  std::lock_guard<std::mutex> guard(mutex_);
+  std::lock_guard<std::recursive_mutex> guard(mutex_);  // NOTE(review): presumably recursive so a callback run under RecordEvent (which holds mutex_) can re-enter Wait() on the same thread — confirm
   PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
   PADDLE_ENFORCE(cudaGetLastError());
 }

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -98,13 +98,20 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief  Return cuda stream in the device context. */
  cudaStream_t stream() const;
+  template <typename Callback>
+  void RecordEvent(cudaEvent_t ev, Callback callback) {  // Runs callback, then records ev on stream_, all while holding mutex_.
+    std::lock_guard<std::recursive_mutex> guard(mutex_);  // held until return, so callback executes under the lock
+    callback();
+    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));  // recorded only after callback has returned
+  }
 private:
  CUDAPlace place_;
  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
-  mutable std::mutex mutex_;
+  mutable std::recursive_mutex mutex_;
  cudaStream_t stream_;
  cudnnHandle_t cudnn_handle_;
  cublasHandle_t cublas_handle_;