diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index e3f8bbb72f2a1b75b6041d41496cef0efc81874f..ff6d91c1dafb0ab4cabb1646cc333e19a89eb812 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -35,7 +35,9 @@ void ComputationOpHandle::RunImpl() { } } - op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); + this->RunAndRecordEvent([this] { + op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); + }); } std::string ComputationOpHandle::Name() const { return op_->Type(); } diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index 55b5f113589e090386d287e228349f22fb94a7ab..0611ec6376d2097c85dad1e5d7430c7b0713a385 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -69,10 +69,12 @@ void NCCLAllReduceOpHandle::RunImpl() { }); } - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); - } + this->RunAndRecordEvent([&] { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + }); } } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index e4194a7442f677ec8970dbc387bb01ebbbf579f1..846bc21be27cf6c889ae34b967b8bff3c60ab743 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -54,17 +54,6 @@ void OpHandleBase::Run(bool use_event) { #endif RunImpl(); - -#ifdef PADDLE_WITH_CUDA - if (use_event) { - for (auto &p : dev_ctxes_) { - int dev_id = boost::get(p.first).device; - auto stream = - static_cast(p.second)->stream(); - PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream)); - } - } -#endif } void OpHandleBase::Wait(platform::DeviceContext *waited_dev) { @@ -97,6 +86,27 @@ void OpHandleBase::AddOutput(VarHandleBase *out) { out->generated_op_ = this; } +void OpHandleBase::RunAndRecordEvent(const std::function &callback) { +#ifdef PADDLE_WITH_CUDA + if (!events_.empty()) { // Use event + std::function method = callback; + + for (auto &p : dev_ctxes_) { + method = [method, p, this]() { + static_cast(p.second)->RecordEvent( + events_.at(boost::get(p.first).device), + method); + }; + } + method(); + } else { +#endif + callback(); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index fbdb54ba8d940c8dedd44a42a85825af5d2ec664..1aacba5a4c3c959b6f584aa5f4dcdc5c0dc43e76 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -62,6 +62,8 @@ class OpHandleBase { virtual bool IsMultiDeviceTransfer() { return false; } protected: + void RunAndRecordEvent(const std::function &callback); + virtual void RunImpl() = 0; }; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 0a6f6129b812ca84db7573957b1ee0a32c1ef5c4..7fb9f99a8a1bc044e2f25f373265a5ec9f7d76d5 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include + namespace paddle { namespace framework { namespace details { @@ -37,11 +39,13 @@ void ScaleLossGradOpHandle::RunImpl() { *tmp = coeff_; } else { #ifdef PADDLE_WITH_CUDA - auto stream = - static_cast(this->dev_ctxes_[place_]) - ->stream(); - memory::Copy(boost::get(place_), tmp, - platform::CPUPlace(), &coeff_, sizeof(float), stream); + this->RunAndRecordEvent([&] { + auto stream = + static_cast(this->dev_ctxes_[place_]) + ->stream(); + memory::Copy(boost::get(place_), tmp, + platform::CPUPlace(), &coeff_, sizeof(float), stream); + }); #endif } } diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc index d181607e86372f4872c38bc35db786ac142ccc65..549b9d9abbe5bfd17df3509e0442bfa19b7ecd61 100644 --- a/paddle/fluid/framework/details/send_op_handle.cc +++ b/paddle/fluid/framework/details/send_op_handle.cc @@ -34,7 +34,7 @@ void SendOpHandle::RunImpl() { } in->generated_op_->Wait(dev_ctxes_[p]); } - op_->Run(*local_scope_, place_); + this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); }); } std::string SendOpHandle::Name() const { return "send"; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 1ce69ab02b09fe7ec17f479bcef97c931e853dc4..a371ee10fe03cda86c316f3503f9cadb8c716ae5 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -196,10 +196,12 @@ void ThreadedSSAGraphExecutor::RunOp( BlockingQueue *ready_var_q, details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { try { - VLOG(10) << op->Name() << " : " << op->DebugString(); + VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); op->Run(use_event_); + VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->outputs_); + VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EnforceNotMet ex) { exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index f03165fae5ca16c5c263ce0683af7ec56e6a3766..1f733d71bdfb777d4a2f316a5fefc3c874879862 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -175,7 +175,7 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { - std::lock_guard guard(mutex_); + std::lock_guard guard(mutex_); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b17558337914e0ca8fdba283edf4024d94e85f0f..a9c1984616bc731e0557f2cb89282423aa9c3bac 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -98,13 +98,20 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; + template + void RecordEvent(cudaEvent_t ev, Callback callback) { + std::lock_guard guard(mutex_); + callback(); + PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); + } + private: CUDAPlace place_; std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; - mutable std::mutex mutex_; + mutable std::recursive_mutex mutex_; cudaStream_t stream_; cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_;