diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index cdac00739bc48648b41751e644a953d0d310ffbf..0c8acf71bfa0814e66560258ad6131c743ebc81b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -136,6 +136,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); VLOG(10) << "+ " << DebugStringEx(&scope); } @@ -639,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); - // For profiling, don't move out of this function because that will result - // in the failure of multi-GPU profiling. - platform::RecordEvent record_event(Type(), dev_ctx); // check if op[type] has kernel registered. auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc index bcb3e63ed7dbc775c1de6c4522f0548ea48a6cf0..dc7ef664958238ddbd48745bd59cc7db28e49f5b 100644 --- a/paddle/fluid/operators/feed_op.cc +++ b/paddle/fluid/operators/feed_op.cc @@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase { const platform::Place &place) const override { // get device context from pool auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - platform::RecordEvent record_event(Type(), dev_ctx); auto feed_var_name = Input("X"); auto *feed_var = scope.FindVar(feed_var_name); diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 680fde19eefe57475b7526ebc29d4ff977a16977..d9cd956dfdff3d009d38ee5088f5396080580483 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -36,12 +36,6 @@ class FetchBarrierOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { std::vector eps = Attr>("endpoints"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc index 1640a2a22c69a0e3ab81a2889d6105b2cf4162b7..c197b45e8196a47def6465128e8ca39d8daefed6 100644 --- a/paddle/fluid/operators/fetch_op.cc +++ b/paddle/fluid/operators/fetch_op.cc @@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - auto fetch_var_name = Input("X"); auto *fetch_var = scope.FindVar(fetch_var_name); PADDLE_ENFORCE(fetch_var != nullptr, diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a..27e26cb1b5c1e831f05dac299489628b92eaa58c 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - platform::RecordEvent record_event(Type(), dev_ctx); - // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. auto filename = Attr("file_path"); diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 1ba684014904e61a86bebacd7d29d7e10d313092..4a6ce938a5f337d035b21f562d46daf606236db0 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -40,8 +40,6 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index d7f8e994afd7e656bd5a9dd7c5ab45f0d52fe88b..1866a86048acbefadcb4d82cd6309cd16f0352d6 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -39,11 +39,6 @@ class SendBarrierOp : public framework::OperatorBase { std::vector eps = Attr>("endpoints"); bool sync_mode = Attr("sync_mode"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 829f310d4233c01a7fbb9ccf7427f6e47ce8d384..3cd42f2d059532b7090e66ce21de8e5cb014adf1 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -42,9 +42,6 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index f4a0e2a86134096985ba169e3e28d9980f21c83c..7c8d8a5964fa5258bebaf2c8522886ae5886ab2c 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -110,6 +110,8 @@ Event::Event(EventType type, std::string name, uint32_t thread_id, has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false; if (has_cuda_) { auto* cuda_dev_ctx = static_cast(dev_ctx); + PADDLE_ENFORCE(cudaSetDevice( + boost::get(cuda_dev_ctx->GetPlace()).device)); PADDLE_ENFORCE(cudaGetDevice(&device_)); PADDLE_ENFORCE(cudaEventCreate(&event_)); auto stream = cuda_dev_ctx->stream(); @@ -176,6 +178,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; is_enabled_ = true; dev_ctx_ = dev_ctx; @@ -186,6 +189,7 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { @@ -198,6 +202,7 @@ RecordEvent::~RecordEvent() { RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; is_enabled_ = true; SetCurBlock(block_id); @@ -205,6 +210,7 @@ RecordBlock::RecordBlock(int block_id) } RecordBlock::~RecordBlock() { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) {