Merge pull request #12541 from jacquesqiao/optimize-profiler

optimize profiler

Merge pull request #12541 from jacquesqiao/optimize-profiler
optimize profiler
88a607c3 · Qiao Longfei · GitHub · 0fd2f713 · 954d680b · 88a607c3
9 changed files
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -136,6 +136,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
    platform::SetDeviceId(dev_id);
 #endif
  }
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  platform::RecordEvent record_event(Type(), pool.Get(place));
  RunImpl(scope, place);
  VLOG(10) << "+ " << DebugStringEx(&scope);
 }
@@ -639,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);
-  // For profiling, don't move out of this function because that will result
-  // in the failure of multi-GPU profiling.
-  platform::RecordEvent record_event(Type(), dev_ctx);
  // check if op[type] has kernel registered.
  auto& all_op_kernels = AllOpKernels();
  auto kernels_iter = all_op_kernels.find(type_);

--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase {
               const platform::Place &place) const override {
    // get device context from pool
    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-    platform::RecordEvent record_event(Type(), dev_ctx);
    auto feed_var_name = Input("X");
    auto *feed_var = scope.FindVar(feed_var_name);

--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -36,12 +36,6 @@ class FetchBarrierOp : public framework::OperatorBase {
  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override {
    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();

--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    platform::RecordEvent record_event(Type(), pool.Get(place));
    auto fetch_var_name = Input("X");
    auto *fetch_var = scope.FindVar(fetch_var_name);
    PADDLE_ENFORCE(fetch_var != nullptr,

--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
-    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-    platform::RecordEvent record_event(Type(), dev_ctx);
    // FIXME(yuyang18): We save variable to local file now, but we should change
    // it to save an output stream.
    auto filename = Attr<std::string>("file_path");

--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -40,8 +40,6 @@ class RecvOp : public framework::OperatorBase {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();

--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -39,11 +39,6 @@ class SendBarrierOp : public framework::OperatorBase {
    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
    bool sync_mode = Attr<bool>("sync_mode");
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();

--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -42,9 +42,6 @@ class SendOp : public framework::OperatorBase {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& ctx = *pool.Get(place);
-    // For profiling
-    platform::RecordEvent record_event(Type(), &ctx);
    distributed::RPCClient* rpc_client =
        distributed::RPCClient::GetInstance<RPCCLIENT_T>();

--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -110,6 +110,8 @@ Event::Event(EventType type, std::string name, uint32_t thread_id,
  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
  if (has_cuda_) {
    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+    PADDLE_ENFORCE(cudaSetDevice(
+        boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
    PADDLE_ENFORCE(cudaGetDevice(&device_));
    PADDLE_ENFORCE(cudaEventCreate(&event_));
    auto stream = cuda_dev_ctx->stream();
@@ -176,6 +178,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
 RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
+  std::lock_guard<std::mutex> l(profiler_mu);
  if (g_state == ProfilerState::kDisabled) return;
  is_enabled_ = true;
  dev_ctx_ = dev_ctx;
@@ -186,6 +189,7 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
 }
 RecordEvent::~RecordEvent() {
+  std::lock_guard<std::mutex> l(profiler_mu);
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
@@ -198,6 +202,7 @@ RecordEvent::~RecordEvent() {
 RecordBlock::RecordBlock(int block_id)
    : is_enabled_(false), start_ns_(PosixInNsec()) {
+  std::lock_guard<std::mutex> l(profiler_mu);
  if (g_state == ProfilerState::kDisabled) return;
  is_enabled_ = true;
  SetCurBlock(block_id);
@@ -205,6 +210,7 @@ RecordBlock::RecordBlock(int block_id)
 }
 RecordBlock::~RecordBlock() {
+  std::lock_guard<std::mutex> l(profiler_mu);
  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {