From 90773473a06b3762376a51b5878abb8a626ba78c Mon Sep 17 00:00:00 2001
From: wanghuancoder <wanghuancoder@163.com>
Date: Wed, 20 Jan 2021 18:29:40 +0800
Subject: [PATCH] use nvtx push pop in timeline (#30567)

* delete empty line of pybing.cc, test=develop

* use nvtx push pop in timeline, test=develop

* change year, test=develop

* add #ifdef PADDLE_WITH_CUDA, test=develop

* add #ifndef WIN32, test=develop

* is_pushed to is_pushed_, test=develop
---
 paddle/fluid/platform/CMakeLists.txt          |  2 +-
 paddle/fluid/platform/cuda_profiler.h         |  9 ++++
 paddle/fluid/platform/dynload/CMakeLists.txt  |  2 +-
 .../fluid/platform/dynload/dynamic_loader.cc  | 13 +++++
 .../fluid/platform/dynload/dynamic_loader.h   |  1 +
 paddle/fluid/platform/dynload/nvtx.cc         | 31 +++++++++++
 paddle/fluid/platform/dynload/nvtx.h          | 53 +++++++++++++++++++
 paddle/fluid/platform/profiler.cc             | 25 +++++++++
 paddle/fluid/platform/profiler.h              |  4 ++
 paddle/fluid/platform/profiler_helper.h       |  2 +
 paddle/fluid/pybind/CMakeLists.txt            |  4 ++
 paddle/fluid/pybind/pybind.cc                 |  4 ++
 12 files changed, 148 insertions(+), 2 deletions(-)
 create mode 100644 paddle/fluid/platform/dynload/nvtx.cc
 create mode 100644 paddle/fluid/platform/dynload/nvtx.h
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index f2a8309f00c..73add8ea06f 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -124,7 +124,7 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri
 
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 if(WITH_GPU)
-  nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
+  nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
   nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
   nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 else()
diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
index a9382f2c8ad..6edc141205a 100644
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
@@ -17,6 +17,7 @@ limitations under the License. */
 
 #include <string>
 
+#include "paddle/fluid/platform/dynload/nvtx.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -38,5 +39,13 @@ void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); }
 
 void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); }
 
+#ifndef _WIN32
+void CudaNvtxRangePush(std::string name) {
+  dynload::nvtxRangePushA(name.c_str());
+}
+
+void CudaNvtxRangePop() { dynload::nvtxRangePop(); }
+#endif
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 647bff93122..725b7fcf9dd 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,6 +1,6 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 
-list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc)
+list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc)
 #hip
 if (WITH_ROCM_PLATFORM)
     list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 4c39a35030b..e7130544689 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -416,6 +416,19 @@ void* GetOpDsoHandle(const std::string& dso_name) {
 #endif
 }
 
+void* GetNvtxDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Apple."));
+#elif defined(_WIN32)
+  PADDLE_THROW(platform::errors::Unimplemented("Nvtx do not support Windows."));
+#elif !defined(PADDLE_WITH_CUDA)
+  PADDLE_THROW(
+      platform::errors::Unimplemented("Nvtx do not support without CUDA."));
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so");
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 1136184ce1f..c3f5953c785 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -37,6 +37,7 @@ void* GetNCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
 void* GetOpDsoHandle(const std::string& dso_name);
+void* GetNvtxDsoHandle();
 
 void SetPaddleLibPath(const std::string&);
 }  // namespace dynload
diff --git a/paddle/fluid/platform/dynload/nvtx.cc b/paddle/fluid/platform/dynload/nvtx.cc
new file mode 100644
index 00000000000..372f8500e54
--- /dev/null
+++ b/paddle/fluid/platform/dynload/nvtx.cc
@@ -0,0 +1,31 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef _WIN32
+#include "paddle/fluid/platform/dynload/nvtx.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag nvtx_dso_flag;
+void *nvtx_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+NVTX_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h
new file mode 100644
index 00000000000..b696bbf9181
--- /dev/null
+++ b/paddle/fluid/platform/dynload/nvtx.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#ifndef _WIN32
+#include <cuda.h>
+#include <nvToolsExt.h>
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+extern std::once_flag nvtx_dso_flag;
+extern void *nvtx_dso_handle;
+
+#define DECLARE_DYNAMIC_LOAD_NVTX_WRAP(__name)                           \
+  struct DynLoad__##__name {                                             \
+    template <typename... Args>                                          \
+    int operator()(Args... args) {                                       \
+      using nvtxFunc = decltype(&::__name);                              \
+      std::call_once(nvtx_dso_flag, []() {                               \
+        nvtx_dso_handle = paddle::platform::dynload::GetNvtxDsoHandle(); \
+      });                                                                \
+      static void *p_##__name = dlsym(nvtx_dso_handle, #__name);         \
+      return reinterpret_cast<nvtxFunc>(p_##__name)(args...);            \
+    }                                                                    \
+  };                                                                     \
+  extern DynLoad__##__name __name
+
+#define NVTX_ROUTINE_EACH(__macro) \
+  __macro(nvtxRangePushA);         \
+  __macro(nvtxRangePop);
+
+NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_NVTX_WRAP
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 56a6275b582..c8e8e68dcda 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -21,6 +21,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/platform/profiler_helper.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/dynload/nvtx.h"
+#endif
 
 DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
 
@@ -51,6 +54,14 @@ double Event::CudaElapsedMs(const Event &e) const {
 }
 
 RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
+#ifndef _WIN32
+#ifdef PADDLE_WITH_CUDA
+  if (g_enable_nvprof_hook) {
+    dynload::nvtxRangePushA(name.c_str());
+    is_pushed_ = true;
+  }
+#endif
+#endif
   if (g_state == ProfilerState::kDisabled || name.empty()) return;
 
   // do some initialization
@@ -65,6 +76,13 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
 }
 
 RecordEvent::~RecordEvent() {
+#ifndef _WIN32
+#ifdef PADDLE_WITH_CUDA
+  if (g_enable_nvprof_hook && is_pushed_) {
+    dynload::nvtxRangePop();
+  }
+#endif
+#endif
   if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   // lock is not needed, the code below is thread-safe
   DeviceTracer *tracer = GetDeviceTracer();
@@ -299,5 +317,12 @@ void SetProfileListener() {
 
 int64_t ListenerId() { return profiler_lister_id; }
 
+void NvprofEnableRecordEvent() {
+  SynchronizeAllDevice();
+  g_enable_nvprof_hook = true;
+}
+
+void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; }
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 0185328ff32..66a102a3d58 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -131,6 +131,7 @@ struct RecordEvent {
   ~RecordEvent();
 
   bool is_enabled_{false};
+  bool is_pushed_{false};
   uint64_t start_ns_;
   // Event name
   std::string name_;
@@ -227,5 +228,8 @@ void DummyKernelAndEvent();
 void SetProfileListener();
 int64_t ListenerId();
 
+void NvprofEnableRecordEvent();
+void NvprofDisableRecordEvent();
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index 96296861322..66595aa651a 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -42,6 +42,8 @@ std::mutex profiler_mu;
 static TracerOption g_tracer_option = TracerOption::kDefault;
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
+// To hook RecordEvent's events, use it to nvtx timeline
+static bool g_enable_nvprof_hook = false;
 // The thread local event list only can be accessed by the specific thread
 // The thread index of each thread
 static thread_local int32_t g_thread_id;
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 0f52d7344c8..39e83ab12d5 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -3,6 +3,10 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
   gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper)
 
+if (WITH_GPU)
+  set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
+endif()
+
 if (WITH_NCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 72b3c9645ba..03a21b29921 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1951,6 +1951,10 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("nvprof_init", platform::CudaProfilerInit);
   m.def("nvprof_start", platform::CudaProfilerStart);
   m.def("nvprof_stop", platform::CudaProfilerStop);
+  m.def("nvprof_nvtx_push", platform::CudaNvtxRangePush);
+  m.def("nvprof_nvtx_pop", platform::CudaNvtxRangePop);
+  m.def("nvprof_enable_record_event", platform::NvprofEnableRecordEvent);
+  m.def("nvprof_disable_record_event", platform::NvprofDisableRecordEvent);
 #endif
 #endif
 
-- 
GitLab