From de6e7431a2f31244307ac97e0894996141e22f98 Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Wed, 19 Oct 2022 15:15:43 +0800
Subject: [PATCH] add nvtxRangePush/Pop for naive_executor and refine some
 code (#47139)

---
 CMakeLists.txt                                |  1 +
 cmake/inference_lib.cmake                     |  4 ++++
 paddle/fluid/framework/naive_executor.cc      | 19 +++++++++++++++++--
 paddle/fluid/inference/CMakeLists.txt         |  4 ++++
 paddle/fluid/inference/api/analysis_config.cc |  2 +-
 .../fluid/inference/api/analysis_predictor.cc | 12 ++++++------
 .../fluid/inference/api/analysis_predictor.h  |  2 +-
 .../platform/device/gpu/cuda/cuda_profiler.cc | 18 +++++++++++++-----
 .../platform/device/gpu/cuda/cuda_profiler.h  | 18 ++++++++++++++----
 paddle/fluid/platform/dynload/nvtx.h          | 12 ++++--------
 paddle/phi/backends/dynload/nvtx.h            |  1 +
 11 files changed, 66 insertions(+), 27 deletions(-)
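
A minimal caller-side sketch of the NVTX helpers this patch extends (a note
for reviewers, not part of the diff: MyTimedRegion() is a hypothetical
function, and this assumes a non-Windows CUDA build configured with
-DWITH_INFERENCE_NVTX=ON, the flag added below, which also makes
NaiveExecutor::Run() emit ranges):

    #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"

    // Brackets a region of work with a named, colored NVTX range so it
    // shows up as a labeled span in an Nsight Systems timeline.
    void MyTimedRegion() {
      paddle::platform::CudaNvtxRangePush(
          "my_region", paddle::platform::NvtxRangeColor::Red);
      // ... work to attribute to this range ...
      paddle::platform::CudaNvtxRangePop();  // must balance the push
    }

The color argument defaults to NvtxRangeColor::Blue, and ranges nest: that
is how Run() wraps a Yellow "model" range around one Green range per op.
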
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a4c8a52486..3cb05a98da 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -268,6 +268,7 @@ option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
 option(WITH_HETERPS "Compile with heterps" OFF)
 option(WITH_INFERENCE_API_TEST
        "Test fluid inference C++ high-level api interface" OFF)
+option(WITH_INFERENCE_NVTX "Paddle inference with nvtx for profiler" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
 option(
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 2fc1be2545..6ab33109c9 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -356,6 +356,10 @@ else()
   )
 endif()

+if(WITH_INFERENCE_NVTX AND NOT WIN32)
+  add_definitions(-DPADDLE_WITH_INFERENCE_NVTX)
+endif()
+
 copy(
   inference_lib_dist
   SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib}
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 651ac23e52..ee04cef32e 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -22,9 +22,12 @@
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 #include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
 #endif
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
+#endif

 namespace paddle {
 namespace framework {
@@ -48,12 +51,24 @@ void NaiveExecutor::Run() {
   platform::RegisterModelLayout(ops_, place_);
 #endif
   platform::ScopedFlushDenormal flush;
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+  platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow);
+#endif
   for (auto &op : ops_) {
     VLOG(4) << std::this_thread::get_id() << " run "
             << op->DebugStringEx(scope_) << " on scope " << scope_;
     op->SetIsCalledByExecutor(false);
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+    platform::CudaNvtxRangePush(op->Type(), platform::NvtxRangeColor::Green);
+#endif
     op->Run(*scope_, place_);
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+    platform::CudaNvtxRangePop();
+#endif
   }
+#ifdef PADDLE_WITH_INFERENCE_NVTX
+  platform::CudaNvtxRangePop();
+#endif
 }

 void NaiveExecutor::CreateVariables(const ProgramDesc &desc,
@@ -146,7 +161,7 @@ NaiveExecutor::~NaiveExecutor() {
 }

 void NaiveExecutor::ResetTrtOps(int num) {
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   for (auto &op : ops_) {
     if (op->Type() == "tensorrt_engine") {
       operators::TensorRTEngineOp *trtop =
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 0b20dbfde7..7e14a3b264 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -108,6 +108,10 @@ if(WITH_PSCORE)
       tensor_table)
 endif()

+if(WITH_INFERENCE_NVTX AND NOT WIN32)
+  set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} cuda_profiler)
+endif()
+
 if(WITH_ONNXRUNTIME)
   set(SHARED_INFERENCE_SRCS
       ${SHARED_INFERENCE_SRCS}
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 68864ad152..2af92c7e14 100755
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -655,7 +655,7 @@ void AnalysisConfig::EnableTensorRtEngine(
   }

   use_tensorrt_ = true;
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
   // when trt version less than 7.2,
   // createExecutionContextWithoutDeviceMemory() has bug.
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index f4107975cb..79527c97d4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -79,7 +79,7 @@
 #include "paddle/fluid/inference/api/onnxruntime_predictor.h"
 #endif

-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
@@ -92,7 +92,7 @@
 namespace paddle {

 using inference::Singleton;
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 using inference::tensorrt::TRTCalibratorEngine;
 using inference::tensorrt::TRTCalibratorEngineManager;
 using inference::tensorrt::TRTInt8Calibrator;
@@ -1271,7 +1271,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
       [](framework::ProgramDesc *prog) {
 // Note, please do NOT use any member variables, because member variables may
 // have been destructed in multiple threads.
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
         auto &block = prog->Block(0);
         for (auto &op_desc : block.AllOps()) {
           if (op_desc->Type() == "tensorrt_engine") {
@@ -1977,7 +1977,7 @@ void AnalysisPredictor::ClearIntermediateTensor() {
   }
 }

-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 bool AnalysisPredictor::SaveTrtCalibToDisk() {
   PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(),
                     true,
@@ -2033,7 +2033,7 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() {
 #endif

 AnalysisPredictor::~AnalysisPredictor() {
-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   if (config_.tensorrt_engine_enabled() &&
       config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 &&
       Singleton<TRTCalibratorEngineManager>::Global().Has()) {
@@ -2157,7 +2157,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(

 }  // namespace paddle

-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
 USE_TRT_CONVERTER(elementwise_add_weight);
 USE_TRT_CONVERTER(elementwise_sub_weight);
 USE_TRT_CONVERTER(elementwise_mul_weight);
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index ff34bac545..bb196e2ef5 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -382,7 +382,7 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   void MkldnnPostReset();

-#if PADDLE_WITH_TENSORRT
+#ifdef PADDLE_WITH_TENSORRT
   ///
   /// \brief save calibration table
   ///
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc
index 291dd6c7ce..cebb36cbc6 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc
@@ -17,9 +17,9 @@
 namespace paddle {
 namespace platform {

-void CudaProfilerInit(std::string output_file,
-                      std::string output_mode,
-                      std::string config_file) {
+void CudaProfilerInit(const std::string& output_file,
+                      const std::string& output_mode,
+                      const std::string& config_file) {
   PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv",
                  platform::errors::InvalidArgument(
                      "Unsupported cuda profiler output mode, expect `kvp` or "
@@ -35,8 +35,16 @@ void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); }
 void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); }

 #ifndef _WIN32
-void CudaNvtxRangePush(std::string name) {
-  dynload::nvtxRangePushA(name.c_str());
+void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) {
+  nvtxEventAttributes_t eventAttrib;
+  eventAttrib.version = NVTX_VERSION;
+  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  eventAttrib.colorType = NVTX_COLOR_ARGB;
+  eventAttrib.color = static_cast<uint32_t>(color);
+  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  eventAttrib.message.ascii = name.c_str();
+
+  dynload::nvtxRangePushEx(&eventAttrib);
 }

 void CudaNvtxRangePop() { dynload::nvtxRangePop(); }
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h
index 6c7cf0fd8d..193e08bdde 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h
@@ -23,16 +23,26 @@ limitations under the License. */
 namespace paddle {
 namespace platform {

-void CudaProfilerInit(std::string output_file,
-                      std::string output_mode,
-                      std::string config_file);
+void CudaProfilerInit(const std::string& output_file,
+                      const std::string& output_mode,
+                      const std::string& config_file);

 void CudaProfilerStart();

 void CudaProfilerStop();

 #ifndef _WIN32
-void CudaNvtxRangePush(std::string name);
+enum class NvtxRangeColor : uint32_t {
+  Black = 0x00000000,
+  Red = 0x00ff0000,
+  Green = 0x0000ff00,
+  Blue = 0x000000ff,
+  White = 0x00ffffff,
+  Yellow = 0x00ffff00,
+};
+
+void CudaNvtxRangePush(const std::string& name,
+                       const NvtxRangeColor color = NvtxRangeColor::Blue);
 void CudaNvtxRangePop();

 #endif
diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h
index c3dc9e31df..e5816e240e 100644
--- a/paddle/fluid/platform/dynload/nvtx.h
+++ b/paddle/fluid/platform/dynload/nvtx.h
@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #ifndef _WIN32
-#include <cuda.h>
-#include <nvToolsExt.h>
-
-#include <mutex>  // NOLINT
-
 #include "paddle/phi/backends/dynload/nvtx.h"

 namespace paddle {
@@ -28,11 +23,12 @@ namespace dynload {
   using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
   extern DynLoad__##__name __name

-#define NVTX_ROUTINE_EACH(__macro) \
-  __macro(nvtxRangePushA);         \
+#define PLATFORM_NVTX_ROUTINE_EACH(__macro) \
+  __macro(nvtxRangePushA);                  \
+  __macro(nvtxRangePushEx);                 \
   __macro(nvtxRangePop);

-NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
+PLATFORM_NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP);

 #undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP
 }  // namespace dynload
diff --git a/paddle/phi/backends/dynload/nvtx.h b/paddle/phi/backends/dynload/nvtx.h
index a9a166b289..e51bbf2154 100644
--- a/paddle/phi/backends/dynload/nvtx.h
+++ b/paddle/phi/backends/dynload/nvtx.h
@@ -42,6 +42,7 @@ extern void *nvtx_dso_handle;

 #define NVTX_ROUTINE_EACH(__macro) \
   __macro(nvtxRangePushA);         \
+  __macro(nvtxRangePushEx);        \
   __macro(nvtxRangePop);

 NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP);
--
GitLab