Commit 8393c19c authored by liaogang

Add recursive mutex and counter for gpu profiler

Parent 9670b9a1
......@@ -24,7 +24,7 @@ Why we need profiling?
======================
Since training a deep neural network typically takes a very long time to finish, performance is gradually becoming
the most important concern in the deep learning field. The first step to improving performance is to understand what parts
are slow. No point in improving performance of a region which doesn’t take much time!
are slow. There is no point in improving performance of a region which doesn’t take much time!
How to do profiling?
......@@ -59,6 +59,7 @@ above profilers.
The above code snippet includes two methods; you can use either of them to profile the regions of interest (a minimal usage sketch follows the list).
1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both CPU functions and CUDA kernels.
2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
   program crashes when the CPU version of PaddlePaddle invokes them.
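For orientation, here is a minimal usage sketch (not part of this commit) combining the two macros inside one scope; `runGpuWorkload()` and the header path are assumptions for illustration, while the macro names and `globalStat` come from the changes below:

```cpp
// Minimal sketch, assuming the macros live in paddle/utils/Stat.h;
// runGpuWorkload() is a hypothetical placeholder for the measured region.
#include "paddle/utils/Stat.h"

void runGpuWorkload();  // hypothetical workload to be profiled

void profileWorkload() {
  {
    // nvprof: GPU Profiler -- wraps cudaProfilerStart/cudaProfilerStop.
    REGISTER_GPU_PROFILER("runGpuWorkload");
    // Paddle built-in timer -- records the wall time of this scope.
    REGISTER_TIMER_INFO("runGpuWorkload", "batchSize = 128");
    runGpuWorkload();
  }
  globalStat.printAllStatus();  // dump the collected timer statistics
}
```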
......
......@@ -70,10 +70,14 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
input->randomizeUniform();
inputGpu->copyFrom(*input);
target->bilinearForward(*input, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
{
// nvprof: GPU Profiler
REGISTER_GPU_PROFILER("testBilinearFwdBwd");
target->bilinearForward(*input, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
}
// check
targetCheck->copyFrom(*targetGpu);
......@@ -104,25 +108,29 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
MatrixCheckErr(*inputGrad, *targetCheckGrad);
}
TEST(Profiler, BilinearFwdBwd) {
TEST(Profiler, testBilinearFwdBwd) {
auto numSamples = 10;
auto channels = 16;
auto imgSize = 64;
{
// nvprof: GPU Profiler
REGISTER_GPU_PROFILER("testBilinearFwdBwd",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
REGISTER_GPU_PROFILER("testBilinearFwdBwd");
// Paddle built-in timer
REGISTER_TIMER_INFO("testBilinearFwdBwd",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
}
globalStat.printStatus("testBilinearFwdBwd");
globalStat.printAllStatus();
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
// nvprof: GPU Profiler
REGISTER_GPU_PROFILER("RecursiveProfilingTest",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
return RUN_ALL_TESTS();
}
......
......@@ -203,4 +203,22 @@ StatInfo::~StatInfo() {
}
}
static unsigned g_profileCount = 0;
static std::recursive_mutex g_profileMutex;
GpuProfiler::GpuProfiler(std::string statName, std::string info)
    : guard_(g_profileMutex) {
  if (++g_profileCount == 1) {
    LOG(INFO) << "Enable GPU Profiler Stat: ["
              << statName << "] " << info;
    hl_profiler_start();
  }
}

GpuProfiler::~GpuProfiler() {
  if (--g_profileCount == 0) {
    hl_profiler_end();
  }
}
} // namespace paddle
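The key change: the CUDA profiler is now started and stopped by reference count, so only the outermost `GpuProfiler` instance calls `hl_profiler_start()`/`hl_profiler_end()`, and the `std::recursive_mutex` lets nested instances on the same thread (for example, the `REGISTER_GPU_PROFILER` in `main()` plus the one inside a test) acquire the lock again without deadlocking. A self-contained sketch of the same pattern, with the `hl_profiler_*` calls stubbed so it compiles on its own:

```cpp
#include <iostream>
#include <mutex>

// Stubs standing in for hl_profiler_start()/hl_profiler_end().
static void profilerStart() { std::cout << "cudaProfilerStart()\n"; }
static void profilerStop()  { std::cout << "cudaProfilerStop()\n"; }

static unsigned g_count = 0;
static std::recursive_mutex g_mutex;

class ScopedGpuProfiler {
public:
  ScopedGpuProfiler() : guard_(g_mutex) {
    if (++g_count == 1) profilerStart();  // outermost scope only
  }
  ~ScopedGpuProfiler() {
    if (--g_count == 0) profilerStop();   // outermost scope only
  }

private:
  std::lock_guard<std::recursive_mutex> guard_;  // released in the destructor
};

int main() {
  ScopedGpuProfiler outer;    // prints "cudaProfilerStart()"
  {
    ScopedGpuProfiler inner;  // nested on the same thread: re-locks, no second start
  }
}                             // outer goes out of scope: prints "cudaProfilerStop()"
```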
......@@ -283,8 +283,10 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
class GpuProfiler final {
public:
  GpuProfiler() { hl_profiler_start(); }
  ~GpuProfiler() { hl_profiler_end(); }
  GpuProfiler(std::string statName, std::string info);
  ~GpuProfiler();

private:
  std::lock_guard<std::recursive_mutex> guard_;
};
#ifdef PADDLE_DISABLE_PROFILER
......@@ -293,10 +295,8 @@ public:
#else
#define REGISTER_GPU_PROFILER(statName, ...) \
LOG(INFO) << "Enable GPU Profiler Stat: [" \
<< statName << "] " << #__VA_ARGS__; \
GpuProfiler __gpuProfiler;
#define REGISTER_GPU_PROFILER(statName, ...) \
GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
#endif // DISABLE_PROFILER
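For reference (not part of the diff), the new macro stringifies its variadic arguments via `#__VA_ARGS__` and forwards them as the `info` string, so a call such as `REGISTER_GPU_PROFILER("testBilinearFwdBwd", "numSamples = 10")` expands roughly to:

```cpp
// Illustrative expansion: the quotes of the original string literal are
// escaped by the # stringification operator.
GpuProfiler __gpuProfiler("testBilinearFwdBwd", "\"numSamples = 10\"");
```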
......