diff --git a/doc/optimization/gpu_profiling.rst b/doc/optimization/gpu_profiling.rst
index 013edb396e07d9919a77ab4dc953dfb7342d85cb..44ecb348858cc05d5ce394ce7443d434539b46d1 100644
--- a/doc/optimization/gpu_profiling.rst
+++ b/doc/optimization/gpu_profiling.rst
@@ -24,7 +24,7 @@ Why we need profiling?
 ======================
 Since training deep neural network typically take a very long time to get over, performance is gradually becoming
 the most important thing in deep learning field. The first step to improve performance is to understand what parts
-are slow. No point in improving performance of a region which doesn’t take much time!
+are slow. There is no point in improving performance of a region which doesn’t take much time!
 
 
 How to do profiling?
@@ -59,6 +59,7 @@ above profilers.
 The above code snippet includes two methods, you can use any of them to profile the regions of interest.
 
 1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels.
+
 2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid program crashes when CPU version of PaddlePaddle invokes them.
 
 
diff --git a/doc/optimization/nvprof.png b/doc/optimization/nvprof.png
deleted file mode 100644
index 5931a9b7dc43e6438c9c2105020f59eb3367f0d9..0000000000000000000000000000000000000000
Binary files a/doc/optimization/nvprof.png and /dev/null differ
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index ea1bd2481e7c13ae7792a36a4491e2cdde5dced6..c3542b7834224e2fa6fe323a1fbe8ea1e7cd68de 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -70,10 +70,14 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
   input->randomizeUniform();
   inputGpu->copyFrom(*input);
 
-  target->bilinearForward(*input, imgSizeH, imgSizeW,
-      2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
-  targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
-      2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+  {
+    // nvprof: GPU Profiler
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    target->bilinearForward(*input, imgSizeH, imgSizeW,
+        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+    targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
+        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+  }
 
   // check
   targetCheck->copyFrom(*targetGpu);
@@ -104,25 +108,29 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
   MatrixCheckErr(*inputGrad, *targetCheckGrad);
 }
 
-TEST(Profiler, BilinearFwdBwd) {
+TEST(Profiler, testBilinearFwdBwd) {
   auto numSamples = 10;
   auto channels = 16;
   auto imgSize = 64;
   {
     // nvprof: GPU Profiler
-    REGISTER_GPU_PROFILER("testBilinearFwdBwd",
-        "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
     // Paddle built-in timer
     REGISTER_TIMER_INFO("testBilinearFwdBwd",
         "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
     testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
   }
-  globalStat.printStatus("testBilinearFwdBwd");
+  globalStat.printAllStatus();
 }
 
 
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
+
+  // nvprof: GPU Profiler
+  REGISTER_GPU_PROFILER("RecursiveProfilingTest",
+      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+
   return RUN_ALL_TESTS();
 }
diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp
index 733fc7f1a7036aa3abc5ec922cfa480ec379680e..ab140c33502ad315d087bb3afc7f39bffc122894 100644
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
@@ -203,4 +203,22 @@ StatInfo::~StatInfo() {
   }
 }
 
+static unsigned g_profileCount = 0;
+static std::recursive_mutex g_profileMutex;
+
+GpuProfiler::GpuProfiler(std::string statName, std::string info)
+    : guard_(g_profileMutex) {
+  if (++g_profileCount == 1) {
+    LOG(INFO) << "Enable GPU Profiler Stat: ["
+              << statName << "] " << info;
+    hl_profiler_start();
+  }
+}
+
+GpuProfiler::~GpuProfiler() {
+  if (--g_profileCount == 0) {
+    hl_profiler_end();
+  }
+}
+
 }  // namespace paddle
diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h
index 8f65abb2d059f37bd38a65c22b00f9f0bef4a437..8bfe42a6948ff3ea04a1b8f494e9765ba4829609 100644
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
@@ -283,8 +283,10 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
 
 class GpuProfiler final {
 public:
-  GpuProfiler() { hl_profiler_start(); }
-  ~GpuProfiler() { hl_profiler_end(); }
+  GpuProfiler(std::string statName, std::string info);
+  ~GpuProfiler();
+private:
+  std::lock_guard<std::recursive_mutex> guard_;
 };
 
 #ifdef PADDLE_DISABLE_PROFILER
@@ -293,10 +295,8 @@ public:
 
 
 #else
 
-#define REGISTER_GPU_PROFILER(statName, ...) \
-  LOG(INFO) << "Enable GPU Profiler Stat: [" \
-            << statName << "] " << #__VA_ARGS__; \
-  GpuProfiler __gpuProfiler;
+#define REGISTER_GPU_PROFILER(statName, ...) \
+  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
 
 #endif // DISABLE_PROFILER
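
Note on the reference counting added in Stat.cpp/Stat.h: only the outermost GpuProfiler instance actually starts and stops the profiler, which is why the REGISTER_GPU_PROFILER in main() can safely enclose the one inside the TEST body. The code below is a minimal, self-contained sketch of that pattern, not the library code itself; beginProfiling()/endProfiling() are hypothetical stand-ins for hl_profiler_start()/hl_profiler_end(), which in PaddlePaddle ultimately wrap cudaProfilerStart()/cudaProfilerStop().

#include <iostream>
#include <mutex>
#include <string>

// Hypothetical stand-ins for hl_profiler_start()/hl_profiler_end().
static void beginProfiling() { std::cout << "profiler started\n"; }
static void endProfiling() { std::cout << "profiler stopped\n"; }

static unsigned g_profileCount = 0;
static std::recursive_mutex g_profileMutex;

// RAII guard mirroring the patched GpuProfiler: only the outermost instance
// on a thread starts/stops the profiler; nested registrations only bump the
// reference count.
class GpuProfiler final {
public:
  GpuProfiler(std::string statName, std::string info) : guard_(g_profileMutex) {
    if (++g_profileCount == 1) {
      std::cout << "Enable GPU Profiler Stat: [" << statName << "] " << info
                << "\n";
      beginProfiling();
    }
  }
  ~GpuProfiler() {
    // Runs before guard_ is destroyed, so the decrement is still under the lock.
    if (--g_profileCount == 0) {
      endProfiling();
    }
  }

private:
  // Held for the guard's whole lifetime; recursive so the same thread can nest.
  std::lock_guard<std::recursive_mutex> guard_;
};

#define REGISTER_GPU_PROFILER(statName, ...) \
  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__)

int main() {
  REGISTER_GPU_PROFILER("RecursiveProfilingTest", "whole run");  // starts profiling
  {
    REGISTER_GPU_PROFILER("testBilinearFwdBwd", "one region");   // count 1 -> 2, no-op
  }                                                              // count 2 -> 1, no-op
  return 0;
}  // outermost guard destroyed: count 1 -> 0, profiling stops

A std::recursive_mutex is needed because each guard holds the lock for its entire lifetime; with a plain std::mutex the nested registration from the same thread would deadlock.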