diff --git a/doc/optimization/gpu_profiling.rst b/doc/optimization/gpu_profiling.rst
index 013edb396e07d9919a77ab4dc953dfb7342d85cb..44ecb348858cc05d5ce394ce7443d434539b46d1 100644
--- a/doc/optimization/gpu_profiling.rst
+++ b/doc/optimization/gpu_profiling.rst
@@ -24,7 +24,7 @@ Why we need profiling?
 ======================
 Since training deep neural network typically take a very long time to get over, performance is gradually becoming
 the most important thing in deep learning field. The first step to improve performance is to understand what parts
-are slow. No point in improving performance of a region which doesn’t take much time!
+are slow. There is no point in improving performance of a region which doesn’t take much time!
 
 
 How to do profiling?
@@ -59,6 +59,7 @@ above profilers.
 The above code snippet includes two methods, you can use any of them to profile the regions of interest.
 
 1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels.
+
 2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid program crashes when CPU version of PaddlePaddle invokes them.
 
 
diff --git a/doc/optimization/nvprof.png b/doc/optimization/nvprof.png
deleted file mode 100644
index 5931a9b7dc43e6438c9c2105020f59eb3367f0d9..0000000000000000000000000000000000000000
Binary files a/doc/optimization/nvprof.png and /dev/null differ
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index ea1bd2481e7c13ae7792a36a4491e2cdde5dced6..c3542b7834224e2fa6fe323a1fbe8ea1e7cd68de 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -70,10 +70,14 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
   input->randomizeUniform();
   inputGpu->copyFrom(*input);
 
-  target->bilinearForward(*input, imgSizeH, imgSizeW,
-      2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
-  targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
-      2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+  {
+    // nvprof: GPU Profiler
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    target->bilinearForward(*input, imgSizeH, imgSizeW,
+        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+    targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
+        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+  }
 
   // check
   targetCheck->copyFrom(*targetGpu);
@@ -104,25 +108,29 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
   MatrixCheckErr(*inputGrad, *targetCheckGrad);
 }
 
-TEST(Profiler, BilinearFwdBwd) {
+TEST(Profiler, testBilinearFwdBwd) {
   auto numSamples = 10;
   auto channels = 16;
   auto imgSize = 64;
   {
     // nvprof: GPU Profiler
-    REGISTER_GPU_PROFILER("testBilinearFwdBwd",
-        "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
     // Paddle built-in timer
     REGISTER_TIMER_INFO("testBilinearFwdBwd",
         "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
     testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
   }
-  globalStat.printStatus("testBilinearFwdBwd");
+  globalStat.printAllStatus();
 }
 
 
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
+
+  // nvprof: GPU Profiler
+  REGISTER_GPU_PROFILER("RecursiveProfilingTest",
+      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+
   return RUN_ALL_TESTS();
 }
diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp
index 733fc7f1a7036aa3abc5ec922cfa480ec379680e..ab140c33502ad315d087bb3afc7f39bffc122894 100644
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
@@ -203,4 +203,22 @@ StatInfo::~StatInfo() {
   }
 }
 
+static unsigned g_profileCount = 0;
+static std::recursive_mutex g_profileMutex;
+
+GpuProfiler::GpuProfiler(std::string statName, std::string info)
+    : guard_(g_profileMutex) {
+  if (++g_profileCount == 1) {
+    LOG(INFO) << "Enable GPU Profiler Stat: ["
+              << statName << "] " << info;
+    hl_profiler_start();
+  }
+}
+
+GpuProfiler::~GpuProfiler() {
+  if (--g_profileCount == 0) {
+    hl_profiler_end();
+  }
+}
+
 }  // namespace paddle
diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h
index 8f65abb2d059f37bd38a65c22b00f9f0bef4a437..8bfe42a6948ff3ea04a1b8f494e9765ba4829609 100644
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
@@ -283,8 +283,10 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
 
 class GpuProfiler final {
 public:
-  GpuProfiler() { hl_profiler_start(); }
-  ~GpuProfiler() { hl_profiler_end(); }
+  GpuProfiler(std::string statName, std::string info);
+  ~GpuProfiler();
+private:
+  std::lock_guard<std::recursive_mutex> guard_;
 };
 
 #ifdef PADDLE_DISABLE_PROFILER
@@ -293,10 +295,8 @@ public:
 
 
 #else
 
-#define REGISTER_GPU_PROFILER(statName, ...) \
-  LOG(INFO) << "Enable GPU Profiler Stat: [" \
-            << statName << "] " << #__VA_ARGS__; \
-  GpuProfiler __gpuProfiler;
+#define REGISTER_GPU_PROFILER(statName, ...) \
+  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
 
 #endif // DISABLE_PROFILER
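
Note on the reference counting added in Stat.cpp/Stat.h: only the outermost GpuProfiler instance actually starts and stops the profiler, which is why the REGISTER_GPU_PROFILER in main() can safely enclose the one inside the TEST body. The code below is a minimal, self-contained sketch of that pattern, not the library code itself; beginProfiling()/endProfiling() are hypothetical stand-ins for hl_profiler_start()/hl_profiler_end(), which in PaddlePaddle ultimately wrap cudaProfilerStart()/cudaProfilerStop().

#include <iostream>
#include <mutex>
#include <string>

// Hypothetical stand-ins for hl_profiler_start()/hl_profiler_end().
static void beginProfiling() { std::cout << "profiler started\n"; }
static void endProfiling() { std::cout << "profiler stopped\n"; }

static unsigned g_profileCount = 0;
static std::recursive_mutex g_profileMutex;

// RAII guard mirroring the patched GpuProfiler: only the outermost instance
// on a thread starts/stops the profiler; nested registrations only bump the
// reference count.
class GpuProfiler final {
public:
  GpuProfiler(std::string statName, std::string info) : guard_(g_profileMutex) {
    if (++g_profileCount == 1) {
      std::cout << "Enable GPU Profiler Stat: [" << statName << "] " << info
                << "\n";
      beginProfiling();
    }
  }
  ~GpuProfiler() {
    // Runs before guard_ is destroyed, so the decrement is still under the lock.
    if (--g_profileCount == 0) {
      endProfiling();
    }
  }

private:
  // Held for the guard's whole lifetime; recursive so the same thread can nest.
  std::lock_guard<std::recursive_mutex> guard_;
};

#define REGISTER_GPU_PROFILER(statName, ...) \
  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__)

int main() {
  REGISTER_GPU_PROFILER("RecursiveProfilingTest", "whole run");  // starts profiling
  {
    REGISTER_GPU_PROFILER("testBilinearFwdBwd", "one region");   // count 1 -> 2, no-op
  }                                                              // count 2 -> 1, no-op
  return 0;
}  // outermost guard destroyed: count 1 -> 0, profiling stops

A std::recursive_mutex is needed because each guard holds the lock for its entire lifetime; with a plain std::mutex the nested registration from the same thread would deadlock.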