提交 8393c19c 编写于 作者: L liaogang

Add recursive mutex and counter for gpu profiler

上级 9670b9a1
...@@ -24,7 +24,7 @@ Why we need profiling? ...@@ -24,7 +24,7 @@ Why we need profiling?
====================== ======================
Since training a deep neural network typically takes a very long time to finish, performance is gradually becoming Since training a deep neural network typically takes a very long time to finish, performance is gradually becoming
the most important thing in deep learning field. The first step to improve performance is to understand what parts the most important thing in deep learning field. The first step to improve performance is to understand what parts
are slow. No point in improving performance of a region which doesn’t take much time! are slow. There is no point in improving performance of a region which doesn’t take much time!
How to do profiling? How to do profiling?
...@@ -59,6 +59,7 @@ above profilers. ...@@ -59,6 +59,7 @@ above profilers.
The above code snippet includes two methods, you can use any of them to profile the regions of interest. The above code snippet includes two methods, you can use any of them to profile the regions of interest.
1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels. 1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels.
2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid 2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
program crashes when CPU version of PaddlePaddle invokes them. program crashes when CPU version of PaddlePaddle invokes them.
......
...@@ -70,10 +70,14 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW, ...@@ -70,10 +70,14 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
input->randomizeUniform(); input->randomizeUniform();
inputGpu->copyFrom(*input); inputGpu->copyFrom(*input);
target->bilinearForward(*input, imgSizeH, imgSizeW, {
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW); // nvprof: GPU Profiler
targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW, REGISTER_GPU_PROFILER("testBilinearFwdBwd");
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW); target->bilinearForward(*input, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
}
// check // check
targetCheck->copyFrom(*targetGpu); targetCheck->copyFrom(*targetGpu);
...@@ -104,25 +108,29 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW, ...@@ -104,25 +108,29 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
MatrixCheckErr(*inputGrad, *targetCheckGrad); MatrixCheckErr(*inputGrad, *targetCheckGrad);
} }
TEST(Profiler, BilinearFwdBwd) { TEST(Profiler, testBilinearFwdBwd) {
auto numSamples = 10; auto numSamples = 10;
auto channels = 16; auto channels = 16;
auto imgSize = 64; auto imgSize = 64;
{ {
// nvprof: GPU Profiler // nvprof: GPU Profiler
REGISTER_GPU_PROFILER("testBilinearFwdBwd", REGISTER_GPU_PROFILER("testBilinearFwdBwd");
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
// Paddle built-in timer // Paddle built-in timer
REGISTER_TIMER_INFO("testBilinearFwdBwd", REGISTER_TIMER_INFO("testBilinearFwdBwd",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
testBilinearFwdBwd(numSamples, imgSize, imgSize, channels); testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
} }
globalStat.printStatus("testBilinearFwdBwd"); globalStat.printAllStatus();
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
initMain(argc, argv); initMain(argc, argv);
// nvprof: GPU Profiler
REGISTER_GPU_PROFILER("RecursiveProfilingTest",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }
......
...@@ -203,4 +203,22 @@ StatInfo::~StatInfo() { ...@@ -203,4 +203,22 @@ StatInfo::~StatInfo() {
} }
} }
// File-scope bookkeeping for nested GPU-profiler scopes: the counter tracks
// how deeply REGISTER_GPU_PROFILER regions are nested, and the recursive
// mutex lets the same thread enter a nested profiled region without
// deadlocking on its own lock.
static unsigned g_profileCount = 0;
static std::recursive_mutex g_profileMutex;
// Starts the CUDA profiler only for the outermost profiled scope.
// The guard_ member (a std::lock_guard over g_profileMutex, declared in the
// class) is initialized before the constructor body runs, so the counter
// increment below is always performed while the mutex is held.
// NOTE(review): guard_ keeps the mutex locked for the whole lifetime of the
// profiler object, so profiled regions in other threads are serialized
// against this one — presumably intentional; confirm.
GpuProfiler::GpuProfiler(std::string statName, std::string info)
: guard_(g_profileMutex) {
if (++g_profileCount == 1) {
// Log the stat name/info only for the outermost scope; nested scopes'
// arguments are intentionally not logged here.
LOG(INFO) << "Enable GPU Profiler Stat: ["
<< statName << "] " << info;
hl_profiler_start();
}
}
// Stops the CUDA profiler when the outermost scope unwinds. The decrement
// still happens under the lock: members (including guard_) are destroyed
// only after the destructor body completes.
GpuProfiler::~GpuProfiler() {
if (--g_profileCount == 0) {
hl_profiler_end();
}
}
} // namespace paddle } // namespace paddle
...@@ -283,8 +283,10 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1, ...@@ -283,8 +283,10 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
class GpuProfiler final { class GpuProfiler final {
public: public:
GpuProfiler() { hl_profiler_start(); } GpuProfiler(std::string statName, std::string info);
~GpuProfiler() { hl_profiler_end(); } ~GpuProfiler();
private:
std::lock_guard<std::recursive_mutex> guard_;
}; };
#ifdef PADDLE_DISABLE_PROFILER #ifdef PADDLE_DISABLE_PROFILER
...@@ -293,10 +295,8 @@ public: ...@@ -293,10 +295,8 @@ public:
#else #else
#define REGISTER_GPU_PROFILER(statName, ...) \ #define REGISTER_GPU_PROFILER(statName, ...) \
LOG(INFO) << "Enable GPU Profiler Stat: [" \ GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
<< statName << "] " << #__VA_ARGS__; \
GpuProfiler __gpuProfiler;
#endif // DISABLE_PROFILER #endif // DISABLE_PROFILER
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册