Commit 8393c19c authored by liaogang

Add recursive mutex and counter for gpu profiler

Parent 9670b9a1
......@@ -24,7 +24,7 @@ Why we need profiling?
======================
Since training a deep neural network typically takes a very long time to finish, performance is gradually becoming
the most important concern in the deep learning field. The first step to improving performance is to understand what parts
are slow. No point in improving performance of a region which doesn’t take much time!
are slow. There is no point in improving performance of a region which doesn’t take much time!
How to do profiling?
......@@ -59,6 +59,7 @@ above profilers.
The above code snippet includes two methods; you can use either of them to profile the regions of interest (a minimal usage sketch follows the list).
1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both CPU functions and CUDA kernels.
2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
   program crashes when the CPU version of PaddlePaddle invokes them.
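For orientation, here is a minimal usage sketch (not part of this commit) combining the two macros inside one scope; `runGpuWorkload()` and the header path are assumptions for illustration, while the macro names and `globalStat` come from the changes below:

```cpp
// Minimal sketch, assuming the macros live in paddle/utils/Stat.h;
// runGpuWorkload() is a hypothetical placeholder for the measured region.
#include "paddle/utils/Stat.h"

void runGpuWorkload();  // hypothetical workload to be profiled

void profileWorkload() {
  {
    // nvprof: GPU Profiler -- wraps cudaProfilerStart/cudaProfilerStop.
    REGISTER_GPU_PROFILER("runGpuWorkload");
    // Paddle built-in timer -- records the wall time of this scope.
    REGISTER_TIMER_INFO("runGpuWorkload", "batchSize = 128");
    runGpuWorkload();
  }
  globalStat.printAllStatus();  // dump the collected timer statistics
}
```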
......
......@@ -70,10 +70,14 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
input->randomizeUniform();
inputGpu->copyFrom(*input);
target->bilinearForward(*input, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
{
// nvprof: GPU Profiler
REGISTER_GPU_PROFILER("testBilinearFwdBwd");
target->bilinearForward(*input, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
}
// check
targetCheck->copyFrom(*targetGpu);
......@@ -104,25 +108,29 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
MatrixCheckErr(*inputGrad, *targetCheckGrad);
}
TEST(Profiler, BilinearFwdBwd) {
TEST(Profiler, testBilinearFwdBwd) {
auto numSamples = 10;
auto channels = 16;
auto imgSize = 64;
{
// nvprof: GPU Profiler
REGISTER_GPU_PROFILER("testBilinearFwdBwd",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
REGISTER_GPU_PROFILER("testBilinearFwdBwd");
// Paddle built-in timer
REGISTER_TIMER_INFO("testBilinearFwdBwd",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
}
globalStat.printStatus("testBilinearFwdBwd");
globalStat.printAllStatus();
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
// nvprof: GPU Profiler
REGISTER_GPU_PROFILER("RecursiveProfilingTest",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
return RUN_ALL_TESTS();
}
......
......@@ -203,4 +203,22 @@ StatInfo::~StatInfo() {
}
}
static unsigned g_profileCount = 0;
static std::recursive_mutex g_profileMutex;
GpuProfiler::GpuProfiler(std::string statName, std::string info)
    : guard_(g_profileMutex) {
  if (++g_profileCount == 1) {
    LOG(INFO) << "Enable GPU Profiler Stat: ["
              << statName << "] " << info;
    hl_profiler_start();
  }
}

GpuProfiler::~GpuProfiler() {
  if (--g_profileCount == 0) {
    hl_profiler_end();
  }
}
} // namespace paddle
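The key change: the CUDA profiler is now started and stopped by reference count, so only the outermost `GpuProfiler` instance calls `hl_profiler_start()`/`hl_profiler_end()`, and the `std::recursive_mutex` lets nested instances on the same thread (for example, the `REGISTER_GPU_PROFILER` in `main()` plus the one inside a test) acquire the lock again without deadlocking. A self-contained sketch of the same pattern, with the `hl_profiler_*` calls stubbed so it compiles on its own:

```cpp
#include <iostream>
#include <mutex>

// Stubs standing in for hl_profiler_start()/hl_profiler_end().
static void profilerStart() { std::cout << "cudaProfilerStart()\n"; }
static void profilerStop()  { std::cout << "cudaProfilerStop()\n"; }

static unsigned g_count = 0;
static std::recursive_mutex g_mutex;

class ScopedGpuProfiler {
public:
  ScopedGpuProfiler() : guard_(g_mutex) {
    if (++g_count == 1) profilerStart();  // outermost scope only
  }
  ~ScopedGpuProfiler() {
    if (--g_count == 0) profilerStop();   // outermost scope only
  }

private:
  std::lock_guard<std::recursive_mutex> guard_;  // released in the destructor
};

int main() {
  ScopedGpuProfiler outer;    // prints "cudaProfilerStart()"
  {
    ScopedGpuProfiler inner;  // nested on the same thread: re-locks, no second start
  }
}                             // outer goes out of scope: prints "cudaProfilerStop()"
```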
......@@ -283,8 +283,10 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
class GpuProfiler final {
public:
  GpuProfiler() { hl_profiler_start(); }
  ~GpuProfiler() { hl_profiler_end(); }
  GpuProfiler(std::string statName, std::string info);
  ~GpuProfiler();

private:
  std::lock_guard<std::recursive_mutex> guard_;
};
#ifdef PADDLE_DISABLE_PROFILER
......@@ -293,10 +295,8 @@ public:
#else
#define REGISTER_GPU_PROFILER(statName, ...) \
LOG(INFO) << "Enable GPU Profiler Stat: [" \
<< statName << "] " << #__VA_ARGS__; \
GpuProfiler __gpuProfiler;
#define REGISTER_GPU_PROFILER(statName, ...) \
GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
#endif // DISABLE_PROFILER
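For reference (not part of the diff), the new macro stringifies its variadic arguments via `#__VA_ARGS__` and forwards them as the `info` string, so a call such as `REGISTER_GPU_PROFILER("testBilinearFwdBwd", "numSamples = 10")` expands roughly to:

```cpp
// Illustrative expansion: the quotes of the original string literal are
// escaped by the # stringification operator.
GpuProfiler __gpuProfiler("testBilinearFwdBwd", "\"numSamples = 10\"");
```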
......