diff --git a/.travis.yml b/.travis.yml index ffe3bc193b49eb3b3318cbbc7f1c3d86dc205c14..effcf90769647960d55b971af0939496dc850e7a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ addons: before_install: - | if [ ${JOB} == "BUILD_AND_TEST" ]; then - if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)' + if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)' then echo "Only markdown docs were updated, stopping build process." exit diff --git a/CMakeLists.txt b/CMakeLists.txt index 090ac9e188422099cc4270b87064b5590e7b620c..af193c27ae7d802a8724fdc1e23b4b5b583e9f7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF) option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND}) option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND}) option(WITH_TIMER "Compile PaddlePaddle use timer" OFF) +option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF) option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND}) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND}) @@ -115,7 +116,6 @@ else() endif(WITH_AVX) if(WITH_DSO) - set(CUDA_LIBRARIES "") add_definitions(-DPADDLE_USE_DSO) endif(WITH_DSO) @@ -135,6 +135,10 @@ if(NOT WITH_TIMER) add_definitions(-DPADDLE_DISABLE_TIMER) endif(NOT WITH_TIMER) +if(NOT WITH_PROFILER) + add_definitions(-DPADDLE_DISABLE_PROFILER) +endif(NOT WITH_PROFILER) + if(WITH_AVX) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}") diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md index e44fa0d38e9982e5d0ed159743994ce6acc51246..b932fbc0fa4443d2fd8abfc9d8a78e68c44f667c 100644 --- 
a/doc/build/build_from_source.md +++ b/doc/build/build_from_source.md @@ -95,7 +95,7 @@ As a simple example, consider the following: ```bash # necessary sudo apt-get update - sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git + sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git # optional sudo apt-get install libgoogle-glog-dev sudo apt-get install libgflags-dev @@ -149,15 +149,15 @@ If still not found, you can manually set it based on CMake error information fro As a simple example, consider the following: -- **Only CPU** +- **Only CPU with swig** ```bash - cmake .. -DWITH_GPU=OFF + cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON ``` -- **GPU** +- **GPU with swig** ```bash - cmake .. -DWITH_GPU=ON + cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON ``` - **GPU with doc and swig** @@ -170,15 +170,13 @@ Finally, you can build PaddlePaddle: ```bash # you can add build option here, such as: -cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX= +cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX= -DWITH_SWIG_PY=ON # please use sudo make install, if you want to install PaddlePaddle into the system make -j `nproc` && make install # set PaddlePaddle installation path in ~/.bashrc export PATH=/bin:$PATH ``` -**Note:** - If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed. Otherwise, PaddlePaddle will automatically install python dependencies at first time when user run paddle commands, such as `paddle version`, `paddle train`. 
diff --git a/doc/index.rst b/doc/index.rst index 668ad75a902bdd14c6198c41380ae93e29cec0d3..76fb7a3ace8057d9cd34e03134c63ef0cd298cae 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -8,3 +8,4 @@ PaddlePaddle Documentation user_guide.rst dev/index.rst algorithm/index.rst + optimization/index.rst diff --git a/doc/optimization/gpu_profiling.rst b/doc/optimization/gpu_profiling.rst new file mode 100644 index 0000000000000000000000000000000000000000..667bf1364e7cd4c9098caba72a127228d78ca38b --- /dev/null +++ b/doc/optimization/gpu_profiling.rst @@ -0,0 +1,237 @@ +Profiling on PaddlePaddle +========================= + +This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using the built-in timer, **nvprof** and **nvvp**. + +- What is profiling? +- Why we need profiling? +- How to do profiling? +- Profiler tools +- Hands-on approach +- Profiling tips + +What's profiling? +================= +In software engineering, profiling is a form of dynamic program analysis that measures the space (memory) or time +complexity of a program, the usage of particular instructions, or the frequency and duration of function calls. +Most commonly, profiling information serves to aid program optimization. + +Briefly, a profiler is used to measure application performance. Program analysis tools are extremely important for +understanding program behavior. Simple profiling can tell you how long an operation takes. Advanced +profiling can explain why an operation takes a long time. + +Why we need profiling? +====================== +Since training a deep neural network typically takes a very long time, performance is gradually becoming +the most important thing in the deep learning field. The first step to improve performance is to understand which parts +are slow. There is no point in improving the performance of a region which doesn’t take much time! + + +How to do profiling? 
+==================== +To achieve maximum performance, there are five steps you can take to reach your goals. + +- Profile the code +- Find the slow parts +- Work out why they’re slow +- Make them fast +- Profile the code again + +Usually, a processor has two key performance limits: floating-point throughput and +memory throughput. A GPU also needs more parallelism to fulfill its potential. +This is why GPUs can be so fast. + +Profiler Tools +============== +For general GPU profiling, a number of tools are provided by both NVIDIA and third parties. + +**nvprof** is the Nvidia profiler and **nvvp** is the (GUI based) Nvidia visual profiler. +In this tutorial, we will focus on nvprof and nvvp. + +:code:`test_GpuProfiler` from the :code:`paddle/math/tests` directory will be used to evaluate +the above profilers. + +.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 111-124 + :linenos: + +The above code snippet includes two methods; you can use either of them to profile the regions of interest. + +1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels. + +2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid +program crashes when the CPU version of PaddlePaddle invokes them. + +You can find more details about how to use both of them in the next section. + +Hands-on Approach +================= + +Built-in Timer +-------------- + +To enable the built-in timer in PaddlePaddle, first you have to add :code:`REGISTER_TIMER_INFO` into the regions of your interest. +Then, all information can be printed to the console via the :code:`printStatus` or :code:`printAllStatus` function. +As a simple example, consider the following: + +1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines). + + .. 
literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 111-124 + :emphasize-lines: 8-10,13 + :linenos: + +2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle. + + .. code-block:: bash + + cmake .. -DWITH_TIMER=ON + make + +3. Execute your code and observe the results (see the emphasize-lines). + + .. code-block:: bash + :emphasize-lines: 1,12-15 + + > ./paddle/math/tests/test_GpuProfiler + I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler + I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions + I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done. + [==========] Running 1 test from 1 test case. + [----------] Global test environment set-up. + [----------] 1 test from Profiler + [ RUN ] Profiler.BilinearFwdBwd + I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im + gSizeX = 64, imgSizeY = 64" + I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751 + I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ====== + I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1 + I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ====== + I1117 11:13:42.981575 2522362816 Stat.cpp:154] -------------------------------------------------- + [ OK ] Profiler.BilinearFwdBwd (136 ms) + [----------] 1 test from Profiler (136 ms total) + + [----------] Global test environment tear-down + [==========] 1 test from 1 test case ran. (136 ms total) + [ PASSED ] 1 test. + +nvprof profiler +--------------- + +To use this command line profiler **nvprof**, you can simply issue the following command: + +1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines). + + .. 
literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp + :language: c++ + :lines: 111-124 + :emphasize-lines: 6-7 + :linenos: + +2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle. + + .. code-block:: bash + + cmake .. -DWITH_PROFILER=ON + make + +3. Use Nvidia profiler **nvprof** to profile the binary. + + .. code-block:: bash + + nvprof ./paddle/math/tests/test_GpuProfiler + +Then, you can get the following profiling result: + +.. code-block:: bash + + ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler + ==78544== Profiling result: + Time(%) Time Calls Avg Min Max Name + 27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD] + 26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw + 23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw + 22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH] + + ==78544== API calls: + Time(%) Time Calls Avg Min Max Name + 46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags + 39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree + 9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate + 1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy + 1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize + 0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc + 0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc + 0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice + 0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags + 0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute + 0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount + 0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties + 0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch + 0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName + 0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem + 0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice + 0.00% 38.918us 1 38.918us 38.918us 38.918us 
cudaEventCreate + 0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute + 0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart + 0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall + 0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError + 0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument + 0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet + 0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount + 0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion + 0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit + 0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion + + +nvvp profiler +------------- + +For the visual profiler **nvvp**, you can either import the output of :code:`nvprof -o ...` or +run the application through the GUI. + +**Note: nvvp also supports CPU profiling** (Click the box in nvvp to enable profile execution on CPU). + +.. image:: nvvp1.png + :align: center + :scale: 33% + +From the perspective of kernel functions, **nvvp** can even illustrate why an operation takes a long time. +As shown in the following figure, the kernel's block usage, register usage and shared memory usage from :code:`nvvp` +allow us to fully utilize all warps on the GPU. + +.. image:: nvvp2.png + :align: center + :scale: 33% + +From the perspective of the application, **nvvp** can give you some suggestions to address performance bottlenecks. +For instance, the advice on data movement and compute utilization in the figures below can guide you in tuning performance. + +.. image:: nvvp3.png + :align: center + :scale: 33% + +.. image:: nvvp4.png + :align: center + :scale: 33% + +Profiling tips +============== + +- The **nvprof** and **nvvp** output is a very good place to start. +- The timeline is a good place to go next. +- Only dig deep into a kernel if it’s taking a significant amount of your time. +- Where possible, try to match profiler output with theory. 
+ 1) For example, if I know I’m moving 1GB, and my kernel takes 10ms, I expect the profiler to report 100GB/s. + 2) Discrepancies are likely to mean your application isn’t doing what you thought it was. +- Know your hardware: If your GPU can do 6 TFLOPs, and you’re already doing 5.5 TFLOPs, you won’t go much faster! + + +Profiling is a key step in optimization. Sometimes quite simple changes can lead to big improvements in performance. +Your mileage may vary! + +Reference +========= +Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015 diff --git a/doc/optimization/index.rst b/doc/optimization/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..c9e87e0778dfe44fa3d1bb84d0ad340aa6f25d08 --- /dev/null +++ b/doc/optimization/index.rst @@ -0,0 +1,7 @@ +Performance Tuning +================== + +.. toctree:: + :maxdepth: 3 + + gpu_profiling.rst diff --git a/doc/optimization/nvvp1.png b/doc/optimization/nvvp1.png new file mode 100644 index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77 Binary files /dev/null and b/doc/optimization/nvvp1.png differ diff --git a/doc/optimization/nvvp2.png b/doc/optimization/nvvp2.png new file mode 100644 index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29 Binary files /dev/null and b/doc/optimization/nvvp2.png differ diff --git a/doc/optimization/nvvp3.png b/doc/optimization/nvvp3.png new file mode 100644 index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db Binary files /dev/null and b/doc/optimization/nvvp3.png differ diff --git a/doc/optimization/nvvp4.png b/doc/optimization/nvvp4.png new file mode 100644 index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01 Binary files /dev/null and b/doc/optimization/nvvp4.png differ diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h index 
357286e3188a6f3184bc56e75232bf2e1ec54e44..2c7d665101f36f9c32ab132ca279abf3ac062a8f 100644 --- a/paddle/cuda/include/hl_cuda.h +++ b/paddle/cuda/include/hl_cuda.h @@ -15,8 +15,8 @@ limitations under the License. */ #ifndef HL_CUDA_H_ #define HL_CUDA_H_ -#include "hl_base.h" #include +#include "hl_base.h" /** * @brief HPPL event. @@ -332,4 +332,14 @@ extern bool hl_cuda_event_is_ready(hl_event_t event); */ extern void hl_device_synchronize(); +/** + * @brief gpu profiler start + */ +extern void hl_profiler_start(); + +/** + * @brief gpu profiler stop + */ +extern void hl_profiler_end(); + #endif // HL_CUDA_H_ diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h index 1f91068cdf8b3d472c4b403d1ec7d5293c28c07e..24923a0d4a0cdd49214305c2f7716eeef575c7ee 100644 --- a/paddle/cuda/include/stub/hl_cuda_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_stub.h @@ -90,4 +90,8 @@ inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; } inline void hl_device_synchronize() {} +inline void hl_profiler_start() {} + +inline void hl_profiler_end() {} + #endif // HL_CUDA_STUB_H_ diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index 745be35b56278ed2e0033d5fd2806320d3164d7c..6b71a538485a09cf40a53eddf1ee2f3e2c768b2c 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include +#include #include -#include #include +#include +#include #include #include "hl_cuda.h" #include "hl_cuda.ph" -#include "hl_thread.ph" #include "hl_dso_loader.h" +#include "hl_thread.ph" #include "paddle/utils/Logging.h" namespace dynload { @@ -133,7 +134,9 @@ void *cudart_dso_handle = nullptr; __macro(cudaGetLastError) \ __macro(cudaFuncSetCacheConfig) \ __macro(cudaRuntimeGetVersion) \ - __macro(cudaGetErrorString) + __macro(cudaGetErrorString) \ + __macro(cudaProfilerStart) \ + __macro(cudaProfilerStop) // clang-format on CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP) @@ -742,3 +745,7 @@ bool hl_cuda_event_is_ready(hl_event_t event) { } return true; } + +void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); } + +void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); } diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index 247be983ba3296383c8e2f30f1036859ecfde492..33d4478b4d36d7be5da6fb43365acb95e5bc7d04 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -14,3 +14,4 @@ add_simple_unittest(test_perturbation) add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_Allocator) add_simple_unittest(test_FPException) +add_simple_unittest(test_GpuProfiler) \ No newline at end of file diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c3542b7834224e2fa6fe323a1fbe8ea1e7cd68de --- /dev/null +++ b/paddle/math/tests/test_GpuProfiler.cpp @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef PADDLE_ONLY_CPU + +#include "paddle/utils/Util.h" +#include "paddle/math/Matrix.h" +#include "paddle/math/SparseMatrix.h" +#include +#include "paddle/gserver/tests/TestUtil.h" +#include "paddle/utils/Stat.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) { + CHECK(matrix1.getHeight() == matrix2.getHeight()); + CHECK(matrix1.getWidth() == matrix2.getWidth()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + const real* data1 = matrix1.getData(); + const real* data2 = matrix2.getData(); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + real a = data1[i * width + j]; + real b = data2[i * width + j]; + if (fabs(a - b) > err) { + if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) { + count++; + } + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW, + int channels) { + int inWidth = imgSizeH * imgSizeW * channels; + int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels; + real ratioH = 0.5; + real ratioW = 0.5; + + // forward + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + 
MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + + input->randomizeUniform(); + inputGpu->copyFrom(*input); + + { + // nvprof: GPU Profiler + REGISTER_GPU_PROFILER("testBilinearFwdBwd"); + target->bilinearForward(*input, imgSizeH, imgSizeW, + 2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW); + targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW, + 2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW); + } + + // check + targetCheck->copyFrom(*targetGpu); + MatrixCheckErr(*target, *targetCheck); + + // backward + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false, + true); + MatrixPtr targetCheckGrad = + CpuMatrix::create(numSamples, inWidth, false, false); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->bilinearBackward(*targetGrad, 2 * imgSizeH, 2 * imgSizeW, + imgSizeH, imgSizeW, channels, ratioH, ratioW); + inputGpuGrad->bilinearBackward(*targetGpuGrad, 2 * imgSizeH, 2 * imgSizeW, + imgSizeH, imgSizeW, channels, ratioH, ratioW); + + // check + targetCheckGrad->copyFrom(*inputGpuGrad); + MatrixCheckErr(*inputGrad, *targetCheckGrad); +} + +TEST(Profiler, testBilinearFwdBwd) { + auto numSamples = 10; + auto channels = 16; + auto imgSize = 64; + { + // nvprof: GPU Profiler + REGISTER_GPU_PROFILER("testBilinearFwdBwd"); + // Paddle built-in timer + REGISTER_TIMER_INFO("testBilinearFwdBwd", + "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); + testBilinearFwdBwd(numSamples, imgSize, imgSize, channels); + } + globalStat.printAllStatus(); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + + 
// nvprof: GPU Profiler + REGISTER_GPU_PROFILER("RecursiveProfilingTest", + "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64"); + + return RUN_ALL_TESTS(); +} + +#endif /* PADDLE_ONLY_CPU */ diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp index d7b20ca5eb2f4eadaa6b4acad056d669a9b59c14..ab140c33502ad315d087bb3afc7f39bffc122894 100644 --- a/paddle/utils/Stat.cpp +++ b/paddle/utils/Stat.cpp @@ -65,6 +65,7 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) { auto showStat = [&](const StatInfo* info, pid_t tid, bool isFirst = true) { uint64_t average = 0; if (info->count_ > 0) { + outPut << std::setfill(' ') << std::left; if (!isFirst) { outPut << std::setw(42) << " "; } @@ -202,4 +203,22 @@ StatInfo::~StatInfo() { } } +static unsigned g_profileCount = 0; +static std::recursive_mutex g_profileMutex; + +GpuProfiler::GpuProfiler(std::string statName, std::string info) + : guard_(g_profileMutex) { + if (++g_profileCount == 1) { + LOG(INFO) << "Enable GPU Profiler Stat: [" + << statName << "] " << info; + hl_profiler_start(); + } +} + +GpuProfiler::~GpuProfiler() { + if (--g_profileCount == 0) { + hl_profiler_end(); + } +} + } // namespace paddle diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h index 4051145d9246639fce5d041103c1211a939eddca..1ef688ea8da53ee0cd51b1775e671f2b10be782b 100644 --- a/paddle/utils/Stat.h +++ b/paddle/utils/Stat.h @@ -15,19 +15,19 @@ limitations under the License. 
*/ #pragma once #include -#include #include -#include #include +#include +#include #include +#include #include -#include -#include "Logging.h" #include "BarrierStat.h" #include "Locks.h" +#include "Logging.h" #include "ThreadLocal.h" -#include "BarrierStat.h" +#include "hl_gpu.h" namespace paddle { @@ -283,4 +283,24 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1, #endif // DISABLE_TIMER +class GpuProfiler final { +public: + GpuProfiler(std::string statName, std::string info); + ~GpuProfiler(); + +private: + std::lock_guard guard_; +}; + +#ifdef PADDLE_DISABLE_PROFILER + +#define REGISTER_GPU_PROFILER(statName, ...) + +#else + +#define REGISTER_GPU_PROFILER(statName, ...) \ + GpuProfiler __gpuProfiler(statName, #__VA_ARGS__); + +#endif // DISABLE_PROFILER + } // namespace paddle