提交 345c6263 编写于 作者: L liaogang

Fix conflict with gpu profiling docs

......@@ -42,7 +42,7 @@ addons:
before_install:
- |
if [ ${JOB} == "BUILD_AND_TEST" ]; then
if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
then
echo "Only markdown docs were updated, stopping build process."
exit
......
......@@ -36,6 +36,7 @@ option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
......@@ -115,7 +116,6 @@ else()
endif(WITH_AVX)
if(WITH_DSO)
set(CUDA_LIBRARIES "")
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)
......@@ -135,6 +135,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
......
......@@ -18,7 +18,5 @@ set -x
# download the dictionary and pretrained model
for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
do
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
done
......@@ -24,9 +24,7 @@ echo "Downloading ResNet models..."
for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz
do
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
tar -xvf $file
rm $file
done
......
......@@ -16,9 +16,7 @@ set -e
set -x
# download the in-house paraphrase dataset
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz
# untar the dataset
tar -zxvf paraphrase.tar.gz
......
......@@ -16,9 +16,7 @@ set -e
set -x
# download the pretrained model
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
# untar the model
tar -zxvf wmt14_model.tar.gz
......
......@@ -95,7 +95,7 @@ As a simple example, consider the following:
```bash
# necessary
sudo apt-get update
sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
# optional
sudo apt-get install libgoogle-glog-dev
sudo apt-get install libgflags-dev
......@@ -149,15 +149,15 @@ If still not found, you can manually set it based on CMake error information fro
As a simple example, consider the following:
- **Only CPU**
- **Only CPU with swig**
```bash
cmake .. -DWITH_GPU=OFF
cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
```
- **GPU**
- **GPU with swig**
```bash
cmake .. -DWITH_GPU=ON
cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
```
- **GPU with doc and swig**
......@@ -170,15 +170,13 @@ Finally, you can build PaddlePaddle:
```bash
# you can add build option here, such as:
cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install>
cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install> -DWITH_SWIG_PY=ON
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<path to install>/bin:$PATH
```
**Note:**
If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
Otherwise, PaddlePaddle will automatically install python dependencies
at first time when user run paddle commands, such as `paddle version`, `paddle train`.
......
Profiling on PaddlePaddle
=========================
This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**.
- What is profiling?
- Why we need profiling?
- How to do profiling?
- Profile tools
- Hands-on Tutorial
- Profiling tips
What's profiling?
=================
In software engineering, profiling is a form of dynamic program analysis that measures the space (memory) or time
complexity of a program, the usage of particular instructions, or the frequency and duration of function calls.
Most commonly, profiling information serves to aid program optimization.
Briefly, profiler is used to measure application performance. Program analysis tools are extremely important for
understanding program behavior. Simple profiling can tell you that how long does an operation take? For advanced
profiling, it can interpret why does an operation take a long time?
Why we need profiling?
======================
Since training deep neural network typically take a very long time to get over, performance is gradually becoming
the most important thing in deep learning field. The first step to improve performance is to understand what parts
are slow. There is no point in improving performance of a region which doesn’t take much time!
How to do profiling?
====================
To achieve maximum performance, there are five steps you can take to reach your goals.
- Profile the code
- Find the slow parts
- Work out why they’re slow
- Make them fast
- Profile the code again
Usually, processor has two key performance limits include float point throughput and
memory throughput. For GPU, it also need more parallelism to fulfill its potential.
This is why they can be so fast.
Profiler Tools
==============
For general GPU profiling, a bunch of tools are provided from both NVIDIA and third party.
**nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler.
In this tutorial, we will focus on nvprof and nvvp.
:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
above profilers.
.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
:language: c++
:lines: 111-124
:linenos:
The above code snippet includes two methods, you can use any of them to profile the regions of interest.
1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels.
2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
program crashes when CPU version of PaddlePaddle invokes them.
You can find more details about how to use both of them in the next session.
Hands-on Approach
=================
Built-in Timer
--------------
To enable built-in timer in PaddlePaddle, first you have to add :code:`REGISTER_TIMER_INFO` into the regions of you interest.
Then, all information could be stamped in the console via :code:`printStatus` or :code:`printAllStatus` function.
As a simple example, consider the following:
1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
:language: c++
:lines: 111-124
:emphasize-lines: 8-10,13
:linenos:
2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
.. code-block:: bash
cmake .. -DWITH_TIMER=ON
make
3. Execute your code and observe the results (see the emphasize-lines).
.. code-block:: bash
:emphasize-lines: 1,12-15
> ./paddle/math/tests/test_GpuProfiler
I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from Profiler
[ RUN ] Profiler.BilinearFwdBwd
I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
gSizeX = 64, imgSizeY = 64"
I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1
I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
[ OK ] Profiler.BilinearFwdBwd (136 ms)
[----------] 1 test from Profiler (136 ms total)
[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (136 ms total)
[ PASSED ] 1 test.
nvprof profiler
---------------
To use this command line profiler **nvprof**, you can simply issue the following command:
1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines).
.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
:language: c++
:lines: 111-124
:emphasize-lines: 6-7
:linenos:
2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle.
.. code-block:: bash
cmake .. -DWITH_PROFILER=ON
make
3. Use Nvidia profiler **nvprof** to profile the binary.
.. code-block:: bash
nvprof ./paddle/math/tests/test_GpuProfiler
Then, you can get the following profiling result:
.. code-block:: bash
==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
==78544== Profiling result:
Time(%) Time Calls Avg Min Max Name
27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD]
26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw
23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw
22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH]
==78544== API calls:
Time(%) Time Calls Avg Min Max Name
46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags
39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree
9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate
1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy
1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize
0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc
0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc
0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice
0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags
0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute
0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount
0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties
0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch
0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName
0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem
0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice
0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate
0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute
0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart
0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall
0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError
0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument
0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet
0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount
0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion
0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit
0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion
nvvp profiler
-------------
For visual profiler **nvvp**, you can either import the output of :code:`nvprof –o ...` or
run application through GUI.
**Note: nvvp also support CPU profiling** (Click the box in nvvp to enable profile execution on CPU).
.. image:: nvvp1.png
:align: center
:scale: 33%
From the perspective of kernel functions, **nvvp** can even illustrate why does an operation take a long time?
As shown in the following figure, kernel's block usage, register usage and shared memory usage from :code:`nvvp`
allow us to fully utilize all warps on the GPU.
.. image:: nvvp2.png
:align: center
:scale: 33%
From the perspective of application, **nvvp** can give you some suggestions to address performance bottleneck.
For instance, some advice in data movement and compute utilization from the below figure can guide you to tune performance.
.. image:: nvvp3.png
:align: center
:scale: 33%
.. image:: nvvp4.png
:align: center
:scale: 33%
Profiling tips
==============
- The **nvprof** and **nvvp** output is a very good place to start.
- The timeline is a good place to go next.
- Only dig deep into a kernel if it’s taking a significant amount of your time.
- Where possible, try to match profiler output with theory.
1) For example, if I know I’m moving 1GB, and my kernel takes 10ms, I expect the profiler to report 100GB/s.
2) Discrepancies are likely to mean your application isn’t doing what you thought it was.
- Know your hardware: If your GPU can do 6 TFLOPs, and you’re already doing 5.5 TFLOPs, you won’t go much faster!
Profiling is a key step in optimization. Sometimes quite simple changes can lead to big improvements in performance.
Your mileage may vary!
Reference
=========
Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
Performance Tuning
==================
.. toctree::
:maxdepth: 3
gpu_profiling.rst
# 支持双层序列作为输入的Layer
###########################
支持双层序列作为输入的Layer
###########################
## 概述
.. contents::
概述
====
在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。
......@@ -12,55 +17,79 @@
+ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息
+ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列
在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。
## pooling_layer
pooling_layer的使用示例如下,详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer">配置API</a>
```python
seq_pool = pooling_layer(input=layer,
pooling_layer
==============
pooling_layer 的使用示例如下,详细见 `pooling_layer`_ 配置API。
.. code-block:: bash
seq_pool = pooling_layer(input=layer,
pooling_type=AvgPooling(),
agg_level=AggregateLevel.EACH_SEQUENCE)
```
- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。
- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
- `agg_level=AggregateLevel.TIMESTEP` 时(默认值):
- 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列
- 输入:一个双层序列,或一个单层序列
- 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值)
- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
- `agg_level=AggregateLevel.EACH_SEQUENCE` 时:
- 作用:一个双层序列经过运算变成一个单层序列
- 输入:必须是一个双层序列
- 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值)
## last_seq 和 first_seq
last_seq 和 first_seq
=====================
last_seq 的使用示例如下( `first_seq`_ 类似),详细见 `last_seq`_ 配置API。
.. code-block:: bash
last_seq的使用示例如下(first_seq类似),详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq">配置API</a>
```python
last = last_seq(input=layer,
last = last_seq(input=layer,
agg_level=AggregateLevel.EACH_SEQUENCE)
```
- `agg_level=AggregateLevel.TIMESTEP`时(默认值):
- `agg_level=AggregateLevel.TIMESTEP` 时(默认值):
- 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列
- 输入:一个双层序列或一个单层序列
- 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。
- `agg_level=AggregateLevel.EACH_SEQUENCE`时:
- `agg_level=AggregateLevel.EACH_SEQUENCE` 时:
- 作用:一个双层序列经过运算变成一个单层序列
- 输入:必须是一个双层序列
- 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。
## expand_layer
expand_layer
============
expand_layer的使用示例如下,详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer">配置API</a>
```python
expand = expand_layer(input=layer1,
expand_layer 的使用示例如下,详细见 `expand_layer`_ 配置API。
.. code-block:: bash
expand = expand_layer(input=layer1,
expand_as=layer2,
expand_level=ExpandLevel.FROM_TIMESTEP)
```
- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值):
- `expand_level=ExpandLevel.FROM_TIMESTEP` 时(默认值):
- 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列
- 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息
- 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝
- `expand_level=ExpandLevel.FROM_SEQUENCE`时:
- 输入:layer1必须是一个0层序列,是待扩展的数据;layer2 可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息
- 输出:一个单层序列或一个双层序列,输出序列的类型(双层序列或单层序列)和序列中含有元素的数目同 layer2 一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝
- `expand_level=ExpandLevel.FROM_SEQUENCE` 时:
- 作用:一个单层序列经过运算扩展成一个双层序列
- 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息
- 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。
- 输入:layer1必须是一个单层序列,是待扩展的数据;layer2 必须是一个双层序列,提供扩展的长度信息
- 输出:一个双层序列,序列中含有元素的数目同 layer2 一致。要求单层序列含有元素的数目(0层序列)和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个 subseq 。
.. _pooling_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer
.. _last_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq
.. _first_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#first-seq
.. _expand_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer
......@@ -11,7 +11,7 @@ PaddlePaddle的ubuntu安装包分为四个版本,他们是 cpu、gpu、cpu-noa
.. code-block:: shell
gdebi paddle-*-cpu.deb
gdebi paddle-*-cpu*.deb
如果 :code:`gdebi` 没有安装,则需要使用 :code:`sudo apt-get install gdebi`, 来安装 :code:`gdebi` 。
......@@ -20,7 +20,7 @@ PaddlePaddle的ubuntu安装包分为四个版本,他们是 cpu、gpu、cpu-noa
.. code-block:: shell
dpkg -i paddle-*-cpu.deb
dpkg -i paddle-*-cpu*.deb
apt-get install -f
在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的,
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#ifndef HL_CUDA_H_
#define HL_CUDA_H_
#include "hl_base.h"
#include <string>
#include "hl_base.h"
/**
* @brief HPPL event.
......@@ -332,4 +332,14 @@ extern bool hl_cuda_event_is_ready(hl_event_t event);
*/
extern void hl_device_synchronize();
/**
* @brief gpu profiler start
*/
extern void hl_profiler_start();
/**
* @brief gpu profiler stop
*/
extern void hl_profiler_end();
#endif // HL_CUDA_H_
......@@ -90,4 +90,8 @@ inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
inline void hl_device_synchronize() {}
inline void hl_profiler_start() {}
inline void hl_profiler_end() {}
#endif // HL_CUDA_STUB_H_
......@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sys/time.h>
#include <cuda_profiler_api.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>
#include <mutex>
#include "hl_cuda.h"
#include "hl_cuda.ph"
#include "hl_thread.ph"
#include "hl_dso_loader.h"
#include "hl_thread.ph"
#include "paddle/utils/Logging.h"
namespace dynload {
......@@ -133,7 +134,9 @@ void *cudart_dso_handle = nullptr;
__macro(cudaGetLastError) \
__macro(cudaFuncSetCacheConfig) \
__macro(cudaRuntimeGetVersion) \
__macro(cudaGetErrorString)
__macro(cudaGetErrorString) \
__macro(cudaProfilerStart) \
__macro(cudaProfilerStop)
// clang-format on
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
......@@ -742,3 +745,7 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
}
return true;
}
void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
......@@ -14,3 +14,4 @@ add_simple_unittest(test_perturbation)
add_simple_unittest(test_CpuGpuVector)
add_simple_unittest(test_Allocator)
add_simple_unittest(test_FPException)
add_simple_unittest(test_GpuProfiler)
\ No newline at end of file
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_ONLY_CPU
#include "paddle/utils/Util.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
#include <gtest/gtest.h>
#include "paddle/gserver/tests/TestUtil.h"
#include "paddle/utils/Stat.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
CHECK(matrix1.getHeight() == matrix2.getHeight());
CHECK(matrix1.getWidth() == matrix2.getWidth());
#ifndef PADDLE_TYPE_DOUBLE
real err = 1e-3;
#else
real err = 1e-10;
#endif
int height = matrix1.getHeight();
int width = matrix1.getWidth();
const real* data1 = matrix1.getData();
const real* data2 = matrix2.getData();
int count = 0;
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
real a = data1[i * width + j];
real b = data2[i * width + j];
if (fabs(a - b) > err) {
if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
count++;
}
}
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different element.";
}
void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
int channels) {
int inWidth = imgSizeH * imgSizeW * channels;
int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
real ratioH = 0.5;
real ratioW = 0.5;
// forward
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
input->randomizeUniform();
inputGpu->copyFrom(*input);
{
// nvprof: GPU Proflier
REGISTER_GPU_PROFILER("testBilinearFwdBwd");
target->bilinearForward(*input, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
}
// check
targetCheck->copyFrom(*targetGpu);
MatrixCheckErr(*target, *targetCheck);
// backward
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false,
true);
MatrixPtr targetCheckGrad =
CpuMatrix::create(numSamples, inWidth, false, false);
inputGrad->randomizeUniform();
targetGrad->randomizeUniform();
inputGpuGrad->copyFrom(*inputGrad);
targetGpuGrad->copyFrom(*targetGrad);
inputGrad->bilinearBackward(*targetGrad, 2 * imgSizeH, 2 * imgSizeW,
imgSizeH, imgSizeW, channels, ratioH, ratioW);
inputGpuGrad->bilinearBackward(*targetGpuGrad, 2 * imgSizeH, 2 * imgSizeW,
imgSizeH, imgSizeW, channels, ratioH, ratioW);
// check
targetCheckGrad->copyFrom(*inputGpuGrad);
MatrixCheckErr(*inputGrad, *targetCheckGrad);
}
TEST(Profiler, testBilinearFwdBwd) {
auto numSamples = 10;
auto channels = 16;
auto imgSize = 64;
{
// nvprof: GPU Proflier
REGISTER_GPU_PROFILER("testBilinearFwdBwd");
// Paddle built-in timer
REGISTER_TIMER_INFO("testBilinearFwdBwd",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
}
globalStat.printAllStatus();
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
// nvprof: GPU Proflier
REGISTER_GPU_PROFILER("RecursiveProfilingTest",
"numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
return RUN_ALL_TESTS();
}
#endif /* PADDLE_ONLY_CPU */
......@@ -65,6 +65,7 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
auto showStat = [&](const StatInfo* info, pid_t tid, bool isFirst = true) {
uint64_t average = 0;
if (info->count_ > 0) {
outPut << std::setfill(' ') << std::left;
if (!isFirst) {
outPut << std::setw(42) << " ";
}
......@@ -202,4 +203,22 @@ StatInfo::~StatInfo() {
}
}
static unsigned g_profileCount = 0;
static std::recursive_mutex g_profileMutex;
GpuProfiler::GpuProfiler(std::string statName, std::string info)
: guard_(g_profileMutex) {
if (++g_profileCount == 1) {
LOG(INFO) << "Enable GPU Profiler Stat: ["
<< statName << "] " << info;
hl_profiler_start();
}
}
GpuProfiler::~GpuProfiler() {
if (--g_profileCount == 0) {
hl_profiler_end();
}
}
} // namespace paddle
......@@ -15,19 +15,19 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <string>
#include <sys/time.h>
#include <memory>
#include <iostream>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <list>
#include "Logging.h"
#include "BarrierStat.h"
#include "Locks.h"
#include "Logging.h"
#include "ThreadLocal.h"
#include "BarrierStat.h"
#include "hl_gpu.h"
namespace paddle {
......@@ -283,4 +283,24 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
#endif // DISABLE_TIMER
class GpuProfiler final {
public:
GpuProfiler(std::string statName, std::string info);
~GpuProfiler();
private:
std::lock_guard<std::recursive_mutex> guard_;
};
#ifdef PADDLE_DISABLE_PROFILER
#define REGISTER_GPU_PROFILER(statName, ...)
#else
#define REGISTER_GPU_PROFILER(statName, ...) \
GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
#endif // DISABLE_PROFILER
} // namespace paddle
......@@ -16,7 +16,8 @@ __all__ = [
"TanhActivation", "SigmoidActivation", "SoftmaxActivation",
"IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
"STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation"
"STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
"LogActivation"
]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册