Fix conflict with gpu profiling docs

345c6263 · liaogang · a48f19cf · b47aff69 · 345c6263 · 345c6263
24 changed file
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ addons:
 before_install:
  - |
    if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
+      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
      then
        echo "Only markdown docs were updated, stopping build process."
        exit

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,7 @@ option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
 option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
 option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
 option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
+option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
 option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
 option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
 option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
@@ -115,7 +116,6 @@ else()
    endif(WITH_AVX)
    if(WITH_DSO)
-        set(CUDA_LIBRARIES "")
        add_definitions(-DPADDLE_USE_DSO)
    endif(WITH_DSO)
@@ -135,6 +135,10 @@ if(NOT WITH_TIMER)
    add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
+if(NOT WITH_PROFILER)
+    add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif(NOT WITH_PROFILER)
 if(WITH_AVX)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")

--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -18,7 +18,5 @@ set -x
 # download the dictionary and pretrained model 
 for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
 do 
-  # following is the google drive address
+  wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
-  # you can also directly download from https://pan.baidu.com/s/1o8q577s
-  wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
 done
--- a/demo/model_zoo/resnet/get_model.sh
+++ b/demo/model_zoo/resnet/get_model.sh
@@ -24,9 +24,7 @@ echo "Downloading ResNet models..."
 for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz 
 do 
-  # following is the google drive address
+  wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
-  # you can also directly download from https://pan.baidu.com/s/1o8q577s
-  wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
  tar -xvf $file 
  rm $file
 done

--- a/demo/seqToseq/data/paraphrase_data.sh
+++ b/demo/seqToseq/data/paraphrase_data.sh
@@ -16,9 +16,7 @@ set -e
 set -x
 # download the in-house paraphrase dataset
-# following is the google drive address
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
 # untar the dataset
 tar -zxvf paraphrase.tar.gz

--- a/demo/seqToseq/data/wmt14_model.sh
+++ b/demo/seqToseq/data/wmt14_model.sh
@@ -16,9 +16,7 @@ set -e
 set -x
 # download the pretrained model
-# following is the google drive address
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
 # untar the model
 tar -zxvf wmt14_model.tar.gz

--- a/doc/index.rst
+++ b/doc/index.rst
--- a/doc/introduction/build_and_install/build_from_source.md
+++ b/doc/introduction/build_and_install/build_from_source.md
@@ -95,7 +95,7 @@ As a simple example, consider the following:
    ```bash
    # necessary
    sudo apt-get update
-    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
+    sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
    # optional
    sudo apt-get install libgoogle-glog-dev
    sudo apt-get install libgflags-dev
@@ -149,15 +149,15 @@ If still not found, you can manually set it based on CMake error information fro
 As a simple example, consider the following:
- **Only CPU**
+- **Only CPU with swig**
  ```bash
-  cmake  .. -DWITH_GPU=OFF
+  cmake  .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
  ```
- **GPU**
+- **GPU with swig**
  ```bash
-  cmake .. -DWITH_GPU=ON
+  cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
  ```
 - **GPU with doc and swig**
@@ -170,15 +170,13 @@ Finally, you can build PaddlePaddle:
 ```bash
 # you can add build option here, such as:    
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install>
+cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install> -DWITH_SWIG_PY=ON
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=<path to install>/bin:$PATH
 ```
-**Note:**
 If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
 Otherwise, PaddlePaddle will automatically install python dependencies
 at first time when user run paddle commands, such as `paddle version`, `paddle train`.

--- a/doc/optimization/gpu_profiling.rst
+++ b/doc/optimization/gpu_profiling.rst
+Profiling on PaddlePaddle
+=========================
+This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**.
+- What is profiling?
+- Why we need profiling?
+- How to do profiling?
+- Profiler Tools
+- Hands-on Approach
+- Profiling tips
+What's profiling?
+=================
+In software engineering, profiling is a form of dynamic program analysis that measures the space (memory) or time
+complexity of a program, the usage of particular instructions, or the frequency and duration of function calls.
+Most commonly, profiling information serves to aid program optimization.
+Briefly, a profiler is used to measure application performance. Program analysis tools are extremely important for
+understanding program behavior. Simple profiling can tell you how long an operation takes. Advanced
+profiling can explain why an operation takes a long time.
+Why we need profiling?
+======================
+Since training a deep neural network typically takes a very long time, performance is gradually becoming
+the most important thing in the deep learning field. The first step to improve performance is to understand what parts
+are slow.  There is no point in improving performance of a region which doesn’t take much time!
+How to do profiling?
+====================
+To achieve maximum performance, there are five steps you can take to reach your goals.
+- Profile the code
+- Find the slow parts
+- Work out why they’re slow
+- Make them fast
+- Profile the code again
+Usually, a processor has two key performance limits: floating-point throughput and
+memory throughput. A GPU also needs enough parallelism to fulfill its potential.
+This parallelism is why GPUs can be so fast.
+Profiler Tools
+==============
+For general GPU profiling, a number of tools are provided by both NVIDIA and third parties.
+**nvprof** is the NVIDIA command-line profiler and **nvvp** is the (GUI-based) NVIDIA Visual Profiler.
+In this tutorial, we will focus on nvprof and nvvp.
+:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
+above profilers. 
+.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 111-124
+   :linenos:
+The above code snippet includes two methods, you can use any of them to profile the regions of interest.
+1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both cpu functions and cuda kernels.
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+You can find more details about how to use both of them in the next section.
+Hands-on Approach
+=================
+Built-in Timer
+--------------
+To enable the built-in timer in PaddlePaddle, first add :code:`REGISTER_TIMER_INFO` to the regions of interest.
+Then, all information could be stamped in the console via :code:`printStatus` or :code:`printAllStatus` function.
+As a simple example, consider the following:
+1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
+    .. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 111-124
+        :emphasize-lines: 8-10,13
+        :linenos:
+2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
+    .. code-block:: bash
+        cmake .. -DWITH_TIMER=ON
+        make
+3. Execute your code and observe the results (see the emphasize-lines). 
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+        > ./paddle/math/tests/test_GpuProfiler                                                                             
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler                                             
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions                                                                      
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.                                                                   
+        [==========] Running 1 test from 1 test case.                                                                                                
+        [----------] Global test environment set-up.                                                                                                 
+        [----------] 1 test from Profiler                                                                                                            
+        [ RUN      ] Profiler.BilinearFwdBwd                                                                                                         
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"                                                                                                                  
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751                                           
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======                                               
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1                                                                                                                                  
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======                                                          
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------                                            
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)                                                                                                
+        [----------] 1 test from Profiler (136 ms total)                                                                                             
+        [----------] Global test environment tear-down                                                                                               
+        [==========] 1 test from 1 test case ran. (136 ms total)                                                                                     
+        [  PASSED  ] 1 test.
+nvprof profiler
+---------------
+To use the command-line profiler **nvprof**, follow these steps:
+1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines).
+    .. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 111-124
+        :emphasize-lines: 6-7
+        :linenos:
+2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle.
+    .. code-block:: bash
+        cmake .. -DWITH_PROFILER=ON
+        make
+3. Use Nvidia profiler **nvprof** to profile the binary.
+    .. code-block:: bash
+        nvprof  ./paddle/math/tests/test_GpuProfiler
+Then, you can get the following profiling result:
+.. code-block:: bash
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler                                                                                                      
+    ==78544== Profiling result:                                                                                                                                                
+    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]                                                                                              
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw                                                                                            
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw                                                                                        
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]                                                                                              
+    ==78544== API calls:                                                                                                                                                       
+    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags                                                                                       
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree                                                                                                        
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate                                                                                                
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy                                                                                                      
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize                                                                                           
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc                                                                                                   
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc                                                                                                      
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice                                                                                                   
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags                                                                                        
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute                                                                                            
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount                                                                                              
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties                                                                                         
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch                                                                                                      
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName                                                                                                 
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem                                                                                                
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice                                                                                                   
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate                                                                                                 
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute                                                                                          
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart                                                                                               
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall                                                                                               
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError                                                                                                
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument                                                                                               
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet                                                                                                     
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount                                                                                                
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion                                                                                              
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit                                                                                                          
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+nvvp profiler
+-------------
+For the visual profiler **nvvp**, you can either import the output of :code:`nvprof -o ...` or
+run the application through the GUI.
+**Note: nvvp also supports CPU profiling** (Click the box in nvvp to enable profile execution on CPU).
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+From the perspective of kernel functions, **nvvp** can even illustrate why an operation takes a long time.
+As shown in the following figure, kernel's block usage, register usage and shared memory usage from :code:`nvvp`
+allow us to fully utilize all warps on the GPU.
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+From the perspective of application, **nvvp** can give you some suggestions to address performance bottleneck.
+For instance, some advice in data movement and compute utilization from the below figure can guide you to tune performance.
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+Profiling tips
+==============
+- The **nvprof** and **nvvp** output is a very good place to start.
+- The timeline is a good place to go next.
+- Only dig deep into a kernel if it’s taking a significant amount of your time.
+- Where possible, try to match profiler output with theory.
+    1) For example, if I know I’m moving 1GB, and my kernel takes 10ms, I expect the profiler to report 100GB/s.
+    2) Discrepancies are likely to mean your application isn’t doing what you thought it was.
+- Know your hardware: If your GPU can do 6 TFLOPs, and you’re already doing 5.5 TFLOPs, you won’t go much faster!
+Profiling is a key step in optimization. Sometimes quite simple changes can lead to big improvements in performance.
+Your mileage may vary!
+Reference
+=========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
--- a/doc/optimization/index.rst
+++ b/doc/optimization/index.rst
+Performance Tuning
+==================
+.. toctree::
+  :maxdepth: 3
+  gpu_profiling.rst
--- a/doc/optimization/nvvp1.png
+++ b/doc/optimization/nvvp1.png
--- a/doc/optimization/nvvp2.png
+++ b/doc/optimization/nvvp2.png
--- a/doc/optimization/nvvp3.png
+++ b/doc/optimization/nvvp3.png
--- a/doc/optimization/nvvp4.png
+++ b/doc/optimization/nvvp4.png
--- a/doc_cn/algorithm/rnn/hierarchical-layer.md
+++ b/doc_cn/algorithm/rnn/hierarchical-layer.md
-# 支持双层序列作为输入的Layer
+###########################
+支持双层序列作为输入的Layer
+###########################
-## 概述
+..	contents::
+概述
+====
 在自然语言处理任务中，序列是一种常见的数据类型。一个独立的词语，可以看作是一个非序列输入，或者，我们称之为一个0层的序列；由词语构成的句子，是一个单层序列；若干个句子构成一个段落，是一个双层的序列。
@@ -12,55 +17,79 @@
 + 单层序列：排成一列的多个元素，每个元素是一个0层序列，元素之间的顺序是重要的输入信息
 + 双层序列：排成一列的多个元素，每个元素是一个单层序列，称之为双层序列的一个子序列（subseq），subseq的每个元素是一个0层序列
 在 PaddlePaddle中，下面这些Layer能够接受双层序列作为输入，完成相应的计算。
-## pooling_layer
-pooling_layer的使用示例如下，详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer">配置API</a>。
+pooling_layer
-```python
+==============
-seq_pool = pooling_layer(input=layer,
+pooling_layer 的使用示例如下，详细见 `pooling_layer`_ 配置API。
+..	code-block:: python
+        seq_pool = pooling_layer(input=layer,
                                 pooling_type=AvgPooling(),
                                 agg_level=AggregateLevel.EACH_SEQUENCE)
-```
 - `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
- `agg_level=AggregateLevel.TIMESTEP`时（默认值）：
+- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
  - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
  - 输入：一个双层序列，或一个单层序列
  - 输出：一个0层序列，即整个输入序列（单层或双层）的平均值（或最大值）
- `agg_level=AggregateLevel.EACH_SEQUENCE`时：
+- `agg_level=AggregateLevel.EACH_SEQUENCE` 时：
  - 作用：一个双层序列经过运算变成一个单层序列
  - 输入：必须是一个双层序列
  - 输出：一个单层序列，序列的每个元素是原来双层序列每个subseq元素的平均值（或最大值）
-## last_seq 和 first_seq
+last_seq 和 first_seq
+=====================
+last_seq 的使用示例如下（ `first_seq`_ 类似），详细见 `last_seq`_ 配置API。
+..	code-block:: python
-last_seq的使用示例如下（first_seq类似），详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq">配置API</a>。
+        last = last_seq(input=layer,
-```python
-last = last_seq(input=layer,
                        agg_level=AggregateLevel.EACH_SEQUENCE)
-```
- `agg_level=AggregateLevel.TIMESTEP`时（默认值）：
+- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
  - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
  - 输入：一个双层序列或一个单层序列
  - 输出：一个0层序列，即整个输入序列（双层或者单层）最后一个，或第一个元素。
- `agg_level=AggregateLevel.EACH_SEQUENCE`时：
+- `agg_level=AggregateLevel.EACH_SEQUENCE` 时：
  - 作用：一个双层序列经过运算变成一个单层序列
  - 输入：必须是一个双层序列
  - 输出：一个单层序列，其中每个元素是双层序列中每个subseq最后一个（或第一个）元素。
-## expand_layer
+expand_layer
+============
-expand_layer的使用示例如下，详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer">配置API</a>。
+expand_layer 的使用示例如下，详细见 `expand_layer`_ 配置API。
-```python
-expand = expand_layer(input=layer1,
+..	code-block:: python
+        expand = expand_layer(input=layer1,
                              expand_as=layer2,
                              expand_level=ExpandLevel.FROM_TIMESTEP)
-```
- `expand_level=ExpandLevel.FROM_TIMESTEP`时（默认值）：
+- `expand_level=ExpandLevel.FROM_TIMESTEP` 时（默认值）：
  - 作用：一个0层序列经过运算扩展成一个单层序列，或者一个双层序列
-  - 输入：layer1必须是一个0层序列，是待扩展的数据；layer2可以是一个单层序列，或者是一个双层序列，提供扩展的长度信息
+  - 输入：layer1必须是一个0层序列，是待扩展的数据；layer2 可以是一个单层序列，或者是一个双层序列，提供扩展的长度信息
-  - 输出：一个单层序列，或一个双层序列，输出序列的类型（双层序列，或单层序列）和序列中含有元素的数目同 layer2一致。若输出是单层序列，单层序列的每个元素（0层序列），都是对layer1元素的拷贝；若输出是双层序列，双层序列每个subseq中每个元素（0层序列），都是对layer1元素的拷贝
+  - 输出：一个单层序列或一个双层序列，输出序列的类型（双层序列或单层序列）和序列中含有元素的数目同 layer2 一致。若输出是单层序列，单层序列的每个元素（0层序列），都是对layer1元素的拷贝；若输出是双层序列，双层序列每个subseq中每个元素（0层序列），都是对layer1元素的拷贝
- `expand_level=ExpandLevel.FROM_SEQUENCE`时：
+- `expand_level=ExpandLevel.FROM_SEQUENCE` 时：
  - 作用：一个单层序列经过运算扩展成一个双层序列
-  - 输入：layer1必须是一个单层序列，是待扩展的数据；layer2必须是一个双层序列，提供扩展的长度信息
+  - 输入：layer1必须是一个单层序列，是待扩展的数据；layer2 必须是一个双层序列，提供扩展的长度信息
-  - 输出：一个双层序列，序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目（0层序列），和双层序列含有subseq 的数目一致。单层序列第i个元素（0层序列），被扩展为一个单层序列，构成了输出双层序列的第i个subseq。
+  - 输出：一个双层序列，序列中含有元素的数目同 layer2 一致。要求单层序列含有元素的数目（0层序列）和双层序列含有subseq 的数目一致。单层序列第i个元素（0层序列），被扩展为一个单层序列，构成了输出双层序列的第i个 subseq 。
+.. _pooling_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer
+.. _last_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq
+.. _first_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#first-seq
+.. _expand_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer
--- a/doc_cn/build_and_install/install/ubuntu_install.rst
+++ b/doc_cn/build_and_install/install/ubuntu_install.rst
@@ -11,7 +11,7 @@ PaddlePaddle的ubuntu安装包分为四个版本，他们是 cpu、gpu、cpu-noa
 ..  code-block:: shell
-    gdebi paddle-*-cpu.deb
+    gdebi paddle-*-cpu*.deb
 如果 :code:`gdebi` 没有安装,则需要使用 :code:`sudo apt-get install gdebi`, 来安装 :code:`gdebi` 。
@@ -20,7 +20,7 @@ PaddlePaddle的ubuntu安装包分为四个版本，他们是 cpu、gpu、cpu-noa
 ..  code-block:: shell
-    dpkg -i paddle-*-cpu.deb
+    dpkg -i paddle-*-cpu*.deb
    apt-get install -f
 在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的，

--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #ifndef HL_CUDA_H_
 #define HL_CUDA_H_
-#include "hl_base.h"
 #include <string>
+#include "hl_base.h"
 /**
 * @brief   HPPL event.
@@ -332,4 +332,14 @@ extern bool hl_cuda_event_is_ready(hl_event_t event);
 */
 extern void hl_device_synchronize();
+/**
+ * @brief   Start GPU profiling (see cudaProfilerStart).
+ */
+extern void hl_profiler_start();
+/**
+ * @brief   Stop GPU profiling (see cudaProfilerStop).
+ */
+extern void hl_profiler_end();
 #endif  // HL_CUDA_H_
--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
@@ -90,4 +90,8 @@ inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
 inline void hl_device_synchronize() {}
+inline void hl_profiler_start() {}  // stub: no-op
+inline void hl_profiler_end() {}  // stub: no-op
 #endif  // HL_CUDA_STUB_H_
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <sys/time.h>
+#include <cuda_profiler_api.h>
 #include <string.h>
-#include <unistd.h>
 #include <sys/syscall.h>
+#include <sys/time.h>
+#include <unistd.h>
 #include <mutex>
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_thread.ph"
 #include "hl_dso_loader.h"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
 namespace dynload {
@@ -133,7 +134,9 @@ void *cudart_dso_handle = nullptr;
  __macro(cudaGetLastError)               \
  __macro(cudaFuncSetCacheConfig)         \
  __macro(cudaRuntimeGetVersion)          \
-  __macro(cudaGetErrorString)
+  __macro(cudaGetErrorString)             \
+  __macro(cudaProfilerStart)              \
+  __macro(cudaProfilerStop)
 // clang-format on
 CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
@@ -742,3 +745,7 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
  }
  return true;
 }
+// Starts CUDA profiling via the dynload:: wrapper of cudaProfilerStart; the
+// returned status is passed to CHECK_CUDA.
+void hl_profiler_start() {
+  CHECK_CUDA(dynload::cudaProfilerStart());
+}
+// Stops CUDA profiling via the dynload:: wrapper of cudaProfilerStop; the
+// returned status is passed to CHECK_CUDA.
+void hl_profiler_end() {
+  CHECK_CUDA(dynload::cudaProfilerStop());
+}
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -14,3 +14,4 @@ add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
 add_simple_unittest(test_FPException)
+add_simple_unittest(test_GpuProfiler)
\ No newline at end of file
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef PADDLE_ONLY_CPU
+#include "paddle/utils/Util.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include <gtest/gtest.h>
+#include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/utils/Stat.h"
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+// Expects matrix1 == matrix2 element-wise. An element is a mismatch only when
+// |a - b| > err and |a - b| / |a| > err / 10 (a taken from matrix1).
+void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK(matrix1.getHeight() == matrix2.getHeight());
+  CHECK(matrix1.getWidth() == matrix2.getWidth());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = matrix1.getData();
+  const real* data2 = matrix2.getData();
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      real a = data1[i * width + j];
+      real diff = fabs(a - data2[i * width + j]);  // fabs: real may be double
+      if (diff > err && (diff / fabs(a)) > (err / 10.0f)) {
+        count++;
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
+                        int channels) {
+  int inWidth = imgSizeH * imgSizeW * channels;
+  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
+  real ratioH = 0.5;
+  real ratioW = 0.5;
+  // forward: same random input through the CPU and the GPU implementation
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  input->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  {
+    // nvprof: GPU Profiler (this scope covers the forward pass only)
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    target->bilinearForward(*input, imgSizeH, imgSizeW,
+        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+    targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
+        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+  }
+  // check: copy the GPU result into a CpuMatrix and compare with the CPU one
+  targetCheck->copyFrom(*targetGpu);
+  MatrixCheckErr(*target, *targetCheck);
+  // backward: same random gradients through the CPU and GPU implementation
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false,
+                                              true);
+  MatrixPtr targetCheckGrad =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+  inputGrad->bilinearBackward(*targetGrad, 2 * imgSizeH, 2 * imgSizeW,
+      imgSizeH, imgSizeW, channels, ratioH, ratioW);
+  inputGpuGrad->bilinearBackward(*targetGpuGrad, 2 * imgSizeH, 2 * imgSizeW,
+      imgSizeH, imgSizeW, channels, ratioH, ratioW);
+  // check: copy the GPU gradient into a CpuMatrix and compare with the CPU one
+  targetCheckGrad->copyFrom(*inputGpuGrad);
+  MatrixCheckErr(*inputGrad, *targetCheckGrad);
+}
+TEST(Profiler, testBilinearFwdBwd) {
+  auto numSamples = 10;
+  auto channels = 16;
+  auto imgSize = 64;
+  {
+    // nvprof: GPU Profiler
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    // Paddle built-in timer
+    REGISTER_TIMER_INFO("testBilinearFwdBwd",
+      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+    testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
+  }
+  globalStat.printAllStatus();
+}
+// Test entry point. When the profiler is compiled in, the whole gtest run sits
+// inside one outer GpuProfiler scope, so the scopes opened by the test itself
+// are nested ones.
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  // nvprof: GPU Profiler
+  REGISTER_GPU_PROFILER("RecursiveProfilingTest",
+    "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+  const int testStatus = RUN_ALL_TESTS();
+  return testStatus;
+}
+#endif /* PADDLE_ONLY_CPU */
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
@@ -65,6 +65,7 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
  auto showStat = [&](const StatInfo* info, pid_t tid, bool isFirst = true) {
    uint64_t average = 0;
    if (info->count_ > 0) {
+      outPut << std::setfill(' ') << std::left;
      if (!isFirst) {
        outPut << std::setw(42) << " ";
      }
@@ -202,4 +203,22 @@ StatInfo::~StatInfo() {
  }
 }
+static unsigned g_profileCount = 0;  // number of GpuProfiler objects currently alive
+static std::recursive_mutex g_profileMutex;  // locked by every GpuProfiler via its guard_ member
+GpuProfiler::GpuProfiler(std::string statName, std::string info)
+  : guard_(g_profileMutex)  {  // the mutex is locked before the body runs and stays locked while *this lives
+  if (++g_profileCount == 1) {  // only the first (outermost) live profiler starts profiling
+    LOG(INFO) << "Enable GPU Profiler Stat: ["
+              << statName << "] " << info;
+    hl_profiler_start();
+  }
+}
+GpuProfiler::~GpuProfiler() {
+  if (--g_profileCount == 0) {  // the last live profiler is going away: stop profiling
+    hl_profiler_end();
+  }
+}  // guard_ is destroyed after this body, which unlocks g_profileMutex
 }  // namespace paddle
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
@@ -15,19 +15,19 @@ limitations under the License. */
 #pragma once
 #include <stdint.h>
-#include <string>
 #include <sys/time.h>
-#include <memory>
 #include <iostream>
+#include <list>
+#include <memory>
 #include <mutex>
+#include <string>
 #include <unordered_map>
-#include <list>
-#include "Logging.h"
 #include "BarrierStat.h"
 #include "Locks.h"
+#include "Logging.h"
 #include "ThreadLocal.h"
-#include "BarrierStat.h"
+#include "hl_gpu.h"
 namespace paddle {
@@ -283,4 +283,24 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
 #endif  // DISABLE_TIMER
+class GpuProfiler final {  // scoped GPU profiling helper; see REGISTER_GPU_PROFILER
+public:
+  GpuProfiler(std::string statName, std::string info);  // statName and info are only written to the log
+  ~GpuProfiler();
+private:
+  std::lock_guard<std::recursive_mutex> guard_;  // held for the object's lifetime; also makes the class non-copyable
+};
+#ifdef PADDLE_DISABLE_PROFILER
+#define REGISTER_GPU_PROFILER(statName, ...)  // profiling disabled: expands to nothing
+#else
+#define REGISTER_GPU_PROFILER(statName, ...) \
+  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);  // extra arguments are stringized into the info text
+#endif  // PADDLE_DISABLE_PROFILER
 }  // namespace paddle
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -16,7 +16,8 @@ __all__ = [
    "TanhActivation", "SigmoidActivation", "SoftmaxActivation",
    "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
    'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
-    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation"
+    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
+    "LogActivation"
 ]