diff --git a/.travis.yml b/.travis.yml
index ffe3bc193b49eb3b3318cbbc7f1c3d86dc205c14..effcf90769647960d55b971af0939496dc850e7a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ addons:
 before_install:
   - |
     if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
+      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
       then
         echo "Only markdown docs were updated, stopping build process."
         exit
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 090ac9e188422099cc4270b87064b5590e7b620c..af193c27ae7d802a8724fdc1e23b4b5b583e9f7c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,7 @@ option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
 option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
 option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
 option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
+option(WITH_PROFILER "Compile PaddlePaddle with gpu profiler" OFF)
 option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
 option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
 option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
@@ -115,7 +116,6 @@ else()
 endif(WITH_AVX)
 
 if(WITH_DSO)
-    set(CUDA_LIBRARIES "")
     add_definitions(-DPADDLE_USE_DSO)
 endif(WITH_DSO)
 
@@ -135,6 +135,10 @@ if(NOT WITH_TIMER)
     add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
 
+if(NOT WITH_PROFILER)
+    add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif(NOT WITH_PROFILER)
+
 if(WITH_AVX)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/demo/model_zoo/embedding/pre_DictAndModel.sh
index 7821850fb25cc5b87aa305c2113efbf50b093ed1..6d647f5dd9368eaf81c19386511c7d231e4799e3 100755
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -18,7 +18,5 @@ set -x
 # download the dictionary and pretrained model
 for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
 do
-    # following is the google drive address
-    # you can also directly download from https://pan.baidu.com/s/1o8q577s
-    wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
+    wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
 done
diff --git a/demo/model_zoo/resnet/get_model.sh b/demo/model_zoo/resnet/get_model.sh
index 89312d43edf8e4e7d639be73d5b3983ea916b902..133d08fca431540f2ed5cd6e63b51d9ce3a1b344 100755
--- a/demo/model_zoo/resnet/get_model.sh
+++ b/demo/model_zoo/resnet/get_model.sh
@@ -24,9 +24,7 @@ echo "Downloading ResNet models..."
 for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz
 do
-    # following is the google drive address
-    # you can also directly download from https://pan.baidu.com/s/1o8q577s
-    wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
+    wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
     tar -xvf $file
     rm $file
 done
diff --git a/demo/seqToseq/data/paraphrase_data.sh b/demo/seqToseq/data/paraphrase_data.sh
index ea1f8dbcfad35699189f6cd4efc81d97e8c89148..1b3f1d45e11fbd5e600e58f583e503a603e484ff 100755
--- a/demo/seqToseq/data/paraphrase_data.sh
+++ b/demo/seqToseq/data/paraphrase_data.sh
@@ -16,9 +16,7 @@ set -e
 set -x
 
 # download the in-house paraphrase dataset
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz
 
 # untar the dataset
 tar -zxvf paraphrase.tar.gz
diff --git a/demo/seqToseq/data/wmt14_model.sh b/demo/seqToseq/data/wmt14_model.sh
index 2cec30688d27a57902cdf64d7be5712d12c69bdd..d6e7a732644dc188a165215ddf3f69e1514425eb 100755
--- a/demo/seqToseq/data/wmt14_model.sh
+++ b/demo/seqToseq/data/wmt14_model.sh
@@ -16,9 +16,7 @@ set -e
 set -x
 
 # download the pretrained model
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
 
 # untar the model
 tar -zxvf wmt14_model.tar.gz
diff --git a/doc/index.rst b/doc/index.rst
index a7feed52395ed39c12fd1ddd659a6271729ccaac..36a410881ad8f03b628f34a25350ab74a3e1667f 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -9,4 +9,4 @@ PaddlePaddle Documentation
   howto/index.rst
   api/index.rst
   about/index.rst
-
+
\ No newline at end of file
diff --git a/doc/introduction/build_and_install/build_from_source.md b/doc/introduction/build_and_install/build_from_source.md
index e44fa0d38e9982e5d0ed159743994ce6acc51246..b932fbc0fa4443d2fd8abfc9d8a78e68c44f667c 100644
--- a/doc/introduction/build_and_install/build_from_source.md
+++ b/doc/introduction/build_and_install/build_from_source.md
@@ -95,7 +95,7 @@ As a simple example, consider the following:
 ```bash
 # necessary
 sudo apt-get update
-sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
+sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
 # optional
 sudo apt-get install libgoogle-glog-dev
 sudo apt-get install libgflags-dev
@@ -149,15 +149,15 @@ If still not found, you can manually set it based on CMake error information fro
 
 As a simple example, consider the following:
 
-- **Only CPU**
+- **Only CPU with swig**
 
   ```bash
-  cmake .. -DWITH_GPU=OFF
+  cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
   ```
 
-- **GPU**
+- **GPU with swig**
 
   ```bash
-  cmake .. -DWITH_GPU=ON
+  cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
   ```
 
 - **GPU with doc and swig**
 
   ```bash
@@ -170,15 +170,13 @@ Finally, you can build PaddlePaddle:
 
 ```bash
 # you can add build option here, such as:
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=
+cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX= -DWITH_SWIG_PY=ON
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=/bin:$PATH
 ```
-**Note:**
-
 If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed. Otherwise, PaddlePaddle will automatically install python dependencies the first time a user runs paddle commands, such as `paddle version`, `paddle train`.
diff --git a/doc/optimization/gpu_profiling.rst b/doc/optimization/gpu_profiling.rst
new file mode 100644
index 0000000000000000000000000000000000000000..667bf1364e7cd4c9098caba72a127228d78ca38b
--- /dev/null
+++ b/doc/optimization/gpu_profiling.rst
@@ -0,0 +1,237 @@
+Profiling on PaddlePaddle
+=========================
+
+This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using the built-in timer, **nvprof** and **nvvp**.
+
+- What is profiling?
+- Why do we need profiling?
+- How to do profiling?
+- Profiler tools
+- Hands-on approach
+- Profiling tips
+
+What's profiling?
+=================
+In software engineering, profiling is a form of dynamic program analysis that measures the space (memory) or time
+complexity of a program, the usage of particular instructions, or the frequency and duration of function calls.
+Most commonly, profiling information serves to aid program optimization.
+
+Briefly, a profiler is used to measure application performance. Program analysis tools are extremely important for
+understanding program behavior. Simple profiling can tell you how long an operation takes; advanced
+profiling can explain why an operation takes a long time.
+
+Why do we need profiling?
+=========================
+Since training a deep neural network typically takes a very long time, performance has gradually become
+the most important concern in the deep learning field. The first step to improve performance is to understand which parts
+are slow. There is no point in improving the performance of a region which doesn’t take much time!
+
+
+How to do profiling?
+====================
+To achieve maximum performance, there are five steps you can take to reach your goals.
+
+- Profile the code
+- Find the slow parts
+- Work out why they’re slow
+- Make them fast
+- Profile the code again
+
+Usually, a processor has two key performance limits: floating point throughput and
+memory throughput. A GPU additionally needs enough parallelism to fulfill its potential,
+which is why GPUs can be so fast.
+
+Profiler Tools
+==============
+For general GPU profiling, a number of tools are provided by both NVIDIA and third parties.
+
+**nvprof** is the Nvidia command-line profiler and **nvvp** is the GUI-based Nvidia visual profiler.
+In this tutorial, we will focus on nvprof and nvvp.
+
+:code:`test_GpuProfiler` from the :code:`paddle/math/tests` directory will be used to evaluate
+the above profilers.
+
+.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 111-124
+   :linenos:
+
+The above code snippet includes two methods; you can use either of them to profile the regions of interest.
+
+1. :code:`REGISTER_TIMER_INFO` is a built-in timer wrapper which can calculate the time overhead of both CPU functions and CUDA kernels.
+
+2. :code:`REGISTER_GPU_PROFILER` is a general-purpose wrapper around :code:`cudaProfilerStart` and :code:`cudaProfilerStop` that avoids
+program crashes when the CPU version of PaddlePaddle invokes them.
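+
+Conceptually, the second wrapper is just a small RAII guard around the profiler API. A minimal sketch
+(simplified from the :code:`GpuProfiler` class this change adds to :code:`paddle/utils/Stat.h`; the real
+class also takes a name and an info string and reference-counts nested scopes) looks like:
+
+.. code-block:: c++
+
+    // Start the CUDA profiler on construction, stop it on destruction.
+    // hl_profiler_start()/hl_profiler_end() call cudaProfilerStart()/cudaProfilerStop()
+    // in GPU builds and are no-op stubs in CPU-only builds, so the guard is always safe.
+    class GpuProfilerSketch {
+    public:
+      GpuProfilerSketch() { hl_profiler_start(); }
+      ~GpuProfilerSketch() { hl_profiler_end(); }
+    };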
+
+You can find more details about how to use both of them in the next section.
+
+Hands-on Approach
+=================
+
+Built-in Timer
+--------------
+
+To enable the built-in timer in PaddlePaddle, first add :code:`REGISTER_TIMER_INFO` into the regions of your interest.
+Then, all timing information can be printed to the console via the :code:`printStatus` or :code:`printAllStatus` function.
+As a simple example, consider the following:
+
+1. Add the :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasized lines).
+
+   .. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+      :language: c++
+      :lines: 111-124
+      :emphasize-lines: 8-10,13
+      :linenos:
+
+2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
+
+   .. code-block:: bash
+
+      cmake .. -DWITH_TIMER=ON
+      make
+
+3. Execute your code and observe the results (see the emphasized lines).
+
+   .. code-block:: bash
+      :emphasize-lines: 1,12-15
+
+      > ./paddle/math/tests/test_GpuProfiler
+      I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+      I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+      I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+      [==========] Running 1 test from 1 test case.
+      [----------] Global test environment set-up.
+      [----------] 1 test from Profiler
+      [ RUN ] Profiler.BilinearFwdBwd
+      I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+      gSizeX = 64, imgSizeY = 64"
+      I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+      I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+      I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd total=136.141 avg=136.141 max=136.141 min=136.141 count=1
+      I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+      I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+      [ OK ] Profiler.BilinearFwdBwd (136 ms)
+      [----------] 1 test from Profiler (136 ms total)
+
+      [----------] Global test environment tear-down
+      [==========] 1 test from 1 test case ran. (136 ms total)
+      [ PASSED ] 1 test.
+
+nvprof profiler
+---------------
+
+To use the command-line profiler **nvprof**, take the following steps:
+
+1. Add the :code:`REGISTER_GPU_PROFILER` function (see the emphasized lines).
+
+   .. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+      :language: c++
+      :lines: 111-124
+      :emphasize-lines: 6-7
+      :linenos:
+
+2. Configure cmake with **WITH_PROFILER** and recompile PaddlePaddle.
+
+   .. code-block:: bash
+
+      cmake .. -DWITH_PROFILER=ON
+      make
+
+3. Use the Nvidia profiler **nvprof** to profile the binary.
+
+   .. code-block:: bash
+
+      nvprof ./paddle/math/tests/test_GpuProfiler
+
+Then, you will get the following profiling result:
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%) Time Calls Avg Min Max Name
+    27.60% 9.6305ms 5 1.9261ms 3.4560us 6.4035ms [CUDA memcpy HtoD]
+    26.07% 9.0957ms 1 9.0957ms 9.0957ms 9.0957ms KeBilinearInterpBw
+    23.78% 8.2977ms 1 8.2977ms 8.2977ms 8.2977ms KeBilinearInterpFw
+    22.55% 7.8661ms 2 3.9330ms 1.5798ms 6.2863ms [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%) Time Calls Avg Min Max Name
+    46.85% 682.28ms 8 85.285ms 12.639us 682.03ms cudaStreamCreateWithFlags
+    39.83% 580.00ms 4 145.00ms 302ns 550.27ms cudaFree
+    9.82% 143.03ms 9 15.892ms 8.7090us 142.78ms cudaStreamCreate
+    1.23% 17.983ms 7 2.5690ms 23.210us 6.4563ms cudaMemcpy
+    1.23% 17.849ms 2 8.9247ms 8.4726ms 9.3768ms cudaStreamSynchronize
+    0.66% 9.5969ms 7 1.3710ms 288.43us 2.4279ms cudaHostAlloc
+    0.13% 1.9530ms 11 177.54us 7.6810us 591.06us cudaMalloc
+    0.07% 1.0424ms 8 130.30us 1.6970us 453.72us cudaGetDevice
+    0.04% 527.90us 40 13.197us 525ns 253.99us cudaEventCreateWithFlags
+    0.03% 435.73us 348 1.2520us 124ns 42.704us cuDeviceGetAttribute
+    0.03% 419.36us 1 419.36us 419.36us 419.36us cudaGetDeviceCount
+    0.02% 260.75us 2 130.38us 129.32us 131.43us cudaGetDeviceProperties
+    0.02% 222.32us 2 111.16us 106.94us 115.39us cudaLaunch
+    0.01% 214.06us 4 53.514us 28.586us 77.655us cuDeviceGetName
+    0.01% 115.45us 4 28.861us 9.8250us 44.526us cuDeviceTotalMem
+    0.01% 83.988us 4 20.997us 578ns 77.760us cudaSetDevice
+    0.00% 38.918us 1 38.918us 38.918us 38.918us cudaEventCreate
+    0.00% 34.573us 31 1.1150us 279ns 12.784us cudaDeviceGetAttribute
+    0.00% 17.767us 1 17.767us 17.767us 17.767us cudaProfilerStart
+    0.00% 15.228us 2 7.6140us 3.5460us 11.682us cudaConfigureCall
+    0.00% 14.536us 2 7.2680us 1.1490us 13.387us cudaGetLastError
+    0.00% 8.6080us 26 331ns 173ns 783ns cudaSetupArgument
+    0.00% 5.5470us 6 924ns 215ns 2.6780us cuDeviceGet
+    0.00% 5.4090us 6 901ns 328ns 3.3320us cuDeviceGetCount
+    0.00% 4.1770us 3 1.3920us 1.0630us 1.8300us cuDriverGetVersion
+    0.00% 3.4650us 3 1.1550us 1.0810us 1.2680us cuInit
+    0.00% 830ns 1 830ns 830ns 830ns cudaRuntimeGetVersion
+
+
+nvvp profiler
+-------------
+
+For the visual profiler **nvvp**, you can either import the output of :code:`nvprof -o ...` or
+run the application through the GUI.
+
+**Note: nvvp also supports CPU profiling** (Click the box in nvvp to enable profile execution on CPU).
+
+.. image:: nvvp1.png
+   :align: center
+   :scale: 33%
+
+From the perspective of kernel functions, **nvvp** can even illustrate why an operation takes a long time.
+As shown in the following figure, the kernel's block usage, register usage and shared memory usage from :code:`nvvp`
+help us check whether all warps on the GPU are fully utilized.
+
+.. image:: nvvp2.png
+   :align: center
+   :scale: 33%
+
+From the perspective of the application, **nvvp** can give you suggestions for addressing performance bottlenecks.
+For instance, the advice on data movement and compute utilization in the figures below can guide you in tuning performance.
+
+.. image:: nvvp3.png
+   :align: center
+   :scale: 33%
+
+.. image:: nvvp4.png
+   :align: center
+   :scale: 33%
+
+Profiling tips
+==============
+
+- The **nvprof** and **nvvp** output is a very good place to start.
+- The timeline is a good place to go next.
+- Only dig deep into a kernel if it’s taking a significant amount of your time.
+- Where possible, try to match profiler output with theory.
+  1) For example, if I know I’m moving 1GB, and my kernel takes 10ms, I expect the profiler to report 100GB/s.
+  2) Discrepancies are likely to mean your application isn’t doing what you thought it was.
+- Know your hardware: If your GPU can do 6 TFLOPs, and you’re already doing 5.5 TFLOPs, you won’t go much faster!
+
+
+Profiling is a key step in optimization. Sometimes quite simple changes can lead to big improvements in performance.
+Your mileage may vary!
+
+Reference
+=========
+Jeremy Appleyard, `GPU Profiling for Deep Learning `_, 2015
diff --git a/doc/optimization/index.rst b/doc/optimization/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c9e87e0778dfe44fa3d1bb84d0ad340aa6f25d08
--- /dev/null
+++ b/doc/optimization/index.rst
@@ -0,0 +1,7 @@
+Performance Tuning
+==================
+
+.. toctree::
+  :maxdepth: 3
+
+  gpu_profiling.rst
diff --git a/doc/optimization/nvvp1.png b/doc/optimization/nvvp1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77
Binary files /dev/null and b/doc/optimization/nvvp1.png differ
diff --git a/doc/optimization/nvvp2.png b/doc/optimization/nvvp2.png
new file mode 100644
index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29
Binary files /dev/null and b/doc/optimization/nvvp2.png differ
diff --git a/doc/optimization/nvvp3.png b/doc/optimization/nvvp3.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db
Binary files /dev/null and b/doc/optimization/nvvp3.png differ
diff --git a/doc/optimization/nvvp4.png b/doc/optimization/nvvp4.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01
Binary files /dev/null and b/doc/optimization/nvvp4.png differ
diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.md b/doc_cn/algorithm/rnn/hierarchical-layer.rst
similarity index 50%
rename from doc_cn/algorithm/rnn/hierarchical-layer.md
rename to doc_cn/algorithm/rnn/hierarchical-layer.rst
index 519653df081d6e7919ada3cbff6aaf4d2a2f6115..a9906b8b9c2036ae349f30d7edee770884f73f99 100644
--- a/doc_cn/algorithm/rnn/hierarchical-layer.md
+++ b/doc_cn/algorithm/rnn/hierarchical-layer.rst
@@ -1,6 +1,11 @@
-# Layers that support double-level sequences as input
+####################################################
+Layers that support double-level sequences as input
+####################################################
 
-## Overview
+.. contents::
+
+Overview
+========
 
 In natural language processing tasks, sequences are a common data type. An independent word can be regarded as a non-sequence input, or, as we call it, a 0-level sequence; a sentence made up of words is a single-level sequence; several sentences forming a paragraph is a double-level sequence.
 
@@ -12,55 +17,79 @@
 + Single-level sequence: multiple elements arranged in a row, where each element is a 0-level sequence; the order of the elements is important input information
 + Double-level sequence: multiple elements arranged in a row, where each element is a single-level sequence, called a subsequence (subseq) of the double-level sequence; each element of a subseq is a 0-level sequence
 
- In PaddlePaddle, the following layers can accept a double-level sequence as input and perform the corresponding computation.
-## pooling_layer
-
-An example of using pooling_layer is shown below; see the configuration API for details.
-```python
-seq_pool = pooling_layer(input=layer,
-                         pooling_type=AvgPooling(),
-                         agg_level=AggregateLevel.EACH_SEQUENCE)
-```
+
+pooling_layer
+==============
+
+An example of using pooling_layer is shown below; see the `pooling_layer`_ configuration API for details.
+
+.. code-block:: python
+
+    seq_pool = pooling_layer(input=layer,
+                             pooling_type=AvgPooling(),
+                             agg_level=AggregateLevel.EACH_SEQUENCE)
+
 
- `pooling_type` currently supports two types, namely MaxPooling() and AvgPooling().
-- When `agg_level=AggregateLevel.TIMESTEP` (default):
+
+- When `agg_level=AggregateLevel.TIMESTEP` (default):
+
 - Effect: a double-level sequence becomes a 0-level sequence after the operation, or a single-level sequence becomes a 0-level sequence
 - Input: a double-level sequence or a single-level sequence
 - Output: a 0-level sequence, i.e. the average (or maximum) of the entire input sequence (single- or double-level)
-- When `agg_level=AggregateLevel.EACH_SEQUENCE`:
+
+- When `agg_level=AggregateLevel.EACH_SEQUENCE`:
+
 - Effect: a double-level sequence becomes a single-level sequence after the operation
 - Input: must be a double-level sequence
 - Output: a single-level sequence, where each element is the average (or maximum) of the elements of the corresponding subseq in the original double-level sequence
 
-## last_seq and first_seq
+last_seq and first_seq
+======================
+
+An example of using last_seq is shown below ( `first_seq`_ is similar); see the `last_seq`_ configuration API for details.
+
+.. code-block:: python
+
+    last = last_seq(input=layer,
+                    agg_level=AggregateLevel.EACH_SEQUENCE)
+
+- When `agg_level=AggregateLevel.TIMESTEP` (default):
-An example of using last_seq is shown below (first_seq is similar); see the configuration API for details.
-```python
-last = last_seq(input=layer,
-                agg_level=AggregateLevel.EACH_SEQUENCE)
-```
-- When `agg_level=AggregateLevel.TIMESTEP` (default):
 - Effect: a double-level sequence becomes a 0-level sequence after the operation, or a single-level sequence becomes a 0-level sequence
 - Input: a double-level sequence or a single-level sequence
 - Output: a 0-level sequence, i.e. the last (or first) element of the entire input sequence (double- or single-level).
-- When `agg_level=AggregateLevel.EACH_SEQUENCE`:
+
+- When `agg_level=AggregateLevel.EACH_SEQUENCE`:
 - Effect: a double-level sequence becomes a single-level sequence after the operation
 - Input: must be a double-level sequence
 - Output: a single-level sequence, where each element is the last (or first) element of each subseq in the double-level sequence.
 
-## expand_layer
+expand_layer
+============
+
+An example of using expand_layer is shown below; see the `expand_layer`_ configuration API for details.
+
+.. code-block:: python
+
+    expand = expand_layer(input=layer1,
+                          expand_as=layer2,
+                          expand_level=ExpandLevel.FROM_TIMESTEP)
+
+- When `expand_level=ExpandLevel.FROM_TIMESTEP` (default):
-An example of using expand_layer is shown below; see the configuration API for details.
-```python
-expand = expand_layer(input=layer1,
-                      expand_as=layer2,
-                      expand_level=ExpandLevel.FROM_TIMESTEP)
-```
-- When `expand_level=ExpandLevel.FROM_TIMESTEP` (default):
 - Effect: a 0-level sequence is expanded by the operation into a single-level sequence or a double-level sequence
-  - Input: layer1 must be a 0-level sequence, the data to be expanded; layer2 can be a single-level sequence or a double-level sequence, providing the length information for the expansion
-  - Output: a single-level sequence or a double-level sequence; the type of the output sequence (double- or single-level) and the number of elements it contains match layer2. If the output is a single-level sequence, each of its elements (0-level sequences) is a copy of the layer1 element; if the output is a double-level sequence, each element (0-level sequence) in each subseq is a copy of the layer1 element
-- When `expand_level=ExpandLevel.FROM_SEQUENCE`:
+  - Input: layer1 must be a 0-level sequence, the data to be expanded; layer2 can be a single-level sequence or a double-level sequence, providing the length information for the expansion
+  - Output: a single-level sequence or a double-level sequence; the type of the output sequence (double- or single-level) and the number of elements it contains match layer2. If the output is a single-level sequence, each of its elements (0-level sequences) is a copy of the layer1 element; if the output is a double-level sequence, each element (0-level sequence) in each subseq is a copy of the layer1 element
+
+- When `expand_level=ExpandLevel.FROM_SEQUENCE`:
+
 - Effect: a single-level sequence is expanded by the operation into a double-level sequence
-  - Input: layer1 must be a single-level sequence, the data to be expanded; layer2 must be a double-level sequence, providing the length information for the expansion
-  - Output: a double-level sequence whose number of elements matches layer2. The number of elements in the single-level sequence (0-level sequences) must equal the number of subseqs in the double-level sequence. The i-th element (0-level sequence) of the single-level sequence is expanded into a single-level sequence, which forms the i-th subseq of the output double-level sequence.
+  - Input: layer1 must be a single-level sequence, the data to be expanded; layer2 must be a double-level sequence, providing the length information for the expansion
+  - Output: a double-level sequence whose number of elements matches layer2. The number of elements in the single-level sequence (0-level sequences) must equal the number of subseqs in the double-level sequence. The i-th element (0-level sequence) of the single-level sequence is expanded into a single-level sequence, which forms the i-th subseq of the output double-level sequence.
+
+
+.. _pooling_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer
+.. _last_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq
+.. _first_seq: ../../../doc/ui/api/trainer_config_helpers/layers.html#first-seq
+.. _expand_layer: ../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer
diff --git a/doc_cn/build_and_install/install/ubuntu_install.rst b/doc_cn/build_and_install/install/ubuntu_install.rst
index 70ac5225bd82e40838875b49f67e70ff08eff853..0fb59e25f6932214a3f1c67b12b426e388c3fc5d 100644
--- a/doc_cn/build_and_install/install/ubuntu_install.rst
+++ b/doc_cn/build_and_install/install/ubuntu_install.rst
@@ -11,7 +11,7 @@ PaddlePaddle's Ubuntu installation packages come in four versions, namely cpu, gpu, cpu-noa
 
 .. code-block:: shell
 
-    gdebi paddle-*-cpu.deb
+    gdebi paddle-*-cpu*.deb
 
 If :code:`gdebi` is not installed, you need to run :code:`sudo apt-get install gdebi` first to install it.
 
@@ -20,7 +20,7 @@ PaddlePaddle's Ubuntu installation packages come in four versions, namely cpu, gpu, cpu-noa
 
 .. code-block:: shell
 
-    dpkg -i paddle-*-cpu.deb
+    dpkg -i paddle-*-cpu*.deb
     apt-get install -f
 
 It is normal for :code:`dpkg -i` to report some missing-dependency errors at this point,
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h
index 357286e3188a6f3184bc56e75232bf2e1ec54e44..2c7d665101f36f9c32ab132ca279abf3ac062a8f 100644
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #ifndef HL_CUDA_H_
 #define HL_CUDA_H_
 
-#include "hl_base.h"
 #include
+#include "hl_base.h"
 
 /**
  * @brief HPPL event.
@@ -332,4 +332,14 @@ extern bool hl_cuda_event_is_ready(hl_event_t event);
  */
 extern void hl_device_synchronize();
 
+/**
+ * @brief Start the GPU profiler.
+ */
+extern void hl_profiler_start();
+
+/**
+ * @brief Stop the GPU profiler.
+ */
+extern void hl_profiler_end();
+
 #endif // HL_CUDA_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h
index 1f91068cdf8b3d472c4b403d1ec7d5293c28c07e..24923a0d4a0cdd49214305c2f7716eeef575c7ee 100644
--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
@@ -90,4 +90,8 @@ inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
 
 inline void hl_device_synchronize() {}
 
+inline void hl_profiler_start() {}
+
+inline void hl_profiler_end() {}
+
 #endif // HL_CUDA_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index 745be35b56278ed2e0033d5fd2806320d3164d7c..6b71a538485a09cf40a53eddf1ee2f3e2c768b2c 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include
 #include
-#include
 #include
+#include
+#include
 #include
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_thread.ph"
 #include "hl_dso_loader.h"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
 
 namespace dynload {
@@ -133,7 +134,9 @@ void *cudart_dso_handle = nullptr;
   __macro(cudaGetLastError) \
   __macro(cudaFuncSetCacheConfig) \
   __macro(cudaRuntimeGetVersion) \
-  __macro(cudaGetErrorString)
+  __macro(cudaGetErrorString) \
+  __macro(cudaProfilerStart) \
+  __macro(cudaProfilerStop)
 // clang-format on
 
 CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
@@ -742,3 +745,7 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
   }
   return true;
 }
+
+void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+
+void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index 247be983ba3296383c8e2f30f1036859ecfde492..33d4478b4d36d7be5da6fb43365acb95e5bc7d04 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -14,3 +14,4 @@ add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
 add_simple_unittest(test_FPException)
+add_simple_unittest(test_GpuProfiler)
\ No newline at end of file
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c3542b7834224e2fa6fe323a1fbe8ea1e7cd68de
--- /dev/null
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -0,0 +1,137 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
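+
+// This test exercises both profiling wrappers added in this change:
+// REGISTER_TIMER_INFO (built-in timer) and REGISTER_GPU_PROFILER (nvprof).
+// The body is compiled only for GPU builds (see the PADDLE_ONLY_CPU guard below).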
+
+#ifndef PADDLE_ONLY_CPU
+
+#include "paddle/utils/Util.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include
+#include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/utils/Stat.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK(matrix1.getHeight() == matrix2.getHeight());
+  CHECK(matrix1.getWidth() == matrix2.getWidth());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = matrix1.getData();
+  const real* data2 = matrix2.getData();
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      real a = data1[i * width + j];
+      real b = data2[i * width + j];
+      if (fabs(a - b) > err) {
+        if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
+          count++;
+        }
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+}
+
+void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
+                        int channels) {
+  int inWidth = imgSizeH * imgSizeW * channels;
+  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
+  real ratioH = 0.5;
+  real ratioW = 0.5;
+
+  // forward
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+
+  input->randomizeUniform();
+  inputGpu->copyFrom(*input);
+
+  {
+    // nvprof: GPU Profiler
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    target->bilinearForward(*input, imgSizeH, imgSizeW,
+        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+    targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
+        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+  }
+
+  // check
+  targetCheck->copyFrom(*targetGpu);
+  MatrixCheckErr(*target, *targetCheck);
+
+  // backward
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false,
+                                              true);
+  MatrixPtr targetCheckGrad =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->bilinearBackward(*targetGrad, 2 * imgSizeH, 2 * imgSizeW,
+      imgSizeH, imgSizeW, channels, ratioH, ratioW);
+  inputGpuGrad->bilinearBackward(*targetGpuGrad, 2 * imgSizeH, 2 * imgSizeW,
+      imgSizeH, imgSizeW, channels, ratioH, ratioW);
+
+  // check
+  targetCheckGrad->copyFrom(*inputGpuGrad);
+  MatrixCheckErr(*inputGrad, *targetCheckGrad);
+}
+
+TEST(Profiler, testBilinearFwdBwd) {
+  auto numSamples = 10;
+  auto channels = 16;
+  auto imgSize = 64;
+  {
+    // nvprof: GPU Profiler
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    // Paddle built-in timer
+    REGISTER_TIMER_INFO("testBilinearFwdBwd",
+        "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+    testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
+  }
+  globalStat.printAllStatus();
+}
+
+int main(int argc, char** argv) {
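+  // Initialize gtest and the PaddlePaddle runtime before registering the
+  // process-wide profiler scope below.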
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  // nvprof: GPU Profiler
+  REGISTER_GPU_PROFILER("RecursiveProfilingTest",
+      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+
+  return RUN_ALL_TESTS();
+}
+
+#endif /* PADDLE_ONLY_CPU */
diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp
index d7b20ca5eb2f4eadaa6b4acad056d669a9b59c14..ab140c33502ad315d087bb3afc7f39bffc122894 100644
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
@@ -65,6 +65,7 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
   auto showStat = [&](const StatInfo* info, pid_t tid, bool isFirst = true) {
     uint64_t average = 0;
     if (info->count_ > 0) {
+      outPut << std::setfill(' ') << std::left;
       if (!isFirst) {
         outPut << std::setw(42) << " ";
       }
@@ -202,4 +203,22 @@ StatInfo::~StatInfo() {
   }
 }
 
+static unsigned g_profileCount = 0;
+static std::recursive_mutex g_profileMutex;
+
+GpuProfiler::GpuProfiler(std::string statName, std::string info)
+    : guard_(g_profileMutex) {
+  if (++g_profileCount == 1) {
+    LOG(INFO) << "Enable GPU Profiler Stat: ["
+              << statName << "] " << info;
+    hl_profiler_start();
+  }
+}
+
+GpuProfiler::~GpuProfiler() {
+  if (--g_profileCount == 0) {
+    hl_profiler_end();
+  }
+}
+
 } // namespace paddle
diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h
index 4051145d9246639fce5d041103c1211a939eddca..1ef688ea8da53ee0cd51b1775e671f2b10be782b 100644
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
@@ -15,19 +15,19 @@ limitations under the License. */
 #pragma once
 
 #include
-#include
 #include
-#include
 #include
+#include
+#include
 #include
+#include
 #include
-#include
-#include "Logging.h"
 #include "BarrierStat.h"
 #include "Locks.h"
+#include "Logging.h"
 #include "ThreadLocal.h"
-#include "BarrierStat.h"
+#include "hl_gpu.h"
 
 namespace paddle {
@@ -283,4 +283,24 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1,
 
 #endif // DISABLE_TIMER
 
+class GpuProfiler final {
+public:
+  GpuProfiler(std::string statName, std::string info);
+  ~GpuProfiler();
+
+private:
+  std::lock_guard guard_;
+};
+
+#ifdef PADDLE_DISABLE_PROFILER
+
+#define REGISTER_GPU_PROFILER(statName, ...)
+
+#else
+
+#define REGISTER_GPU_PROFILER(statName, ...) \
+  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
+
+#endif // DISABLE_PROFILER
+
 } // namespace paddle
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
index 6261934e1bc8e8df62aeaa0757f4a237f91ef748..eeed18a98a27313dac65a695960043d0543bb577 100644
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -16,7 +16,8 @@ __all__ = [
     "TanhActivation", "SigmoidActivation", "SoftmaxActivation",
     "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
     'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
-    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation"
+    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
+    "LogActivation"
 ]