diff --git a/.travis.yml b/.travis.yml
index d0e2696f100e55f320e410afd6a3038db647f76f..c51e02eb79a9e53a2b8d1d663e8f0c3e0d8c3a61 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,6 +30,7 @@ addons:
- automake
- libtool
- ccache
+ ssh_known_hosts: 52.76.173.135
before_install:
- if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
# Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
@@ -42,6 +43,14 @@ script:
- |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+ - |
+ if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
+ if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
+ if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
+ export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
+ export DOCS_DIR=`pwd`
+ cd ..
+ curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc
notifications:
email:
on_success: change
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index 1bf9ea9df02a1f0e0b71400207a9f375a2b3d25b..040f5ffa41968cbf93a817faa1db86c18956341e 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -23,7 +23,7 @@ On each machine, we will test and compare the performance of training on single
## Benchmark Model
### Server
-Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
+Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Input image size - 3 * 224 * 224, Time: images/second
diff --git a/doc/design/images/asgd.gif b/doc/design/images/asgd.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e
Binary files /dev/null and b/doc/design/images/asgd.gif differ
diff --git a/doc/design/images/theta_star.gif b/doc/design/images/theta_star.gif
new file mode 100644
index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2
Binary files /dev/null and b/doc/design/images/theta_star.gif differ
diff --git a/doc/design/parameter_average.md b/doc/design/parameter_average.md
new file mode 100644
index 0000000000000000000000000000000000000000..2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f
--- /dev/null
+++ b/doc/design/parameter_average.md
@@ -0,0 +1,72 @@
+# Averaging Parameter in PaddlePaddle
+
+## Why Averaging
+In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can.
+
+Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
+
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD is used as the estimator for
+$\theta^*$. The averaging is done as follows:
+
+
+
+We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
+
+### How to perform Parameter Averaging in PaddlePaddle
+
+Parameter Averaging in PaddlePaddle works in the following way during training :
+1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer
+2. The optimizer itself is responsible for updating the parameters.
+3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
+ 1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches.
+ 2. However, saving all the N instances of the parameters in memory is not feasible.
+ 3. Therefore, an approximation algorithm is used.
+
+Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
+
+During the testing / model-saving phase, we perform the following steps:
+1. Perform the delayed operations.
+2. Save current values of the parameters to a temporary variable.
+3. Replace the values of the parameters with the averaged values.
+4. Perform testing and/or save the parameters.
+5. Restore the values of the parameters once done.
+
+### How to implement Averaging of Parameter in PaddlePaddle
+
+We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
+
+ **Advantages**:
+ - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op.
+ - Makes it easy for the users to customize and extend the framework.
+
+ **Disadvantages**:
+ - Implementation requires re-writing the averaging methodology in Python.
+
+### Low-Level implementation
+
+In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
+- the optimizer
+- the window_size to keep the updates
+
+The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
+
+The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
+
+### Python API implementation for ParameterAverageOptimizer
+
+Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
+- Any optimizer (RMSProp , AdaGrad etc.)
+- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision.
+
+Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
+We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc)
+
+#### Creation of the ParameterAverageOptimizer operator
+There are two ways for creating the ParameterAverageOptimizer op:
+1. We create the op immediately while building the computation graph.
+2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
+
+The proposal is to add the op immediately while building the computation graph.
+
+#### High-level API
+
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android.md b/doc/howto/cross_compiling/cross_compiling_for_android.md
new file mode 100644
index 0000000000000000000000000000000000000000..161863e5c0a2c002af7d7611dad53c2c19148722
--- /dev/null
+++ b/doc/howto/cross_compiling/cross_compiling_for_android.md
@@ -0,0 +1,153 @@
+# Build PaddlePaddle for Android
+
+There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker.
+
+## Cross-Compiling Using Docker
+
+Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows.
+
+### Build the Docker Image
+
+The following steps pack all the tools that we need to build PaddlePaddle into a Docker image.
+
+```bash
+$ git clone https://github.com/PaddlePaddle/Paddle.git
+$ cd Paddle
+$ docker build -t paddle:dev-android . -f Dockerfile.android
+```
+
+### Build the Inference Library
+
+We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
+
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
+```
+
+The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
+
+| Argument | Optional Values | Default |
+|-----------------|-------------------------|---------|
+|`ANDROID_ABI` |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
+|`ANDROID_API` |`>= 21` | `21` |
+
+The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
+
+The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading.
+
+The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
+
+## Cross-Compiling on Linux
+
+The Linux-based approach to cross-compiling is to run the steps in `Dockerfile.android` manually on a Linux x64 computer.
+
+### Setup the Environment
+
+To build for Android, we need the [Android NDK](
+https://developer.android.com/ndk/downloads/index.html):
+
+```bash
+wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
+unzip -q android-ndk-r14b-linux-x86_64.zip
+```
+
+Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which is then used to build PaddlePaddle for Android. (We plan to remove the intermediate stage of building the standalone toolchain in the near future.)
+
+- To build the standalone toolchain for `armeabi-v7a` and Android API level 21:
+
+ ```bash
+ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+ --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
+ ```
+
+ The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.
+
+- To build the standalone toolchain for `arm64-v8a` and Android API level 21:
+
+ ```bash
+ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+ --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
+ ```
+
+ The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
+
+**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.**
+
+### Cross-Compiling Arguments
+
+CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which support Android cross-compiling. PaddlePaddle detects the CMake version, for those newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
+
+Some other CMake arguments you need to know:
+
+- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`.
+- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
+- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
+
+Some Android-specific arguments:
+
+- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory. PaddlePaddle's CMake extensions would derive the cross-compiler, sysroot and Android API level from this argument.
+- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`. The default value is `clang`.
+ - For CMake >= 3.7, it is always `clang`; for older versions, it could be `gcc`.
+ - Android's official `clang` requires `glibc` >= 2.15.
+- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`. The default value is `armeabi-v7a`.
+- `ANDROID_NATIVE_API_LEVEL`: could be derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
+- `ANROID_ARM_MODE`:
+ - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
+ - no need to specify when `ANDROID_ABI=arm64-v8a`.
+- `ANDROID_ARM_NEON`: indicates if to use NEON instructions.
+ - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
+ - no need to specify when `ANDROID_ABI=arm64-v8a`.
+
+Other useful arguments:
+
+- `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`.
+- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`.
+
+Some frequent configurations for your reference:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+ -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
+ -DANDROID_ABI=armeabi-v7a \
+ -DANDROID_ARM_NEON=ON \
+ -DANDROID_ARM_MODE=ON \
+ -DUSE_EIGEN_FOR_BLAS=ON \
+ -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+ -DWITH_C_API=ON \
+ -DWITH_SWIG_PY=OFF \
+ ..
+```
+
+```
+cmake -DCMAKE_SYSTEM_NAME=Android \
+ -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
+ -DANDROID_ABI=arm64-v8a \
+ -DUSE_EIGEN_FOR_BLAS=OFF \
+ -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+ -DWITH_C_API=ON \
+ -DWITH_SWIG_PY=OFF \
+ ..
+```
+
+
+There are some other arguments you might want to configure.
+
+- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of library.
+- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
+
+Our own tips for performance optimization: use clang, and Eigen or OpenBLAS:
+- `CMAKE_BUILD_TYPE=Release`
+- `ANDROID_TOOLCHAIN=clang`
+- `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
+
+### Build and Install
+
+After running `cmake`, we can run `make; make install` to build and install.
+
+Before building, you might want to remove the `third_party` and `build` directories including pre-built libraries for other architectures.
+
+After building, in the directory `CMAKE_INSTALL_PREFIX`, you will find three sub-directories:
+
+- `include`: the header file of the inference library,
+- `lib`: the inference library built for various Android ABIs,
+- `third_party`: dependent third-party libraries built for Android.
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
index 1fc58c37cc9151d5e4d99b939e30c29aa99e04f1..58e4dd9c3fe43f963d00152aa4f456fadbb12bf3 100644
--- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
@@ -1,7 +1,7 @@
# 构建Android平台上的PaddlePaddle库
用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库:
-- 基于Docker容器的编译方式
+- 基于Docker容器的编译方式
- 基于Linux交叉编译环境的编译方式
## 基于Docker容器的编译方式
@@ -26,14 +26,14 @@ Android的Docker开发镜像向用户提供两个可配置的参数:
|`ANDROID_API` |`>= 21` | `21` |
- 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库
-```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
-```
+ ```bash
+ $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+ ```
-- 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库
-```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
-```
+- 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库
+ ```bash
+ $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+ ```
执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文**配置交叉编译参数**章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
@@ -82,16 +82,16 @@ CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cm
Android平台可选配置参数:
- `ANDROID_STANDALONE_TOOLCHAIN`,独立工具链所在的绝对路径,或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别;否则,用户需要在cmake时手动设置这些值。无默认值。
-- `ANDROID_TOOLCHAIN`,目标工具链。可设置`gcc/clang`,默认值为`clang`。
- - CMake 3.7以上,将会始终使用`clang`工具链;CMake 3.7以下,可设置`ANDROID_TOOLCHAIN=gcc`以使用`gcc`工具链。
+- `ANDROID_TOOLCHAIN`,目标工具链。可设置`gcc/clang`,默认值为`clang`。
+ - CMake 3.7以上,将会始终使用`clang`工具链;CMake 3.7以下,可设置`ANDROID_TOOLCHAIN=gcc`以使用`gcc`工具链。
- Android官方提供的`clang`编译器要求系统支持`GLIBC 2.15`以上。
- `ANDROID_ABI`,目标架构ABI。目前支持`armeabi-v7a`和`arm64-v8a`,默认值为`armeabi-v7a`。
- `ANDROID_NATIVE_API_LEVEL`,工具链的Android API级别。若没有显式设置,PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。
-- `ANROID_ARM_MODE`,是否使用ARM模式。
- - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`;
+- `ANROID_ARM_MODE`,是否使用ARM模式。
+ - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`;
- `ANDROID_ABI=arm64-v8a`时,不需要设置。
-- `ANDROID_ARM_NEON`,是否使用NEON指令。
- - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`;
+- `ANDROID_ARM_NEON`,是否使用NEON指令。
+ - `ANDROID_ABI=armeabi-v7a`时,可设置`ON/OFF`,默认值为`ON`;
- `ANDROID_ABI=arm64-v8a`时,不需要设置。
其他配置参数:
@@ -119,7 +119,7 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
-DANDROID_ABI=arm64-v8a \
-DUSE_EIGEN_FOR_BLAS=OFF \
- -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+ -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
..
@@ -128,8 +128,8 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议:
-- 设置`CMAKE_BUILD_TYPE`为`Release`
-- 使用`clang`编译工具链
+- 设置`CMAKE_BUILD_TYPE`为`Release`
+- 使用`clang`编译工具链
- `armeabi-v7a`时,设置`USE_EIGEN_BLAS=ON`,使用Eigen进行矩阵计算;`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`,使用OpenBLAS进行矩阵计算
### 编译和安装
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index c7f25109972195fb56b9e96c4b68d952363e6338..7daca18761b80eac0f876b21377a6ccc6a853485 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -300,4 +300,12 @@ extern void hl_matrix_col2Vol(real* dataDst,
real alpha,
real beta);
+/**
+ * @brief Cast each element of a real (float) vector to int.
+ * @param[out] out output int vector.
+ * @param[in] vec input float vector.
+ * @param[in] size size of the vector.
+ */
+extern void hl_vector_cast2int(int* out, real* vec, int size);
+
#endif /* HL_MATRIX_H_ */
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 6ac332945c8f09fef23f35680ba5bb1d9ba9f4fd..46e77e140768dd80fd327dd4eb3b0f62a3370950 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -133,4 +133,6 @@ inline void hl_matrix_col2Vol(real* dataDst,
real alpha,
real beta) {}
+inline void hl_vector_cast2int(int* out, real* vec, int size) {}
+
#endif // HL_MATRIX_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index b41a3a1e06db7b2566acef19ce430645f79d486d..607efb4f6b0aa0d22a2789397b8743f7a5271d5b 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -793,3 +793,14 @@ void hl_matrix_col2Vol(real* dataDst,
CHECK_SYNC("hl_matrix_col2Vol failed");
}
+
+__global__ void keVectorCast2Int(int* out, real* vec, int size) {
+ for (int i = threadIdx.x; i < (size); i += blockDim.x) {
+ out[i] = int(vec[i]);
+ }
+}
+
+void hl_vector_cast2int(int* out, real* vec, int size) {
+ keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size);
+ CHECK_SYNC("hl_vector_cast2int failed");
+}
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index dbd5a14f9f3b681f0b77b9bd507b34edfaa78766..ed94540c268e5ed990c1d92859c6a2093c052868 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -24,7 +24,6 @@
#include "paddle/framework/op_registry.h"
#include "paddle/operators/dynamic_recurrent_op.h"
#include "paddle/operators/net_op.h"
-#include "paddle/operators/recurrent_op.h"
namespace paddle {
namespace framework {
@@ -38,7 +37,7 @@ static inline std::unique_ptr CreateGradOp(
op_desc.SetType(op.Type());
op_desc.SetAttrMap(op.Attrs());
auto& info = OpInfoMap::Instance().Get(op.Type());
- auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var);
+ auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {});
std::vector> grad_ops;
grad_ops.reserve(grad_descs.size());
std::transform(grad_descs.begin(), grad_descs.end(),
@@ -220,19 +219,7 @@ static std::unique_ptr BackwardRecursive(
});
// process recurrent gradient op as a special operator.
- if (forwardOp.Type() == "recurrent") {
- // NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
- // or this will result in infinite loop.
- const auto& rnnop =
- *static_cast(&forwardOp);
- auto rnn_grad_op =
- static_cast(grad_op.get());
- const auto& stepnet_op =
- *static_cast(&rnnop.stepnet());
- // create stepnet's gradient op
- rnn_grad_op->set_stepnet(
- BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
- } else if (forwardOp.Type() == "dynamic_recurrent") {
+ if (forwardOp.Type() == "dynamic_recurrent") {
// NOTE clean up cycle call somewhere (RNN's stepnet constains itself),
// or this will result in infinite loop.
const auto& rnnop =
@@ -331,7 +318,7 @@ static void CreateGradVarInBlock(
continue;
}
auto pname = FwdName(arg);
- auto* param = block_desc->FindVar(pname);
+ auto* param = block_desc->FindVarRecursive(pname);
auto* grad = block_desc->FindVar(arg);
if (param == nullptr) {
LOG(WARNING) << "Cannot find forward variable of " << arg
@@ -348,7 +335,9 @@ static void CreateGradVarInBlock(
std::vector> MakeOpGrad(
const OpDescBind* op_desc, std::unordered_set* no_grad_vars,
- std::unordered_map* grad_to_var) {
+ std::unordered_map* grad_to_var,
+ const std::vector& grad_block =
+ std::vector()) {
std::vector> grad_op_descs;
// All input gradients of forwarding operator do not need to calculate.
const std::vector& inputs = op_desc->InputArgumentNames();
@@ -364,9 +353,10 @@ std::vector> MakeOpGrad(
return grad_op_descs; // empty vector
}
- grad_op_descs = OpInfoMap::Instance()
- .Get(op_desc->Type())
- .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var);
+ grad_op_descs =
+ OpInfoMap::Instance()
+ .Get(op_desc->Type())
+ .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);
std::list> pending_fill_zeros_ops;
for (auto& desc : grad_op_descs) {
@@ -400,21 +390,20 @@ std::vector> MakeBlockBackward(
std::vector> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
- std::vector> op_grads =
- MakeOpGrad(*it, no_grad_vars, grad_to_var);
+ std::vector> op_grads;
if ((*it)->Type() == "recurrent") {
- PADDLE_ENFORCE_EQ(
- op_grads.size(), static_cast(1),
- "rnn_op's gradient process should contain only one op.");
int step_block_idx = (*it)->GetBlockAttr("step_block");
auto backward_block_op_descs = MakeBlockBackward(
program_desc, step_block_idx, no_grad_vars, grad_to_var);
- BlockDescBind* backward_block = program_desc.AppendBlock(*cur_block);
+ BlockDescBind* backward_block =
+ program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
for (auto& ptr : backward_block_op_descs) {
backward_block->AppendAllocatedOp(std::move(ptr));
}
- op_grads[0]->SetBlockAttr("step_block", *backward_block);
+ op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
+ } else {
+ op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
}
for (const auto& desc : op_grads) {
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 72f77a88a24434fd7d2ed685ac850c88888d6808..26adf6a20ff09483b84f479db08efcf402135053 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -88,6 +88,8 @@ class BlockDescBind {
BlockDesc *Proto();
+ ProgramDescBind *Program() { return this->prog_; }
+
private:
void ClearPBOps();
void ClearPBVars();
diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h
index b731840ef2a4b2d5d82b019d28ad6517fa4b7607..f91e0e03410c95f84a65f02beed38b7bbfdcaa86 100644
--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
@@ -108,8 +108,9 @@ struct OpInfoFiller {
info->grad_op_maker_ = [](
const OpDescBind& fwd_op,
const std::unordered_set& no_grad_set,
- std::unordered_map* grad_to_var) {
- T maker(fwd_op, no_grad_set, grad_to_var);
+ std::unordered_map* grad_to_var,
+ const std::vector& grad_block) {
+ T maker(fwd_op, no_grad_set, grad_to_var, grad_block);
return maker();
};
}
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 9bf2311dc835c701c9311880b8adba486a7d446c..f8d32de5df222e0ba9f143c3f7a3d34d07fed6b4 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -31,7 +31,7 @@ namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";
-Executor::Executor(const std::vector& places) {
+Executor::Executor(const std::vector& places) : own_(true) {
PADDLE_ENFORCE_GT(places.size(), 0);
device_contexts_.resize(places.size());
for (size_t i = 0; i < places.size(); i++) {
@@ -52,8 +52,10 @@ Executor::Executor(const std::vector& places) {
}
Executor::~Executor() {
- for (auto& device_context : device_contexts_) {
- delete device_context;
+ if (own_) {
+ for (auto& device_context : device_contexts_) {
+ delete device_context;
+ }
}
}
@@ -66,14 +68,18 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
var->GetMutable();
} else if (var_type == VarDesc::FETCH_LIST) {
var->GetMutable();
+ } else if (var_type == VarDesc::STEP_SCOPES) {
+ var->GetMutable>();
} else {
PADDLE_THROW(
- "Variable type must be "
- "LoDTensor/SelectedRows/FEED_MINIBATCH/FETCH_LIST.");
+ "Variable type %d is not in "
+ "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST]",
+ var_type);
}
}
-void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id) {
+void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
+ bool create_local_scope) {
// TODO(tonyyang-svail):
// - only runs on the first device (i.e. no interdevice communication)
// - will change to use multiple blocks for RNN op and Cond Op
@@ -81,29 +87,42 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id) {
auto& block = pdesc.Block(block_id);
auto& device = device_contexts_[0];
- Scope& local_scope = scope->NewScope();
-
- for (auto& var : block.AllVars()) {
- if (var->Persistable()) {
- auto* ptr = scope->Var(var->Name());
- CreateTensor(ptr, var->GetType());
- VLOG(3) << "Create Variable " << var->Name()
- << " global, which pointer is " << ptr;
- } else {
- auto* ptr = local_scope.Var(var->Name());
+ Scope* local_scope = scope;
+ if (create_local_scope) {
+ local_scope = &scope->NewScope();
+ for (auto& var : block.AllVars()) {
+ if (var->Persistable()) {
+ auto* ptr = scope->Var(var->Name());
+ CreateTensor(ptr, var->GetType());
+ VLOG(3) << "Create Variable " << var->Name()
+ << " global, which pointer is " << ptr;
+ } else {
+ auto* ptr = local_scope->Var(var->Name());
+ CreateTensor(ptr, var->GetType());
+ VLOG(3) << "Create Variable " << var->Name()
+ << " locally, which pointer is " << ptr;
+ }
+ }
+ } else {
+ for (auto& var : block.AllVars()) {
+ auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
- VLOG(3) << "Create Variable " << var->Name()
- << " locally, which pointer is " << ptr;
+ VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+ << ptr;
}
}
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
- op->Run(local_scope, *device);
+ op->Run(*local_scope, *device);
+ }
+ if (create_local_scope) {
+ scope->DeleteScope(local_scope);
}
-
- scope->DeleteScope(&local_scope);
}
+Executor::Executor(const platform::DeviceContext& device)
+ : device_contexts_({&device}), own_(false) {}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index c78bfe8f9f07f1324515f0baaca4a94cc0fe844e..b745f4f6474ef688774f4c833a3958942e9aa8cb 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -25,6 +25,7 @@ namespace framework {
class Executor {
public:
explicit Executor(const std::vector& places);
+ explicit Executor(const platform::DeviceContext& devices);
~Executor();
/* @Brief
@@ -34,10 +35,11 @@ class Executor {
* ProgramDesc
* Scope
*/
- void Run(const ProgramDescBind&, Scope*, int);
+ void Run(const ProgramDescBind&, Scope*, int, bool create_local_scope = true);
private:
- std::vector device_contexts_;
+ std::vector device_contexts_;
+ bool own_;
};
} // namespace framework
diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
index 94944c79b64d38e799df436de874cabc3661e30a..998186e33915a11f2864eb5387d19ed1bfbab51c 100644
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -15,6 +15,7 @@
#pragma once
#include
#include
+#include
#include "paddle/framework/op_desc.h"
#include "paddle/framework/operator.h"
@@ -26,8 +27,13 @@ class GradOpDescMakerBase {
explicit GradOpDescMakerBase(
const OpDescBind& fwd_op,
const std::unordered_set& no_grad_set,
- std::unordered_map* grad_to_var)
- : fwd_op_(fwd_op), no_grad_set_(no_grad_set), grad_to_var_(grad_to_var) {}
+ std::unordered_map* grad_to_var,
+ const std::vector& grad_block =
+ std::vector())
+ : fwd_op_(fwd_op),
+ no_grad_set_(no_grad_set),
+ grad_to_var_(grad_to_var),
+ grad_block_(grad_block) {}
virtual ~GradOpDescMakerBase() = default;
virtual std::vector> operator()() const = 0;
@@ -102,6 +108,9 @@ class GradOpDescMakerBase {
const OpDescBind& fwd_op_;
const std::unordered_set& no_grad_set_;
std::unordered_map* grad_to_var_;
+
+ protected:
+ std::vector grad_block_;
};
class SingleGradOpDescMaker : public GradOpDescMakerBase {
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 0779137639e6cd9f6ecf3bbbc24d081cae3de9c0..c96166f35d1425218a4a74f50dc5ed542d677b68 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -327,6 +327,19 @@ void OpDescBind::InferShape(const BlockDescBind &block) const {
PADDLE_ENFORCE(static_cast(infer_shape),
"%s's infer_shape has not been registered", this->Type());
CompileTimeInferShapeContext ctx(*this, block);
+ if (VLOG_IS_ON(10)) {
+ std::ostringstream sout;
+ auto inames = this->InputArgumentNames();
+ sout << " From [";
+ std::copy(inames.begin(), inames.end(),
+ std::ostream_iterator(sout, ", "));
+ sout << "] to [";
+ auto onames = this->OutputArgumentNames();
+ std::copy(onames.begin(), onames.end(),
+ std::ostream_iterator(sout, ", "));
+ sout << "]";
+ VLOG(10) << sout.str();
+ }
infer_shape(&ctx);
}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 3be26fdc4fb6ebdd0ec427a2248b0f97d9edff01..9295d36c2b2e66130ad273ebd3a40de739efeea7 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -126,7 +126,7 @@ OperatorBase::OperatorBase(const std::string& type,
std::vector OperatorBase::InputVars() const {
std::vector ret_val;
- for (auto& o : outputs_) {
+ for (auto& o : inputs_) {
ret_val.reserve(ret_val.size() + o.second.size());
ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
}
@@ -394,7 +394,19 @@ class RuntimeInferShapeContext : public InferShapeContext {
void OperatorWithKernel::Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const {
- VLOG(3) << "Running operator " << this->Type();
+ if (VLOG_IS_ON(1)) {
+ auto inputs = this->InputVars();
+ auto outputs = this->OutputVars(true);
+ std::ostringstream sout;
+ sout << "Run operator " << this->Type() << " From [";
+ std::ostream_iterator out_it(sout, ",");
+ std::copy(inputs.begin(), inputs.end(), out_it);
+ sout << "] to [";
+ std::copy(outputs.begin(), outputs.end(), out_it);
+ sout << "]";
+ VLOG(1) << sout.str();
+ }
+
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 14cc530448379eb6d4bf0435f607494aa01ef5b5..fb2c69105627f663ddcce07d31526c9e4278e863 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -47,8 +47,12 @@ Variable* Scope::Var(const std::string& name) {
return v;
}
-Variable* Scope::Var() {
- return Var(string::Sprintf("%p.%d", this, vars_.size()));
+Variable* Scope::Var(std::string* name) {
+ auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+ if (name != nullptr) {
+ *name = var_name;
+ }
+ return Var(var_name);
}
Variable* Scope::FindVar(const std::string& name) const {
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index ac334da5ef0c8ad563b6be5413df33f5d0bdbcf8..fb660949394149ebf2c6172a0ac3f4c7594f4286 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -49,7 +49,7 @@ class Scope {
Variable* Var(const std::string& name);
/// Create a variable with a scope-unique name.
- Variable* Var();
+ Variable* Var(std::string* name = nullptr);
/// Find a variable in the scope or any of its ancestors. Returns
/// nullptr if cannot find.
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 9eab67561a42b3fb4e22d8475ad5eeb146a72f1c..28d0fcf94ec31c82476e093f93ccee222a0c9d9a 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -125,7 +125,7 @@ class Tensor {
* @param[in] end_idx The index of the end row(exclusive) to slice.
* The index number begins from 0.
*/
- inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
+ inline Tensor Slice(int begin_idx, int end_idx) const;
platform::Place place() const {
PADDLE_ENFORCE_NOT_NULL(
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index bcccdd5881775e199297dce7e70aaf6aae62d95a..d78a2c4c21149ef3c800991b9a144ea198f1bdcf 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -228,7 +228,7 @@ inline void Tensor::CopyFromVector(const std::vector& src,
#endif
}
-inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
+inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
check_memory_size();
PADDLE_ENFORCE_GE(begin_idx, 0,
"The start row index must be greater than 0.");
diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h
index afeeb1914ac30188b93c3b9da30bb5ceaf74416e..baeb98c9bd49ec65da5931bcbe33ab788f86f3e8 100644
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
@@ -29,6 +29,7 @@ class OpDescBind;
class BlockDescBind;
class BlockDesc;
class InferShapeContext;
+class BlockDescBind;
using VariableNameMap = std::map>;
@@ -46,7 +47,8 @@ using OpCreator = std::function>(
const OpDescBind&, const std::unordered_set& /*no_grad_set*/,
- std::unordered_map* /*grad_to_var*/)>;
+ std::unordered_map* /*grad_to_var*/,
+ const std::vector& grad_block)>;
using InferVarTypeFN = std::function;
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index 9db6d252d97bfeee3fe376bcda431fe94c65a678..87cb2d280866ac2be2d6f85e872e547e12548feb 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -395,14 +395,24 @@ real AucEvaluator::evalImp(std::vector& arguments) {
CHECK_LE(arguments.size(), (size_t)3);
MatrixPtr output = arguments[0].value;
IVectorPtr label = arguments[1].ids;
+ MatrixPtr labelval = arguments[1].value;
bool supportWeight = (3 == arguments.size()) ? true : false;
MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
- if (nullptr == output || nullptr == label ||
- (supportWeight && nullptr == weight)) {
+
+ if (nullptr == output || (supportWeight && nullptr == weight)) {
return 0;
}
size_t insNum = output->getHeight();
size_t outputDim = output->getWidth();
+ // Copy label from value to a vector.
+ if (nullptr == label && nullptr != labelval) {
+ // label width is 1
+ CHECK_EQ(1, labelval->getWidth());
+ VectorPtr vec =
+ Vector::create(labelval->getData(), insNum, output->useGpu());
+ label = vec->castToInt();
+ }
+
CHECK_EQ(insNum, label->getSize());
if (supportWeight) {
CHECK_EQ(insNum, weight->getHeight());
@@ -443,6 +453,7 @@ real AucEvaluator::evalImp(std::vector& arguments) {
int* labelD = label->getData();
real* weightD = supportWeight ? weight->getData() : nullptr;
size_t pos = realColumnIdx_;
+
for (size_t i = 0; i < insNum; ++i) {
real value = outputD[pos];
uint32_t binIdx = static_cast(value * kBinNum_);
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index ff72672e3ab77212b309fcfea835839a916fa632..346008439c35a2bcbcd2e9dfd36d689e01d7495f 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
#include
#include "Matrix.h"
#include "hl_gpu.h"
+#include "hl_matrix.h"
#include "hl_table_apply.h"
#include "paddle/utils/Flags.h"
#include "paddle/utils/Logging.h"
@@ -99,6 +100,19 @@ MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
return mat;
}
+template <>
+std::shared_ptr> VectorT::castToInt() {
+ std::shared_ptr> ret = IVector::create(this->getSize(), useGpu_);
+ if (useGpu_) {
+ hl_vector_cast2int(ret->getData(), this->getData(), this->getSize());
+ } else {
+ for (size_t i = 0; i < getSize(); ++i) {
+ ret->getData()[i] = int(this->getData()[i]);
+ }
+ }
+ return ret;
+}
+
template
GpuVectorT::GpuVectorT(size_t size)
: VectorT(size,
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 80b9775fccf10c57bb48145ef56165ec7c86d8b8..f965a5809209da313c78a545c44e7aa39e95ac65 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -162,6 +162,13 @@ public:
*/
std::shared_ptr toOneHotSparseMatrix(size_t idRange, bool useGpu);
+ /**
+ * @brief cast vector of "real" elements to "int" elements.
+ *
+ * @note: float -> int must be cast, or you'll get wrong data.
+ */
+ std::shared_ptr> castToInt();
+
/**
* This function will crash if the size of src and dest is different.
*/
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 60dc55a32f5f05875e4f3ce77431556e14adc74a..81d92ec6f4f8c94e08d3b86b6319a9bf06f76a22 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -131,9 +131,10 @@ add_subdirectory(math)
add_subdirectory(nccl)
set(DEPS_OPS
- recurrent_op
cond_op
cross_entropy_op
+ recurrent_op
+ dynamic_recurrent_op
softmax_with_cross_entropy_op
sum_op
pool_op
@@ -142,9 +143,6 @@ set(DEPS_OPS
sequence_conv_op
lstm_op)
-
-op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
- DEPS framework_proto tensor net_op)
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
@@ -156,7 +154,9 @@ op_library(nccl_op DEPS nccl_common)
endif()
op_library(sequence_conv_op DEPS context_project)
op_library(lstm_op DEPS sequence2batch lstm_compute)
-
+op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
+ DEPS net_op tensor_array)
+op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
@@ -168,8 +168,9 @@ cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
-cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc DEPS dynamic_recurrent_op recurrent_op tensor_array)
-
+cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
+ rnn/recurrent_op_utils.cc
+ DEPS dynamic_recurrent_op)
if(WITH_GPU)
nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context)
endif()
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cc b/paddle/operators/conv2d_transpose_cudnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8ce94e0f04f14e1eae7e7d01280601cc72dea8c4
--- /dev/null
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "paddle/operators/conv2d_transpose_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
+ public:
+ CudnnConv2DTransposeOpMaker(framework::OpProto* proto,
+ framework::OpAttrChecker* op_checker)
+ : Conv2DTransposeOpMaker(proto, op_checker) {
+ AddAttr>("dilations", "dilations of convolution operator.")
+ .SetDefault(std::vector{1, 1});
+ AddAttr("workspace_size_MB",
+ "workspace size for cudnn, in MB, "
+ "workspace is a section of GPU memory which will be "
+ "allocated/freed each time the operator runs, larger "
+ "workspace size can increase performance but also requires "
+ "better hardware. This size should be carefully set.")
+ .SetDefault(4096);
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(conv2d_transpose_cudnn, ops::Conv2DTransposeOp,
+ ops::CudnnConv2DTransposeOpMaker, conv2d_transpose_cudnn_grad,
+ ops::Conv2DTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+ conv2d_transpose_cudnn,
+ ops::GemmConv2DTransposeKernel);
+REGISTER_OP_CPU_KERNEL(
+ conv2d_transpose_cudnn_grad,
+ ops::GemmConv2DTransposeGradKernel);
diff --git a/paddle/operators/conv2d_transpose_cudnn_op.cu b/paddle/operators/conv2d_transpose_cudnn_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..61fcfb3bd8fa57f2c45fbf3a980dbe41041cff18
--- /dev/null
+++ b/paddle/operators/conv2d_transpose_cudnn_op.cu
@@ -0,0 +1,240 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memory.h"
+#include "paddle/operators/conv2d_transpose_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
+using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
+using DataLayout = platform::DataLayout;
+using CUDADeviceContext = platform::CUDADeviceContext;
+
+static constexpr size_t kConvCudnnWorkspaceLimitBytes = 1024 * 1024 * 1024;
+
+template
+class CudnnConvTransposeOpKernel : public framework::OpKernel {
+ public:
+ void Compute(const framework::ExecutionContext& ctx) const override {
+ PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+ "It must use GPUPlace.");
+ auto* input = ctx.Input("Input");
+ auto* filter = ctx.Input("Filter");
+ auto* output = ctx.Output("Output");
+
+ std::vector strides = ctx.Attr>("strides");
+ std::vector paddings = ctx.Attr>("paddings");
+ // cudnn v5 does not support dilations
+ std::vector dilations = ctx.Attr>("dilations");
+ int user_workspace_size = ctx.Attr("workspace_size_MB");
+
+ const T* input_data = input->data();
+ const T* filter_data = filter->data();
+ T* output_data = output->mutable_data(ctx.GetPlace());
+ // ------------------- cudnn descriptors ---------------------
+ ScopedTensorDescriptor input_desc;
+ ScopedTensorDescriptor output_desc;
+ ScopedFilterDescriptor filter_desc;
+ ScopedConvolutionDescriptor conv_desc;
+ DataLayout layout = DataLayout::kNCHW;
+
+ // N, M, H, W
+ cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor(
+ layout, framework::vectorize2int(input->dims()));
+ // N, C, O_h, O_w
+ cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor(
+ layout, framework::vectorize2int(output->dims()));
+ // M, C, K_h, K_w
+ cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor(
+ layout, framework::vectorize2int(filter->dims()));
+ cudnnConvolutionDescriptor_t cudnn_conv_desc =
+ conv_desc.descriptor(paddings, strides, dilations);
+
+ // ------------------- cudnn conv workspace ---------------------
+ void* cudnn_workspace = nullptr;
+ size_t workspace_size_in_bytes; // final workspace to allocate.
+ size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes;
+ if (user_workspace_size > 0) {
+ workspace_size_limit = user_workspace_size * 1024 * 1024;
+ }
+ // ------------------- cudnn conv algorithm ---------------------
+ cudnnConvolutionBwdDataAlgo_t algo;
+ auto handle = ctx.cuda_device_context().cudnn_handle();
+ // Get the algorithm
+ PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+ handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
+ // dxDesc: Handle to the previously initialized output tensor
+ // descriptor.
+ cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+ workspace_size_limit, &algo));
+
+ // get workspace size able to allocate
+ PADDLE_ENFORCE(
+ platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+ handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
+ cudnn_output_desc, algo, &workspace_size_in_bytes));
+
+ // Allocate on GPU memory
+ platform::GPUPlace gpu = boost::get(ctx.GetPlace());
+ cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+
+ // ------------------- cudnn conv transpose forward ---------------------
+ T alpha = 1.0f, beta = 0.0f;
+ PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+ handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
+ input_data, cudnn_conv_desc, algo, cudnn_workspace,
+ workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+
+ // Release the cudnn workspace
+ paddle::memory::Free(gpu, cudnn_workspace);
+ }
+};
+
+template
+class CudnnConvTransposeGradOpKernel : public framework::OpKernel {
+ public:
+ void Compute(const framework::ExecutionContext& ctx) const override {
+ PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+ "It must use GPUPlace.");
+ auto input = ctx.Input("Input");
+ auto filter = ctx.Input("Filter");
+ auto output_grad = ctx.Input(framework::GradVarName("Output"));
+ auto input_grad = ctx.Output(framework::GradVarName("Input"));
+ auto filter_grad = ctx.Output(framework::GradVarName("Filter"));
+ const T* input_data = input->data();
+ const T* output_grad_data = output_grad->data();
+ const T* filter_data = filter->data();
+
+ std::vector strides = ctx.Attr>("strides");
+ std::vector paddings = ctx.Attr>("paddings");
+ // cudnn v5 does not support dilations
+ std::vector dilations = ctx.Attr>("dilations");
+ int user_workspace_size = ctx.Attr("workspace_size_MB");
+
+ // ------------------- cudnn descriptors ---------------------
+ ScopedTensorDescriptor input_desc;
+ ScopedTensorDescriptor output_desc;
+ ScopedFilterDescriptor filter_desc;
+ ScopedConvolutionDescriptor conv_desc;
+ DataLayout layout = DataLayout::kNCHW;
+
+ // Input: (N, M, H, W)
+ cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor(
+ layout, framework::vectorize2int(input->dims()));
+ // Output: (N, C, O_H, O_W)
+ cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor(
+ layout, framework::vectorize2int(output_grad->dims()));
+ // Filter (M, C, K_H, K_W)
+ cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor(
+ layout, framework::vectorize2int(filter->dims()));
+
+ cudnnConvolutionDescriptor_t cudnn_conv_desc =
+ conv_desc.descriptor(paddings, strides, dilations);
+
+ // ------------------- cudnn backward algorithm ---------------------
+ cudnnConvolutionFwdAlgo_t data_algo;
+ cudnnConvolutionBwdFilterAlgo_t filter_algo;
+ size_t bwd_filter_ws_size, fwd_ws_size;
+ size_t workspace_size_in_bytes = 0;
+ size_t workspace_size_limit = kConvCudnnWorkspaceLimitBytes;
+ if (user_workspace_size > 0) {
+ workspace_size_limit = user_workspace_size * 1024 * 1024;
+ }
+
+ auto handle = ctx.cuda_device_context().cudnn_handle();
+ if (input_grad) {
+ // choose backward algorithm for data
+ PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+ handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+ cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+ workspace_size_limit, &data_algo));
+ PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+ handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+ cudnn_input_desc, data_algo, &fwd_ws_size));
+ workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
+ }
+
+ if (filter_grad) {
+ // choose backward algorithm for filter
+ PADDLE_ENFORCE(
+ platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+ handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
+ cudnn_filter_desc,
+ CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+ workspace_size_limit, &filter_algo));
+
+ // get workspace for backwards filter algorithm
+ PADDLE_ENFORCE(
+ platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+ handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
+ cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
+ workspace_size_in_bytes =
+ std::max(workspace_size_in_bytes, bwd_filter_ws_size);
+ }
+
+ // ------------------- cudnn conv workspace ---------------------
+ // Already on GPU
+ void* cudnn_workspace = nullptr;
+ platform::GPUPlace gpu = boost::get(ctx.GetPlace());
+ cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+ // ------------------- cudnn conv backward data ---------------------
+ // FIXME(typhoonzero): template type T may not be the same as cudnn call.
+ T alpha = 1.0f, beta = 0.0f;
+ if (input_grad) {
+ T* input_grad_data = input_grad->mutable_data(ctx.GetPlace());
+ auto t = framework::EigenVector::Flatten(*input_grad);
+ t.device(ctx.GetEigenDevice()) =
+ t.constant(static_cast(0));
+
+ PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+ handle, &alpha, cudnn_output_desc, output_grad_data,
+ cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
+ cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+ input_grad_data));
+ }
+
+ // ------------------- cudnn conv backward filter ---------------------
+ if (filter_grad) {
+ T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace());
+ auto t = framework::EigenVector::Flatten(*filter_grad);
+ t.device(ctx.GetEigenDevice()) =
+ t.constant(static_cast(0));
+ // Gradient with respect to the filter
+ PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+ handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
+ input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
+ workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
+ }
+ // Release the cudnn workspace
+ paddle::memory::Free(gpu, cudnn_workspace);
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn,
+ ops::CudnnConvTransposeOpKernel);
+REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad,
+ ops::CudnnConvTransposeGradOpKernel);
diff --git a/paddle/operators/conv2dtranspose_op.cc b/paddle/operators/conv2d_transpose_op.cc
similarity index 95%
rename from paddle/operators/conv2dtranspose_op.cc
rename to paddle/operators/conv2d_transpose_op.cc
index c1b231906e2f172b6f9cee55f850d1a5ec6c3221..348527728bdd4ed60676d6e6e44c4e761b803096 100644
--- a/paddle/operators/conv2dtranspose_op.cc
+++ b/paddle/operators/conv2d_transpose_op.cc
@@ -12,7 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/operators/conv2dtranspose_op.h"
+#include "paddle/operators/conv2d_transpose_op.h"
namespace paddle {
namespace operators {
@@ -95,13 +95,13 @@ void Conv2DTransposeOpGrad::InferShape(
} // namespace paddle
namespace ops = paddle::operators;
-REGISTER_OP(conv2dtranspose, ops::Conv2DTransposeOp,
- ops::Conv2DTransposeOpMaker, conv2dtranspose_grad,
+REGISTER_OP(conv2d_transpose, ops::Conv2DTransposeOp,
+ ops::Conv2DTransposeOpMaker, conv2d_transpose_grad,
ops::Conv2DTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
- conv2dtranspose,
+ conv2d_transpose,
ops::GemmConv2DTransposeKernel);
REGISTER_OP_CPU_KERNEL(
- conv2dtranspose_grad,
+ conv2d_transpose_grad,
ops::GemmConv2DTransposeGradKernel);
diff --git a/paddle/operators/conv2dtranspose_op.cu b/paddle/operators/conv2d_transpose_op.cu
similarity index 89%
rename from paddle/operators/conv2dtranspose_op.cu
rename to paddle/operators/conv2d_transpose_op.cu
index 761bc1959e69be94f43571728e6b92a322558b99..931ac9eed294c4fe7c726d8cc2c4d9a39ec12828 100644
--- a/paddle/operators/conv2dtranspose_op.cu
+++ b/paddle/operators/conv2d_transpose_op.cu
@@ -12,13 +12,13 @@
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/operators/conv2dtranspose_op.h"
+#include "paddle/operators/conv2d_transpose_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
- conv2dtranspose,
+ conv2d_transpose,
ops::GemmConv2DTransposeKernel);
REGISTER_OP_GPU_KERNEL(
- conv2dtranspose_grad,
+ conv2d_transpose_grad,
ops::GemmConv2DTransposeGradKernel);
diff --git a/paddle/operators/conv2dtranspose_op.h b/paddle/operators/conv2d_transpose_op.h
similarity index 99%
rename from paddle/operators/conv2dtranspose_op.h
rename to paddle/operators/conv2d_transpose_op.h
index 8c70b3dcec1e26ab3d8a42d88040764c643b5ae6..cab7788227690621a0e5b744197b86c515bbef72 100644
--- a/paddle/operators/conv2dtranspose_op.h
+++ b/paddle/operators/conv2d_transpose_op.h
@@ -62,7 +62,7 @@ class GemmConv2DTransposeKernel : public framework::OpKernel {
std::vector strides = context.Attr>("strides");
// TODO(Zhuoyuan): Paddings can be added in future.
- // groups will alway be disabled in conv2dtranspose.
+ // groups will always be disabled in conv2d_transpose.
const int batch_size = input->dims()[0];
const int m = input->dims()[1];
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc
index 58c9f1cd2c79c150aaed7753641f6ad6120dd0f5..0244adb42392c707d755e95c7abdebd826c219b4 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -36,7 +36,12 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
[](int a) { return static_cast(a); });
auto dims = framework::make_ddim(shape_int64);
- dims[0] = ctx->GetInputDim("Input")[0];
+ int dim_idx = ctx->Attrs().Get("dim_idx");
+ PADDLE_ENFORCE_GE(dim_idx, 0);
+ PADDLE_ENFORCE_GT(static_cast(shape.size()), dim_idx);
+ PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), dim_idx);
+
+ dims[dim_idx] = ctx->GetInputDim("Input")[dim_idx];
ctx->SetOutputDim("Out", dims);
}
@@ -57,15 +62,18 @@ class FillConstantBatchSizeLikeOpMaker
"(int, default 5 (FP32)) "
"Output data type")
.SetDefault(framework::DataType::FP32);
- AddAttr>("shape", "(vector) The shape of the output");
- AddAttr("value", "(float, default 0) The value to be filled")
- .SetDefault(0.0f);
AddInput("Input",
"(Tensor) Tensor "
- "whose first dimension is used to specify the batch_size");
+ "whose dim_idx th dimension is used to specify the batch_size");
AddOutput("Out",
"(Tensor) Tensor of specified shape will be filled "
"with the specified value");
+ AddAttr>("shape", "(vector) The shape of the output");
+ AddAttr("dim_idx",
+ "(int, default 0) the index of batch size dimension")
+ .SetDefault(0);
+ AddAttr("value", "(float, default 0) The value to be filled")
+ .SetDefault(0.0f);
AddComment(R"DOC(Fill up a variable with specified constant value.)DOC");
}
};
diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h
index ea3289d2731a4b2098c3a199464559b0a0ce7202..99b912163b71594340d8917645dff107fd208aea 100644
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
@@ -90,11 +90,13 @@ class LookupTableGradKernel : public framework::OpKernel {
auto* d_output_data = d_output->data();
auto* d_table_data = d_table->mutable_data(context.GetPlace());
+ memset(d_table_data, 0, d_table->numel() * sizeof(T));
+
for (int64_t i = 0; i < ids->numel(); ++i) {
PADDLE_ENFORCE_LT(ids_data[i], N);
PADDLE_ENFORCE_GE(ids_data[i], 0);
for (int j = 0; j < D; ++j) {
- d_table_data[ids_data[i] * D + j] = d_output_data[i * D + j];
+ d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
}
}
}
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu
index 49ea550b6f49a13bf31d14321d7a9eb13a834d4b..e192283aa0afac49e8e467506f3703d1ce60d2a6 100644
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
@@ -12,6 +12,10 @@
See the License for the specific language governing permissions and
limitations under the License. */
+/* Acknowledgement: the following code is strongly inspired by
+https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu
+*/
+
#include "paddle/framework/op_registry.h"
#include "paddle/operators/cross_entropy_op.h"
#include "paddle/platform/assert.h"
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h
index 625b1852c2f0eb2ed435f73fea251c40c614a7dd..38cb298f92a21bb5c7508761fec701d28279a85f 100644
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
@@ -12,6 +12,10 @@
See the License for the specific language governing permissions and
limitations under the License. */
+/* Acknowledgement: the following code is strongly inspired by
+https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h
+*/
+
#pragma once
#include "glog/logging.h"
#include "paddle/framework/op_registry.h"
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 245d3b47d3a6331a3cf20dbdbd972639d68cd496..90acf034d905e6ab3ba7bf8c3d29e1ef1161ed0c 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -29,9 +29,14 @@ class MulOpShapeInference : public framework::InferShapeBase {
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
+
int x_num_col_dims = ctx->Attrs().Get("x_num_col_dims");
int y_num_col_dims = ctx->Attrs().Get("y_num_col_dims");
+ VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims
+ << " x_num_col_dims=" << x_num_col_dims
+ << " y_num_col_dims=" << y_num_col_dims;
+
PADDLE_ENFORCE_GT(
x_dims.size(), x_num_col_dims,
"The input tensor X's rank of MulOp should be larger than "
diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..39da1e0bf89ce308de62d38a6cce6dbd4c7c7f83
--- /dev/null
+++ b/paddle/operators/precision_recall_op.cc
@@ -0,0 +1,179 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/precision_recall_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PrecisionRecallOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ void InferShape(framework::InferShapeContext *ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("MaxProbs"),
+ "Input(MaxProbs) should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Indices"),
+ "Input(Indices) should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput("Labels"),
+ "Input(Labels) should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("BatchMetrics"),
+ "Output(BatchMetrics) should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("AccumMetrics"),
+ "Output(AccumMetrics) should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("AccumStatesInfo"),
+ "Output(AccumStatesInfo) should not be null.");
+
+    int64_t cls_num =
+        static_cast<int64_t>(ctx->Attrs().Get<int>("class_number"));
+ auto max_probs_dims = ctx->GetInputDim("MaxProbs");
+ auto labels_dims = ctx->GetInputDim("Labels");
+
+ PADDLE_ENFORCE_EQ(max_probs_dims[1], 1,
+ "Each instance contains one max probability, so the "
+ "shape of Input(MaxProbs) should be [batch_size, 1].");
+ PADDLE_ENFORCE_EQ(ctx->GetInputDim("Indices"), max_probs_dims,
+ "The shape of Input(Indices) should be [batch_size, 1].");
+ PADDLE_ENFORCE_EQ(max_probs_dims[0], labels_dims[0],
+ "The 1st dimension of Input(MaxProbs) and "
+ "Input(Labels) both are batch_size and the shape should "
+ "be the same.");
+ PADDLE_ENFORCE_EQ(labels_dims[1], 1,
+ "The 2nd dimension of Input(Labels) contains instance "
+ "label and the shape should be equal to 1.");
+ if (ctx->HasInput("Weights")) {
+ auto weights_dims = ctx->GetInputDim("Weights");
+ PADDLE_ENFORCE_EQ(weights_dims,
+ framework::make_ddim({max_probs_dims[0], 1}),
+ "The shape of Input(Weights) should be "
+ "[batch_size, 1].");
+ }
+ if (ctx->HasInput("StatesInfo")) {
+ auto states_dims = ctx->GetInputDim("StatesInfo");
+ PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}),
+ "The shape of Input(StatesInfo) should be "
+ "[class_number, 4].");
+ }
+
+ // Layouts of BatchMetrics and AccumMetrics both are:
+ // [
+ // macro average precision, macro average recall, macro average F1 score,
+ // micro average precision, micro average recall, micro average F1 score
+ // ]
+ ctx->SetOutputDim("BatchMetrics", {6});
+ ctx->SetOutputDim("AccumMetrics", {6});
+ // Shape of AccumStatesInfo is [class_number, 4]
+ // The layout of each row is:
+ // [ TP, FP, TN, FN ]
+ ctx->SetOutputDim("AccumStatesInfo", {cls_num, 4});
+ }
+
+ protected:
+ framework::DataType IndicateDataType(
+ const framework::ExecutionContext &ctx) const override {
+    return framework::ToDataType(ctx.Input<framework::Tensor>("MaxProbs")->type());
+ }
+};
+
+class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ PrecisionRecallOpMaker(framework::OpProto *proto,
+ framework::OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("MaxProbs",
+ "(Tensor, default Tensor), a 2-D tensor with shape N x 1, "
+ "where N is the batch size. Each row contains the max probability "
+ "of an instance which computed by the previous top_k (k=1) "
+ "operator.");
+ AddInput("Indices",
+ "(Tensor, default Tensor), a 2-D tensor with shape N x 1, "
+ "where N is the batch size. Each row contains the corresponding "
+ "index which computed by the previous top_k (k=1) operator.");
+ AddInput("Labels",
+ "(Tensor, default Tensor), a 2-D tensor with shape N x 1, "
+ "where N is the batch size. Each element is a label and the "
+ "value should be in [0, class_number - 1].");
+ AddInput("Weights",
+ "(Tensor, default Tensor), a 2-D tensor with shape N x 1, "
+ "where N is the batch size. This input is optional. If provided, "
+ "weight of instance would be considered when computing metrics.")
+ .AsDispensable();
+ AddInput("StatesInfo",
+ "(Tensor, default Tensor), a 2-D tensor with shape D x 4, "
+ "where D is the number of classes. This input is optional. If "
+ "provided, current state will be accumulated to this state and "
+ "the accumulation state will be as the output state.")
+ .AsDispensable();
+ AddOutput("BatchMetrics",
+ "(Tensor, default Tensor), a 1-D tensor with shape {6}."
+ "This output tensor contains metrics for current batch data."
+ "The layout is [macro average precision, macro average recall, "
+ "macro f1 score, micro average precision, micro average recall, "
+ "micro f1 score]");
+ AddOutput("AccumMetrics",
+ "(Tensor, default Tensor), a 1-D tensor with shape {6}."
+ "This output tensor contains metrics for accumulated data."
+ "The layout is [macro average precision, macro average recall, "
+ "macro f1 score, micro average precision, micro average recall, "
+ "micro f1 score]");
+ AddOutput("AccumStatesInfo",
+ "(Tensor, default Tensor), a 2-D tensor with shape D x 4, "
+ "where D is equal to class number. This output tensor contains "
+ "accumulated state variables used to compute metrics. The layout "
+ "for each class is [true positives, false positives, "
+ "true negatives, false negatives].");
+    AddAttr<int>("class_number", "Number of classes to be evaluated.");
+ AddComment(R"DOC(
+When given 'Input(Indices)' and 'Input(Labels)', this operator can be used
+to compute various metrics including:
+ - macro average precision
+ - macro average recall
+ - macro f1 score
+ - micro average precision
+ - micro average recall
+ - micro f1 score
+
+To compute the above metrics, we need to do statistics for true positives,
+false positives and false negatives. Here count of true negatives is not
+necessary, but counting it may provide potential usage and the cost is
+trivial, so the operator also provides count of true negatives.
+
+We define state as a 2-D tensor with shape [class_number, 4]. Each row of a
+state contains statistic variables for corresponding class. Layout of each row
+is: TP(true positives), FP(false positives), TN(true negatives),
+FN(false negatives). If 'Input(Weights)' provided, TP, FP, TN, FN will be
+calculated by given weight instead of instance count.
+
+This operator also supports metrics computing for cross-batch situation. To
+achieve this, 'Input(StatesInfo)' should be provided. State of current batch
+data will be accumulated to 'Input(StatesInfo)' and 'Output(AccumStatesInfo)'
+is the accumulation state.
+
+'Output(BatchMetrics)' is metrics of current batch data while
+'Output(AccumStatesInfo)' is metrics of accumulation data.
+
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(precision_recall, ops::PrecisionRecallOp,
+ ops::PrecisionRecallOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    precision_recall,
+    ops::PrecisionRecallKernel<paddle::platform::CPUPlace, float>,
+    ops::PrecisionRecallKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/precision_recall_op.h b/paddle/operators/precision_recall_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a871ce6741469cf9af409ec90215f721d52f36c
--- /dev/null
+++ b/paddle/operators/precision_recall_op.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor, typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+enum StateVariable { TP = 0, FP, TN, FN };
+
+template <typename Place, typename T>
+class PrecisionRecallKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in0 = ctx.Input<Tensor>("Indices");
+    auto* in1 = ctx.Input<Tensor>("Labels");
+    auto* in2 = ctx.Input<Tensor>("Weights");
+    auto* in3 = ctx.Input<Tensor>("StatesInfo");
+    auto* out0 = ctx.Output<Tensor>("BatchMetrics");
+    auto* out1 = ctx.Output<Tensor>("AccumMetrics");
+    auto* out2 = ctx.Output<Tensor>("AccumStatesInfo");
+
+    const int* ids_data = in0->data<int>();
+    const int* labels_data = in1->data<int>();
+    size_t cls_num = static_cast<size_t>(ctx.Attr<int>("class_number"));
+    const T* weights_data = in2 ? in2->data<T>() : nullptr;
+    const T* states_data = in3 ? in3->data<T>() : nullptr;
+    double* batch_metrics_data = out0->mutable_data<double>(ctx.GetPlace());
+    double* accum_metrics_data = out1->mutable_data<double>(ctx.GetPlace());
+    out2->mutable_data<T>(ctx.GetPlace());
+    auto accum_states = EigenMatrix<T>::From(*out2);
+    accum_states.setZero();
+    T* accum_states_data = out2->data<T>();
+
+ size_t sample_num = in0->dims()[0];
+ size_t state_var_num = 4; // TP FP TN FN
+
+ // get states info for current batch
+ for (size_t i = 0; i < sample_num; ++i) {
+ size_t idx = ids_data[i];
+ size_t label = labels_data[i];
+
+ PADDLE_ENFORCE(idx >= 0 && idx < cls_num,
+ "Class index of each instance should be in "
+ "[0, class_number).");
+ PADDLE_ENFORCE(label >= 0 && label < cls_num,
+ "Label of each instance should be in [0, class_number).");
+
+ T w = weights_data ? weights_data[i] : 1.0;
+ if (idx == label) {
+ accum_states_data[idx * state_var_num + TP] += w;
+ for (size_t j = 0; j < cls_num; ++j) {
+ accum_states_data[j * state_var_num + TN] += w;
+ }
+ accum_states_data[idx * state_var_num + TN] -= w;
+ } else {
+ accum_states_data[label * state_var_num + FN] += w;
+ accum_states_data[idx * state_var_num + FP] += w;
+ for (size_t j = 0; j < cls_num; ++j) {
+ accum_states_data[j * state_var_num + TN] += w;
+ }
+ accum_states_data[idx * state_var_num + TN] -= w;
+ accum_states_data[label * state_var_num + TN] -= w;
+ }
+ }
+
+ ComputeMetrics(accum_states_data, batch_metrics_data, state_var_num,
+ cls_num);
+
+ if (states_data) {
+ for (size_t i = 0; i < cls_num; ++i) {
+ for (size_t j = 0; j < state_var_num; ++j) {
+ size_t idx = i * state_var_num + j;
+ accum_states_data[idx] += states_data[idx];
+ }
+ }
+ }
+
+ ComputeMetrics(accum_states_data, accum_metrics_data, state_var_num,
+ cls_num);
+ }
+
+ // expose to be reused
+ static inline T CalcPrecision(T tp_count, T fp_count) {
+ if (tp_count > 0.0 || fp_count > 0.0) {
+ return tp_count / (tp_count + fp_count);
+ }
+ return 1.0;
+ }
+
+ static inline T CalcRecall(T tp_count, T fn_count) {
+ if (tp_count > 0.0 || fn_count > 0.0) {
+ return tp_count / (tp_count + fn_count);
+ }
+ return 1.0;
+ }
+
+ static inline T CalcF1Score(T precision, T recall) {
+ if (precision > 0.0 || recall > 0.0) {
+ return 2 * precision * recall / (precision + recall);
+ }
+ return 0.0;
+ }
+
+ protected:
+ void ComputeMetrics(const T* states_data, double* metrics_data,
+ size_t state_var_num, size_t cls_num) const {
+ T total_tp_count = 0;
+ T total_fp_count = 0;
+ T total_fn_count = 0;
+ T macro_avg_precision = 0.0;
+ T macro_avg_recall = 0.0;
+
+ for (size_t i = 0; i < cls_num; ++i) {
+ T tp_count = states_data[i * state_var_num + TP];
+ T fp_count = states_data[i * state_var_num + FP];
+ T fn_count = states_data[i * state_var_num + FN];
+ total_tp_count += tp_count;
+ total_fp_count += fp_count;
+ total_fn_count += fn_count;
+ macro_avg_precision += CalcPrecision(tp_count, fp_count);
+ macro_avg_recall += CalcRecall(tp_count, fn_count);
+ }
+ macro_avg_precision /= cls_num;
+ macro_avg_recall /= cls_num;
+ T macro_f1_score = CalcF1Score(macro_avg_precision, macro_avg_recall);
+
+ T micro_avg_precision = CalcPrecision(total_tp_count, total_fp_count);
+ T micro_avg_recall = CalcRecall(total_tp_count, total_fn_count);
+ T micro_f1_score = CalcF1Score(micro_avg_precision, micro_avg_recall);
+
+ // fill metrics data
+ metrics_data[0] = macro_avg_precision;
+ metrics_data[1] = macro_avg_recall;
+ metrics_data[2] = macro_f1_score;
+ metrics_data[3] = micro_avg_precision;
+ metrics_data[4] = micro_avg_recall;
+ metrics_data[5] = micro_f1_score;
+ }
+};
+
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 40303e3adf4db7e8336ed72667fe69afa56c3f69..9eb2d79b4f65d23222e68ad2a439f7554469278b 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -12,181 +12,618 @@
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/operators/recurrent_op.h"
-
-#include <cstring>
-#include <sstream>
-
+#include <vector>
+#include "paddle/framework/executor.h"
#include "paddle/framework/op_registry.h"
-#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
+constexpr char kInputs[] = "inputs";
+constexpr char kInitialStates[] = "initial_states";
+constexpr char kParameters[] = "parameters";
+constexpr char kOutputs[] = "outputs";
+constexpr char kStepScopes[] = "step_scopes";
+constexpr char kExStates[] = "ex_states";
+constexpr char kStates[] = "states";
+constexpr char kStepBlock[] = "step_block";
+constexpr char kReverse[] = "reverse";
+constexpr char kIsTrain[] = "is_train";
+#define GRAD_SUFFIX "@GRAD"
+constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX;
+constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX;
+constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX;
+constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX;
-using Scope = framework::Scope;
-using Variable = framework::Variable;
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-void RecurrentAlgorithm::Run(const Scope& scope,
- const platform::DeviceContext& dev_ctx) const {
- auto* input0 = scope.FindVar(arg_->inlinks[0]);
- PADDLE_ENFORCE_NOT_NULL(input0);
-  size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
- PADDLE_ENFORCE_GT(seq_len, 0);
-
- CreateScopes(scope, seq_len);
- auto& step_scopes = GetStepScopes(scope);
- rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
- InitMemories(step_scopes[0]);
-
- for (size_t step_id = 0; step_id < seq_len; step_id++) {
- if (step_id > 0) {
- rnn::LinkMemories(step_scopes, arg_->states, step_id, -1);
+using StepScopeVar = std::vector<framework::Scope *>;
+
+// StepScopes manages scopes inside RNN.
+// StepScopes::CurScope() get the current scope
+// StepScopes::ExScope() get the ex-scope, or scope in previous time step.
+// StepScopes::Next() move to next time step.
+//
+// if is_train = False, then
+// there are two scopes for the RNN and just support forward.
+// else
+// the len(scopes) == seq_len
+//
+// if is_backward = True, then
+// reversely access scopes
+// else
+// access scopes from begin to end.
+class StepScopes {
+ public:
+ StepScopes(const framework::Scope &parent, StepScopeVar *scopes,
+ bool is_train, size_t seq_len, bool is_backward = false)
+ : counter_(is_backward ? seq_len - 1 : 0UL),
+ scopes_(scopes),
+ is_train_(is_train),
+ is_backward_(is_backward) {
+ size_t num_step_scopes = is_train ? seq_len : 2;
+ PADDLE_ENFORCE(is_train || !is_backward,
+ "Cannot backward when is not training");
+ if (!is_backward_) {
+ PADDLE_ENFORCE(scopes->empty());
+      scopes->reserve(static_cast<size_t>(num_step_scopes));
+ for (size_t i = 0; i < num_step_scopes; ++i) {
+ scopes->emplace_back(&parent.NewScope());
+ }
}
- (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
- }
- rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len, dev_ctx);
-}
-
-void RecurrentAlgorithm::CreateScopes(const Scope& scope,
- size_t seq_len) const {
- // TODO(superjom) Only two scopes are needed for inference, this case will be
- // supported later.
- auto* step_scopes_var = scope.FindVar(arg_->step_scopes);
- PADDLE_ENFORCE(step_scopes_var != nullptr, "");
-  auto* step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
-
- // Now all variables in scope must be created outside of op.
- PADDLE_ENFORCE_NOT_NULL(stepnet_);
- PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(),
- "step_unit_ op has no outputs");
-
- if (seq_len > step_scopes->size()) {
- for (size_t i = step_scopes->size(); i < seq_len; ++i) {
- auto& step_scope = scope.NewScope();
-
- // create step net's temp inputs
- for (auto& input : (*stepnet_)->Inputs()) {
- // the weight are located in parent scope
- for (auto& var_name : input.second) {
- if (!step_scope.FindVar(var_name)) {
-          step_scope.Var(var_name)->GetMutable<Tensor>();
- }
+ }
+
+ framework::Scope &CurScope() { return GetScope(counter_); }
+
+ framework::Scope &ExScope() {
+ auto &scope = GetScope(is_backward_ ? counter_ + 1 : counter_ - 1);
+ return scope;
+ }
+
+ void Next() {
+ if (is_backward_) {
+ --counter_;
+ } else {
+ ++counter_;
+ }
+ }
+
+ private:
+ framework::Scope &GetScope(size_t scope_id) const {
+ if (!is_train_) {
+ scope_id %= 2;
+ }
+ PADDLE_ENFORCE_LT(scope_id, scopes_->size());
+ return *(*scopes_)[scope_id];
+ }
+
+ size_t counter_;
+ StepScopeVar *scopes_;
+ bool is_train_;
+ bool is_backward_;
+};
+
+// Base class for RecurrentOp/RecurrentGradOp
+// Some common protected functions for RecurrentOp/RecurrentGradOp
+class RecurrentBase : public framework::OperatorBase {
+ public:
+ RecurrentBase(const std::string &type,
+ const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+ // Get SequenceLength from Scope
+ // The sequence length is got from input tensor. The input tensor's
+ // dimension should be [SEQ_LEN, ..., ...]. The first of the tensor's shape
+ // is SEQ_LEN. The second of the tensor's shape could be the batch size or
+ // nested sequence length.
+ int64_t GetSequenceLength(const framework::Scope &scope) const {
+ // Dim format SEQ_LEN, BATCH_SIZE, ...
+ int64_t seq_len = -1;
+ auto &all_inputs = Inputs(kInputs);
+ PADDLE_ENFORCE(!all_inputs.empty());
+ for (auto &iname : all_inputs) {
+ auto *var = scope.FindVar(iname);
+ PADDLE_ENFORCE(var != nullptr);
+      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>());
+      auto &dim = var->Get<framework::LoDTensor>().dims();
+ if (seq_len == -1) {
+ seq_len = dim[0];
+ } else {
+ PADDLE_ENFORCE_EQ(seq_len, dim[0]);
+ }
+ }
+ return seq_len;
+ }
+
+ // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+ // map(dst_scope.Var, dst_vars)):
+ // dst_tensor.ShareDataWith(src_tensor)
+ static void LinkTensor(const framework::Scope &src_scope,
+ const std::vector &src_vars,
+ framework::Scope *dst_scope,
+ const std::vector &dst_vars) {
+ LinkTensorWithCallback(
+ src_scope, src_vars, dst_scope, dst_vars,
+ [&](const framework::Tensor &src, framework::Tensor *dst) {
+ dst->ShareDataWith(src);
+ });
+ }
+
+ // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+ // map(dst_scope.Var, dst_vars)):
+ // callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+ static void LinkTensorWithCallback(const framework::Scope &src_scope,
+ const std::vector &src_vars,
+ framework::Scope *dst_scope,
+ const std::vector &dst_vars,
+ Callback callback) {
+ PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+ for (size_t i = 0; i < dst_vars.size(); ++i) {
+ VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+ AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
+ }
+ }
+
+ // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+ // map(dst_scope.FindVar, dst_vars)):
+ // callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+ static void LinkTensorWithCallback(const framework::Scope &src_scope,
+ const std::vector &src_vars,
+ const framework::Scope &dst_scope,
+ const std::vector &dst_vars,
+ Callback callback) {
+ PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+ for (size_t i = 0; i < dst_vars.size(); ++i) {
+ VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+ AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
+ }
+ }
+
+ // (seq_len, shape) -> return [seq_len] + list(shape)
+ static framework::DDim PrependDims(size_t seq_len,
+ const framework::DDim &src) {
+ auto dims = framework::vectorize(src);
+    dims.insert(dims.begin(), static_cast<int64_t>(seq_len));
+ return framework::make_ddim(dims);
+ }
+
+ private:
+  template <typename Callback>
+ static void AccessTensor(const framework::Scope &src_scope,
+ const std::string &src_var_name,
+ framework::Scope *dst_scope,
+ const std::string &dst_var_name, Callback callback) {
+ auto *src_var = src_scope.FindVar(src_var_name);
+ PADDLE_ENFORCE(src_var != nullptr);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+
+ auto *dst_var = dst_scope->Var(dst_var_name);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+ callback(src_tensor, dst_tensor);
+ }
+
+  template <typename Callback>
+ static void AccessTensor(const framework::Scope &src_scope,
+ const std::string &src_var_name,
+ const framework::Scope &dst_scope,
+ const std::string &dst_var_name, Callback callback) {
+ auto *src_var = src_scope.FindVar(src_var_name);
+ PADDLE_ENFORCE(src_var != nullptr);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+    auto *dst_var = dst_scope.FindVar(dst_var_name);
+    PADDLE_ENFORCE(dst_var != nullptr);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+ callback(src_tensor, dst_tensor);
+ }
+};
+
+class RecurrentOp : public RecurrentBase {
+ public:
+ RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : RecurrentBase(type, inputs, outputs, attrs) {}
+
+ void Run(const framework::Scope &scope,
+ const platform::DeviceContext &dev_ctx) const override {
+    auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
+ VLOG(3) << "Static RNN input sequence length = " << seq_len;
+ StepScopes scopes = CreateStepScopes(scope, seq_len);
+    auto reverse = Attr<bool>(kReverse);
+
+ framework::Executor executor(dev_ctx);
+    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+ auto *program = block->Program();
+
+ for (size_t i = 0; i < seq_len; ++i) {
+ size_t seq_offset = reverse ? seq_len - i - 1 : i;
+ VLOG(3) << "Recurrent operate at the time step " << seq_offset;
+
+ auto &cur_scope = scopes.CurScope();
+
+ // Link outside::input --> inside::input
+ // inside::input = outside::input[seq_offset: seq_offset+1]
+ LinkTensorWithCallback(
+ scope, Inputs(kInputs), &cur_scope, Inputs(kInputs),
+ [&seq_offset](const framework::Tensor &outside,
+ framework::Tensor *inside) {
+ inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1));
+ auto dims = framework::vectorize(inside->dims());
+ dims.erase(dims.begin());
+ inside->Resize(framework::make_ddim(dims));
+ });
+
+ if (i == 0) {
+ // Link initial states --> ex_states
+        LinkTensor(scope, Inputs(kInitialStates), &cur_scope,
+                   Attr<std::vector<std::string>>(kExStates));
+ } else {
+ auto &ex_scope = scopes.ExScope();
+ // Link ex_scope::state --> cur_scope::ex_state
+        LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates),
+                   &cur_scope, Attr<std::vector<std::string>>(kExStates));
+ }
+
+ // Every inputs are linked now, execute!
+ executor.Run(*program, &cur_scope, block->ID(),
+ false /*create_local_scope*/);
+
+ // Copy inside::output -> outside::output
+ // outside::output[seq_offset: seq_offset + 1] = inside::output
+ this->LinkTensorWithCallback(
+ cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
+ [&](const framework::LoDTensor &src_tensor,
+ framework::LoDTensor *dst_tensor) {
+ if (i == 0) { // create output tensor at begin
+ dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
+ dst_tensor->mutable_data(dev_ctx.GetPlace(), src_tensor.type());
+ }
+
+ auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
+ // Explicit copy output since the local RNN scope can be destroyed
+ // early.
+ dst_out.CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx);
+ });
+
+ scopes.Next();
+ }
+ }
+
+ private:
+ StepScopes CreateStepScopes(const framework::Scope &scope,
+ size_t seq_len) const {
+ auto *var = scope.FindVar(Output(kStepScopes));
+ PADDLE_ENFORCE(var != nullptr);
+    return StepScopes(scope, var->GetMutable<StepScopeVar>(),
+                      Attr<bool>(kIsTrain), seq_len);
+ }
+};
+
+class RecurrentGradOp : public RecurrentBase {
+ public:
+ RecurrentGradOp(const std::string &type,
+ const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : RecurrentBase(type, inputs, outputs, attrs) {}
+
+ void Run(const framework::Scope &scope,
+ const platform::DeviceContext &dev_ctx) const override {
+    auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
+    StepScopes scopes = CreateStepScopes(scope, seq_len);
+    auto reverse = Attr<bool>(kReverse);
+
+    framework::Executor executor(dev_ctx);
+    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+ auto *program = block->Program();
+
+ for (size_t step_id = 0; step_id < seq_len; ++step_id) {
+ size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
+ VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
+ auto &cur_scope = scopes.CurScope();
+ // Link outside::output_grads --> inside::output_grads
+ // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
+ LinkTensorWithCallback(
+ scope, Inputs(kOutputGrads), &cur_scope, Inputs(kOutputGrads),
+ [&](const framework::Tensor &outside, framework::Tensor *inside) {
+ inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1));
+ auto dims = framework::vectorize(inside->dims());
+ dims.erase(dims.begin());
+ inside->Resize(framework::make_ddim(dims));
+ });
+ auto og_set = List2Set(Inputs(kOutputGrads));
+
+ if (VLOG_IS_ON(10)) {
+ std::ostringstream sout;
+ std::copy(og_set.begin(), og_set.end(),
+                  std::ostream_iterator<std::string>(sout, ","));
+ VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
+ }
+
+ // Link states
+ // if cur_scope::cur_state_grad in out_grads:
+ // cur_scope::cur_state_grad += ex_scope::ex_state_grad
+ // else:
+ // ex_scope::ex_state_grad --> cur_scope::cur_state_grad
+ if (step_id != 0) { // not at beginning
+ auto &ex_scope = scopes.ExScope();
+        auto ex_state_grads =
+            GradVarLists(Attr<std::vector<std::string>>(kExStates));
+        auto cur_state_grads =
+            GradVarLists(Attr<std::vector<std::string>>(kStates));
+
+ PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
+ for (size_t i = 0; i < ex_state_grads.size(); ++i) {
+ auto &cur_grad = cur_state_grads[i];
+ auto &ex_grad = ex_state_grads[i];
+          auto &ex_tensor =
+              ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
+
+ VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
+ auto *cur_grad_var = cur_scope.Var(cur_grad);
+          auto cur_grad_tensor =
+              cur_grad_var->GetMutable<framework::LoDTensor>();
+ cur_grad_tensor->CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx);
}
}
- // create stepnet's outputs
- for (const auto& output : (*stepnet_)->Outputs()) {
- for (auto& var_name : output.second) {
- step_scope.Var(var_name);
+
+ VLOG(5) << "Recurrent memory linking finished ";
+ // Run step block with cur_scope
+ executor.Run(*program, &cur_scope, block->ID(),
+ false /*create_local_scope*/);
+
+ VLOG(5) << "executor.Run finished ";
+
+ auto local_var_names = LocalVarNames(cur_scope);
+
+ // Accumulate params
+ // if (step == 0):
+ // outside::param_grad = 0.0
+ // outside::param_grad += inside::param_grad
+ {
+ auto &pg_names = Outputs(kParamGrads);
+ auto &p_names = Inputs(kParameters);
+ PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
+
+ for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) {
+ auto inside_grad_name = framework::GradVarName(p_names[prog_id]);
+
+ // If does not compute gradient of that variable inside rnn, just
+ // continue
+ if (local_var_names.find(inside_grad_name) == local_var_names.end()) {
+ continue;
+ }
+
+ // zero gradient variable in step 0
+ if (step_id == 0) {
+            auto &inside_tensor = cur_scope.FindVar(inside_grad_name)
+                                      ->Get<framework::LoDTensor>();
+ framework::AttributeMap attrs;
+ attrs["data_type"] = framework::ToDataType(inside_tensor.type());
+ attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
+ attrs["value"] = 0.0f;
+
+ auto zero_op = framework::OpRegistry::CreateOp(
+ "fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs);
+ zero_op->Run(scope, dev_ctx);
+ }
+
+ // sum gradient
+ auto *outside_var = scope.FindVar(pg_names[prog_id]);
+ PADDLE_ENFORCE(outside_var != nullptr);
+          auto &outside_tensor =
+              *outside_var->GetMutable<framework::LoDTensor>();
+
+ std::string result_var_name;
+ auto *local_result_var = cur_scope.Var(&result_var_name);
+          auto &local_result_tensor =
+              *local_result_var->GetMutable<framework::LoDTensor>();
+
+ local_result_tensor.ShareDataWith(outside_tensor);
+
+ auto sum_op = framework::OpRegistry::CreateOp(
+ "sum", {{"X", {result_var_name, inside_grad_name}}},
+ {{"Out", {result_var_name}}}, {});
+ sum_op->Run(cur_scope, dev_ctx);
}
}
- step_scopes->emplace_back(&step_scope);
+ VLOG(5) << "Accumulate Parameter finished ";
+
+ // Copy input gradient from inside to outside
+ // outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
+ LinkTensorWithCallback(
+ cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads),
+ [&](const framework::LoDTensor &inside,
+ framework::LoDTensor *outside) {
+ if (inside.memory_size() == 0) { // IG is not created.
+ return;
+ }
+ if (step_id == 0) { // alloc memory
+ outside->Resize(PrependDims(seq_len, inside.dims()));
+ outside->mutable_data(dev_ctx.GetPlace(), inside.type());
+ }
+
+ auto dst = outside->Slice(seq_offset, seq_offset + 1);
+ dst.CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx);
+ });
+ VLOG(5) << "Link outside gradient finished ";
+
+ if (step_id + 1 == seq_len) { // at_end
+ // copy initialize states gradient from inside to outside
+ LinkTensorWithCallback(
+            cur_scope, GradVarLists(Attr<std::vector<std::string>>(kExStates)),
+ scope, Outputs(kInitStateGrads),
+ [&](const framework::LoDTensor &inside,
+ framework::LoDTensor *outside) {
+ outside->Resize(inside.dims());
+ outside->mutable_data(dev_ctx.GetPlace(), inside.type());
+ outside->CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx);
+ });
+ VLOG(5) << "Link initialize state gradient finished ";
+ }
+ scopes.Next();
}
}
-}
-
-void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
- for (auto& attr : arg_->states) {
-    auto* pre_mem = step_scope->Var(attr.pre_var)->GetMutable<LoDTensor>();
- PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
- "memory [%s]'s boot variable [%s] not exists", attr.var,
- attr.boot_var);
-    auto* boot_mem =
-        step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>();
- pre_mem->Resize(boot_mem->dims());
- PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
- pre_mem->ShareDataWith(*boot_mem);
- }
-}
-
-const rnn::ArgumentName RecurrentOp::kArgName{
- "step_net", "step_scopes", "inputs", "outputs",
- "states", "ex_states", "initial_states"};
-
-const rnn::ArgumentName RecurrentGradientOp::kArgName{
- "step_net", "step_scopes@GRAD", "outputs@GRAD", "inputs@GRAD",
- "states", "ex_states", "initial_states@GRAD"};
-
-RecurrentOp::RecurrentOp(const std::string& type,
- const framework::VariableNameMap& inputs,
- const framework::VariableNameMap& outputs,
- const framework::AttributeMap& attrs)
- : OperatorBase(type, inputs, outputs, attrs) {
- rnn::InitArgument(kArgName, &arg_, *this);
- alg_.Init(&arg_, &stepnet_);
-}
-
-class RecurrentAlgorithmProtoAndCheckerMaker
- : public framework::OpProtoAndCheckerMaker {
+
+ private:
+ StepScopes CreateStepScopes(const framework::Scope &scope,
+ size_t seq_len) const {
+ auto *var = scope.FindVar(Input(kStepScopes));
+ PADDLE_ENFORCE(var != nullptr);
+    return StepScopes(scope, var->GetMutable<StepScopeVar>(),
+                      Attr<bool>(kIsTrain), seq_len, true /*is_backward*/);
+ }
+
+  std::unordered_set<std::string> List2Set(
+      const std::vector<std::string> &list) const {
+    std::unordered_set<std::string> local_var_name_set;
+ local_var_name_set.reserve(list.size());
+ for (auto &each : list) {
+ local_var_name_set.insert(each);
+ }
+ return local_var_name_set;
+ }
+
+ std::unordered_set LocalVarNames(
+ const framework::Scope &scope) const {
+ return this->List2Set(scope.GetAllNames(false));
+ }
+ static std::vector GradVarLists(
+ const std::vector &var_names) {
+ std::vector retv;
+ retv.reserve(var_names.size());
+ std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv),
+ framework::GradVarName);
+ return retv;
+ }
+};
+
+class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
- RecurrentAlgorithmProtoAndCheckerMaker(framework::OpProto* proto,
- framework::OpAttrChecker* op_checker)
+ RecurrentOpProtoMaker(framework::OpProto *proto,
+ framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
- const auto& name = RecurrentOp::kArgName;
- // inputs and outputs stored in proto
- AddInput(name.inlinks,
- "the inputs that need to be segmented for each step.")
+ AddInput(kInputs, "rnn inputs").AsDuplicable();
+ AddInput(kInitialStates, "rnn initial states").AsDuplicable();
+ AddInput(kParameters,
+ "Parameters are used by step block as its input. However, the "
+ "inputs is not a sequence tensor. Every time step, each operator "
+ "in step block just use the parameter directly")
.AsDuplicable();
- AddInput(name.initial_states, "variables to initialize states.")
+ AddOutput(kOutputs,
+ "The output sequence of RNN. The sequence length must be same")
.AsDuplicable();
+ AddOutput(kStepScopes,
+ "StepScopes contains all local variables in each time step.");
+ AddAttr>(kExStates,
+ string::Sprintf(
+ R"DOC(The ex-state variable names.
+The ex-state means the state value in the ex-timestep or the previous time step
+[%s, %s, %s] must be the same order)DOC",
+ kExStates, kStates, kInitStateGrads));
+ AddAttr>(
+ kStates,
+ string::Sprintf(
+ "The state variable names. [%s, %s, %s] must be the same order",
+ kExStates, kStates, kInitStateGrads));
+ AddAttr(kStepBlock,
+ "The step block inside RNN");
+ AddAttr(kReverse, R"DOC(Calculate RNN reversely or not.
+By default reverse=False
- AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
- .AsDuplicable();
- AddOutput(name.step_scopes, "step scopes");
+Assume the input data is [A, B, C, D]
+
+if reverse is False:
+ the computation of RNN is like
+ A B C D
+ | | | |
+ v v v v
+ rnn -----> rnn -----> rnn ----> rnn
+ | | | |
+ v v v v
+ o o o o
+
+if reverse is True
+ the computation of RNN is like
+ A B C D
+ | | | |
+ v v v v
+ rnn <----- rnn <----- rnn <---- rnn
+ | | | |
+ v v v v
+ o o o o
+)DOC").SetDefault(false);
+ AddAttr(kIsTrain, "").SetDefault(true);
+ AddComment(R"DOC(Static Length Recurrent Operator
+
+The static length recurrent operator can only operate on fix sized sequence
+data, i.e. in each mini-batch, the sequence length of all inputs are same.
+)DOC");
+ }
+};
+
+class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
- // Attributes stored in AttributeMap
- AddAttr>(name.ex_states, "names of pre-states");
- AddAttr>(name.states, "names of states");
+ protected:
+ virtual std::unique_ptr Apply() const {
+ auto *grad = new framework::OpDescBind();
+ grad->SetType("recurrent_grad");
+ for (auto &input_param : this->InputNames()) {
+ grad->SetInput(input_param, this->Input(input_param));
+ grad->SetOutput(framework::GradVarName(input_param),
+ this->InputGrad(input_param));
+ }
+
+ for (auto &output_param : this->OutputNames()) {
+ if (output_param == kStepScopes) {
+ grad->SetInput(output_param, this->Output(output_param));
+ grad->SetInput(framework::GradVarName(output_param),
+ this->Output(output_param));
+ } else {
+ grad->SetInput(output_param, this->Output(output_param));
+ grad->SetInput(framework::GradVarName(output_param),
+ this->OutputGrad(output_param));
+ }
+ }
+ grad->SetAttrMap(this->Attrs());
+ grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
- AddComment("This is a recurrent group operator.");
+ return std::unique_ptr(grad);
}
};
-void RecurrentGradientAlgorithm::Run(
- const Scope& scope, const platform::DeviceContext& dev_ctx) const {
- auto* input0 = scope.FindVar(arg_->inlinks[0]);
- PADDLE_ENFORCE_NOT_NULL(input0);
- size_t seq_len = input0->GetMutable