diff --git a/.gitignore b/.gitignore
index 020d3f0c303f7d850f4ec9c0efe58ab2d57dce2e..ac56a3320ec85769d2c87c072512f5217eca0c24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ cmake_install.cmake
paddle/.timestamp
python/paddlepaddle.egg-info/
paddle/pybind/pybind.h
+python/paddle/version.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ba29d6bbcc4acf9538973562df55b823e6428ef..b309ff37e52b4fd28b14925bdd7e3740e1e2fa47 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,10 +16,14 @@ cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
include(system)
project(paddle CXX C Go)
+message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
+message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})
find_package(Sphinx)
if(NOT CMAKE_CROSSCOMPILING)
@@ -56,6 +60,7 @@ option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
+option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
diff --git a/README.md b/README.md
index db0fbd88b250cdc2a3cc77521cc1c2cea77c6e87..bbb2d498589092de78b21a662f03171a0721f840 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
examples:
- Optimized math operations through SSE/AVX intrinsics, BLAS libraries
- (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+ (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
- Highly optimized recurrent networks which can handle **variable-length**
sequence without padding.
- Optimized local and distributed training for models with high dimensional
diff --git a/RELEASE.cn.md b/RELEASE.cn.md
index 5deaf230a8f5dd3089993f0fc79b9460fd049750..494c59730dd3c2830514e8924aa3d59a34ac412e 100644
--- a/RELEASE.cn.md
+++ b/RELEASE.cn.md
@@ -1,3 +1,62 @@
+# v0.11.0版本
+
+## PaddlePaddle Fluid
+
+- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*。Fluid 是设计用来让用户像PyTorch和TensorFlow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-then-else 或者 for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如:
+
+ https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
+
+- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将把`Executor`提升和优化为一个类似GDB的调试器,并可能提供一些编译器:这些编译器会读取一个上文所描述的应用,然后将其编译成等价的
+源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。
+
+
+## 新特点
+
+* 发布 `PaddlePaddle Fluid`。
+* 增加了用于模型预测的C-API。
+* 用Fluid API实现了一个简单的GAN的例子。
+* 增加了关于性能调优的文档。
+* 为`paddle.v2.dataset`下载数据集提供了重试机制.
+* C++中使用protobuf-lite替换protobuf减少了二进制的大小。
+* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment).
+* 基于Bazel API利用cmake实现了一组新的构建系统函数库。
+* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库.
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
+  - 完成了 11个 MKL-DNN 层: Convolution, Fully connected, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。
+  - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogLeNet
+ - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。
+* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign)
+* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod)
+* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance)
+* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq)
+* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score)
+* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice)
+* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv)
+* 增加移动端友好的网页
+
+## 改进
+
+* 使用一个Python`whl`包即可安装.
+* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。
+* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。
+* 删除了有一些bug的BarrierStat。
+* 清理和删除了paddle::Parameter中未使用的函数。
+* 删除了ProtoDataProvider。
+* Huber loss同时支持回归和分类。
+* 为sequence pooling 层增加`stride`参数。
+* v2 API自动使用cudnn batch normalization。
+* 可以使用一个固定的参数名共享BN层的参数。
+* 2D convolution operation支持variable-dimension input特性。
+* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。
+* 优化网页导航。
+
+## 错误修复
+
+* 修复ROI pooling的Bug. cc9a761
+* 修复当label是dense vector时AUC变成0的问题。#5274
+* 修复WarpCTC 层的Bug.
+
+
# v0.10.0版本
我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。
diff --git a/RELEASE.md b/RELEASE.md
index 146f7afa7dfbc152500b82fde28445ae3155c16c..5a62c955131007c9f3329d162c20d1b462550019 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,75 @@
+# Release v0.11.0
+
+## PaddlePaddle Fluid
+
+- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is
+  designed to allow users to program like PyTorch and TensorFlow Eager Execution.
+  In these systems, there is no longer the concept of a *model*: applications
+  do not include a symbolic description of a graph of operators or a sequence
+  of layers. Instead, applications look exactly like a usual program that
+  describes a process of training or inference. The difference between
+  Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's
+  control-flow constructs such as `if-then-else` or `for`. Instead, Fluid provides their
+  C++ implementations and exposes them to Python through the `with` statement. For example:
+
+ https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
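+
+  A minimal sketch in this style (simplified and illustrative; names such as
+  `data_array`, `out_array` and `array_len` are placeholders, and the linked
+  test contains the actual, runnable code):
+
+  ```python
+  import paddle.v2.fluid.layers as layers
+
+  i = layers.zeros(shape=[1], dtype='int64')
+  cond = layers.less_than(x=i, y=array_len)
+  loop = layers.While(cond=cond)
+  with loop.block():                 # builds a C++ while operator, not a Python loop
+      d = layers.array_read(array=data_array, i=i)
+      layers.array_write(d, i=i, array=out_array)
+      i = layers.increment(x=i, in_place=True)
+      layers.less_than(x=i, y=array_len, cond=cond)   # update the loop condition
+  ```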
+
+- In 0.11.0, we provide a C++ class `Executor` to run a Fluid program.
+Executor works like an interpreter. In future versions, we will improve
+`Executor` into a debugger like GDB, and we might provide some compilers
+that, for example, take an application like the above one and output
+an equivalent C++ source program, which can be compiled using
+[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html)
+to generate binaries that use CUDA, or using
+[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries
+to generate binaries that make full use of Intel CPUs.
+
+## New Features
+
+* Release `PaddlePaddle Fluid`.
+* Add a C-API for model inference.
+* Use the Fluid API to create a simple GAN demo.
+* Add a developer guide about performance tuning.
+* Add retry when downloading `paddle.v2.dataset` datasets.
+* Link protobuf-lite instead of protobuf in C++ to reduce the binary size.
+* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released.
+* Add a new style of cmake functions for Paddle, based on the Bazel API.
+* Automatically download and compile the Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when building with `WITH_MKL=ON`.
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
+  - Complete 11 MKL-DNN layers: Convolution, Fully connected, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN.
+  - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogLeNet.
+ - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML.
+* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign).
+* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod).
+* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance).
+* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq).
+* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score).
+* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice).
+* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv).
+* Add mobile friendly webpages.
+
+## Improvements
+
+* Build and install using a single `whl` package.
+* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标).
+* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices.
+* Remove buggy BarrierStat.
+* Clean and remove unused functions in paddle::Parameter.
+* Remove ProtoDataProvider.
+* Huber loss supports both regression and classification.
+* Add the `stride` parameter for sequence pooling layers.
+* Enable the v2 API to use cuDNN batch normalization automatically.
+* The BN layer's parameters can be shared by specifying a fixed parameter name.
+* Support variable-dimension input feature for 2D convolution operation.
+* Refine the CUDA part of cmake to automatically detect the GPU architecture.
+* Improve website navigation.
+
+## Bug Fixes
+
+* Fix bug in ROI pooling. cc9a761
+* Fix AUC being zero when the label is a dense vector. #5274
+* Fix bug in WarpCTC layer.
+
# Release v0.10.0
We are glad to release version 0.10.0. In this version, we are happy to release the new
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index 16c2390fd31bf1c79f29735fb98180d3f7302eb2..8ee7fd28c58f2a2bcb82040eb824a37062bd4e9c 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -2,27 +2,25 @@
Machine:
-- Server
- - Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
-- Laptop
- - DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD
- - i5 MacBook Pro (Retina, 13-inch, Early 2015)
-- Desktop
- - i7-6700k
+- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+- Laptop: TBD
System: CentOS release 6.3 (Final), Docker 1.12.1.
-PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
-- MKL-DNN tag v0.11
-- MKLML 2018.0.1.20171007
-- OpenBLAS v0.2.20
-(TODO: will rerun after 0.11.0)
+PaddlePaddle: (TODO: will rerun after 0.11.0)
+- paddlepaddle/paddle:latest (for MKLML and MKL-DNN)
+ - MKL-DNN tag v0.11
+ - MKLML 2018.0.1.20171007
+- paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+ - OpenBLAS v0.2.20
On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
## Benchmark Model
### Server
+
+#### Training
Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Input image size - 3 * 224 * 224, Time: images/second
@@ -35,9 +33,7 @@ Input image size - 3 * 224 * 224, Time: images/second
| MKLML | 12.12 | 13.70 | 16.18 |
| MKL-DNN | 28.46 | 29.83 | 30.44 |
-
-chart on batch size 128
-TBD
+<img src="figs/vgg-cpu-train.png" width="500">
- ResNet-50
@@ -47,9 +43,7 @@ TBD
| MKLML | 32.52 | 31.89 | 33.12 |
| MKL-DNN | 81.69 | 82.35 | 84.08 |
-
-chart on batch size 128
-TBD
+<img src="figs/resnet-cpu-train.png" width="500">
- GoogLeNet
@@ -59,10 +53,35 @@ TBD
| MKLML | 128.46| 137.89| 158.63 |
| MKL-DNN | 250.46| 264.83| 269.50 |
-chart on batch size 128
-TBD
+<img src="figs/googlenet-cpu-train.png" width="500">
+
+#### Inference
+Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
+Input image size - 3 * 224 * 224, Time: images/second
+- VGG-19
+
+| BatchSize | 1 | 2 | 4 | 8 | 16 |
+|-----------|-------|-------|-------|-------|-------|
+| OpenBLAS | 1.07 | 1.08 | 1.06 | 0.88 | 0.65 |
+| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 |
+| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
+
+- ResNet-50
+
+| BatchSize | 1 | 2 | 4 | 8 | 16 |
+|-----------|-------|--------|--------|--------|--------|
+| OpenBLAS | 3.35 | 3.19 | 3.09 | 2.55 | 1.96 |
+| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 |
+| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
+
+
+- GoogLeNet
+
+| BatchSize | 1 | 2 | 4 | 8 | 16 |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS | 12.04 | 11.31 | 10.00 | 9.07 | 4.34 |
+| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
+| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
+
### Laptop
TBD
-### Desktop
-TBD
diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png
new file mode 100644
index 0000000000000000000000000000000000000000..c3f67faf096fe9b45dd815f294b41679dc7c9e54
Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ
diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png
new file mode 100644
index 0000000000000000000000000000000000000000..b96ecd5ff940c0d000613b1ed1f11fb16796cf47
Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ
diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png
new file mode 100644
index 0000000000000000000000000000000000000000..f830ca6a87d10b72a5113636dd5686ab25a2e864
Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ
diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkldnn_infer.sh
index 03a76c0540092501b33e1fdd430ae4e754744fd0..d795bcab1b7d098295066f79189d17e8299d28fb 100755
--- a/benchmark/paddle/image/run_mkldnn_infer.sh
+++ b/benchmark/paddle/image/run_mkldnn_infer.sh
@@ -4,7 +4,7 @@ function clock_to_seconds() {
hours=`echo $1 | awk -F ':' '{print $1}'`
mins=`echo $1 | awk -F ':' '{print $2}'`
secs=`echo $1 | awk -F ':' '{print $3}'`
- echo `bc -l <<< "$secs + $mins * 60 + $hours * 3600"`
+ echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
}
function infer() {
@@ -58,9 +58,9 @@ function infer() {
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end`
- fps=`bc <<< "scale = 2; 1280 / ($end_sec - $start_sec)"`
+ fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
- echo "FPS: $fps images/sec" >> ${log}
+ echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
}
if [ ! -f "train.list" ]; then
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index b21fc43904d9aafe9f7d019dfbe5b1c0d3f9e2d6..6320b17520a687f88993b6f464d9115838b0f96b 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -3,7 +3,7 @@
# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
#
# If any cblas implementation found, the following variable will be set.
-# CBLAS_PROVIDER # one of MKLML, ATLAS, OPENBLAS, REFERENCE
+# CBLAS_PROVIDER # one of MKLML, OPENBLAS, REFERENCE
# CBLAS_INC_DIR # the include directory for cblas.
# CBLAS_LIBS # a list of libraries should be linked by paddle.
# # Each library should be full path to object file.
@@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
set(CBLAS_INC_DIR ${MKLML_INC_DIR})
set(CBLAS_LIBRARIES ${MKLML_LIB})
- add_definitions(-DPADDLE_USE_MKLML)
+ add_definitions(-DPADDLE_WITH_MKLML)
add_definitions(-DLAPACK_FOUND)
message(STATUS "Found cblas and lapack in MKLML "
@@ -25,42 +25,6 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
return()
endif()
-## Then find atlas.
-set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
-set(ATLAS_INCLUDE_SEARCH_PATHS
- ${ATLAS_ROOT}/include
- /usr/include
- /usr/include/atlas)
-set(ATLAS_LIB_SEARCH_PATHS
- ${ATLAS_ROOT}/lib
- /usr/lib
- /usr/lib/blas/atlas
- /usr/lib/atlas
- /usr/lib/atlas-base # special for ubuntu 14.04.
- )
-find_path(ATLAS_INC_DIR NAMES cblas.h
- PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
- PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
- PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
- PATHS ${ATLAS_LIB_SEARCH_PATHS})
-
-if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
- set(CBLAS_FOUND ON)
- set(CBLAS_PROVIDER ATLAS)
- set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
- set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
-
- add_definitions(-DPADDLE_USE_ATLAS)
- add_definitions(-DLAPACK_FOUND)
-
- message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
- message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
- return()
-endif()
-
## Then find openblas.
set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
set(OPENBLAS_INCLUDE_SEARCH_PATHS
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e550ec285668ea25757eeee9e7c5dc48fc9d339d..5c6bcfde76a1201f792d04766d698db8cd395a49 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
endif(WITH_DOUBLE)
+if(WITH_ARM_FP16)
+ add_definitions(-DPADDLE_ARM_FP16)
+ add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING)
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
index ac456933bd2260b2bbde2de78c486a5c0a1f5a96..aec51410b33669f8a549f2eca193cc6aa2d07a13 100644
--- a/cmake/external/cares.cmake
+++ b/cmake/external/cares.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
UPDATE_COMMAND ""
CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
BUILD_IN_SOURCE 1
- BUILD_COMMAND make
+ BUILD_COMMAND make -j8
INSTALL_COMMAND make install
)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index fc52d339d7a336b44c97f2e0a9fc8d6604854365..5d24caebdcc5a28823164d718fb1628be5c4179d 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -67,5 +67,5 @@ ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
-add_definitions(-DPADDLE_USE_MKLDNN)
+add_definitions(-DPADDLE_WITH_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 4c4f59656dae68739f2f07f3febd510e727fe2dd..97857a686b38d935b19f510ecdcb66bcca91fe03 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -114,11 +114,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
- ADD_LIBRARY(cblas SHARED ${dummyfile})
-ELSE()
- ADD_LIBRARY(cblas STATIC ${dummyfile})
-ENDIF()
+ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
IF(NOT ${CBLAS_FOUND})
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
index 25c1dd00b9cbb3ab647e04cdc2b4c27c552a2332..e6f632e1a5b9c4b50b7c6aa96a120030bd6ce338 100644
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -7,3 +7,4 @@ API
v2/model_configs.rst
v2/data.rst
v2/run_logic.rst
+ v2/fluid.rst
diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst
index eca3ce03bcdc599edca802d8dfca48d4f28275a2..5317e66b64bbd85c61f19700a9d2c1d239dee573 100644
--- a/doc/api/v2/config/activation.rst
+++ b/doc/api/v2/config/activation.rst
@@ -99,3 +99,10 @@ STanh
.. automodule:: paddle.v2.activation
:members: STanh
:noindex:
+
+SoftSign
+========
+
+.. automodule:: paddle.v2.activation
+ :members: SoftSign
+ :noindex:
diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst
new file mode 100644
index 0000000000000000000000000000000000000000..43fc19dc492bbc119f2356034b81c65e443db2fa
--- /dev/null
+++ b/doc/api/v2/fluid.rst
@@ -0,0 +1,18 @@
+======================
+Fluid
+======================
+
+.. toctree::
+ :maxdepth: 1
+
+ fluid/layers.rst
+ fluid/data_feeder.rst
+ fluid/executor.rst
+ fluid/initializer.rst
+ fluid/evaluator.rst
+ fluid/nets.rst
+ fluid/optimizer.rst
+ fluid/param_attr.rst
+ fluid/profiler.rst
+ fluid/regularizer.rst
+
diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa
--- /dev/null
+++ b/doc/api/v2/fluid/data_feeder.rst
@@ -0,0 +1,9 @@
+===========
+DataFeeder
+===========
+
+DataFeeder
+-----------
+.. automodule:: paddle.v2.fluid.data_feeder
+ :members: DataFeeder
+ :noindex:
diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a23f3301d0331e0ea3733f06444515eb4680cd31
--- /dev/null
+++ b/doc/api/v2/fluid/evaluator.rst
@@ -0,0 +1,9 @@
+===========
+Evaluator
+===========
+
+Evaluator
+-----------
+.. automodule:: paddle.v2.fluid.evaluator
+ :members: Evaluator
+ :noindex:
diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3a283538c120cfa1ef646c390bb71c6251c23675
--- /dev/null
+++ b/doc/api/v2/fluid/executor.rst
@@ -0,0 +1,9 @@
+===========
+Executor
+===========
+
+Executor
+-----------
+.. automodule:: paddle.v2.fluid.executor
+ :members: Executor
+ :noindex:
diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8f587837e9873370722062404f511654a9460587
--- /dev/null
+++ b/doc/api/v2/fluid/initializer.rst
@@ -0,0 +1,50 @@
+===========
+Initializer
+===========
+
+
+
+Initializer
+-----------
+.. automodule:: paddle.v2.fluid.initializer
+ :members: Initializer
+ :noindex:
+
+
+
+ConstantInitializer
+-------------------
+.. automodule:: paddle.v2.fluid.initializer
+ :members: ConstantInitializer
+ :noindex:
+
+
+
+UniformInitializer
+------------------
+.. automodule:: paddle.v2.fluid.initializer
+ :members: UniformInitializer
+ :noindex:
+
+
+
+NormalInitializer
+-----------------
+.. automodule:: paddle.v2.fluid.initializer
+ :members: NormalInitializer
+ :noindex:
+
+
+XavierInitializer
+-----------------
+.. automodule:: paddle.v2.fluid.initializer
+ :members: XavierInitializer
+ :noindex:
+
+
+MSRAInitializer
+---------------
+.. automodule:: paddle.v2.fluid.initializer
+ :members: MSRAInitializer
+ :noindex:
+
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
new file mode 100644
index 0000000000000000000000000000000000000000..89e5fec13bf9062dc7a7187b1334c8f5486a980b
--- /dev/null
+++ b/doc/api/v2/fluid/layers.rst
@@ -0,0 +1,302 @@
+==========
+Layers
+==========
+
+
+fc
+---
+.. autofunction:: paddle.v2.fluid.layers.fc
+ :noindex:
+
+embedding
+---------
+.. autofunction:: paddle.v2.fluid.layers.embedding
+ :noindex:
+
+dynamic_lstm
+------------
+.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+ :noindex:
+
+data
+---------
+.. autofunction:: paddle.v2.fluid.layers.data
+ :noindex:
+
+mean
+---------
+.. autofunction:: paddle.v2.fluid.layers.mean
+ :noindex:
+
+mul
+---------
+.. autofunction:: paddle.v2.fluid.layers.mul
+ :noindex:
+
+elementwise_add
+---------------
+.. autofunction:: paddle.v2.fluid.layers.elementwise_add
+ :noindex:
+
+elementwise_div
+---------------
+.. autofunction:: paddle.v2.fluid.layers.elementwise_div
+ :noindex:
+
+
+dropout
+---------
+.. autofunction:: paddle.v2.fluid.layers.dropout
+ :noindex:
+
+
+reshape
+---------
+.. autofunction:: paddle.v2.fluid.layers.reshape
+ :noindex:
+
+
+sigmoid
+---------
+.. autofunction:: paddle.v2.fluid.layers.sigmoid
+ :noindex:
+
+
+scale
+---------
+.. autofunction:: paddle.v2.fluid.layers.scale
+ :noindex:
+
+
+transpose
+---------
+.. autofunction:: paddle.v2.fluid.layers.transpose
+ :noindex:
+
+
+sigmoid_cross_entropy_with_logits
+---------------------------------
+.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
+ :noindex:
+
+
+cast
+---------
+.. autofunction:: paddle.v2.fluid.layers.cast
+ :noindex:
+
+
+concat
+---------
+.. autofunction:: paddle.v2.fluid.layers.concat
+ :noindex:
+
+
+sums
+---------
+.. autofunction:: paddle.v2.fluid.layers.sums
+ :noindex:
+
+
+linear_chain_crf
+----------------
+.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+ :noindex:
+
+
+assign
+---------
+.. autofunction:: paddle.v2.fluid.layers.assign
+ :noindex:
+
+
+split_lod_tensor
+----------------
+.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+ :noindex:
+
+
+merge_lod_tensor
+----------------
+.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+ :noindex:
+
+cos_sim
+---------
+.. autofunction:: paddle.v2.fluid.layers.cos_sim
+ :noindex:
+
+
+cross_entropy
+-------------
+.. autofunction:: paddle.v2.fluid.layers.cross_entropy
+ :noindex:
+
+
+
+square_error_cost
+-----------------
+.. autofunction:: paddle.v2.fluid.layers.square_error_cost
+ :noindex:
+
+
+accuracy
+---------
+.. autofunction:: paddle.v2.fluid.layers.accuracy
+ :noindex:
+
+
+sequence_conv
+-------------
+.. autofunction:: paddle.v2.fluid.layers.sequence_conv
+ :noindex:
+
+
+conv2d
+---------
+.. autofunction:: paddle.v2.fluid.layers.conv2d
+ :noindex:
+
+
+sequence_pool
+-------------
+.. autofunction:: paddle.v2.fluid.layers.sequence_pool
+ :noindex:
+
+
+pool2d
+---------
+.. autofunction:: paddle.v2.fluid.layers.pool2d
+ :noindex:
+
+
+batch_norm
+----------
+.. autofunction:: paddle.v2.fluid.layers.batch_norm
+ :noindex:
+
+
+beam_search_decode
+------------------
+.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
+ :noindex:
+
+
+lstm
+---------
+.. autofunction:: paddle.v2.fluid.layers.lstm
+ :noindex:
+
+
+lod_rank_table
+--------------
+.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
+ :noindex:
+
+
+max_sequence_len
+----------------
+.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
+ :noindex:
+
+
+topk
+---------
+.. autofunction:: paddle.v2.fluid.layers.topk
+ :noindex:
+
+
+lod_tensor_to_array
+-------------------
+.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+ :noindex:
+
+
+
+array_to_lod_tensor
+-------------------
+.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+ :noindex:
+
+
+
+
+fill_constant
+-------------
+.. autofunction:: paddle.v2.fluid.layers.fill_constant
+ :noindex:
+
+
+
+fill_constant_batch_size_like
+-----------------------------
+.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+ :noindex:
+
+
+ones
+---------
+.. autofunction:: paddle.v2.fluid.layers.ones
+ :noindex:
+
+
+zeros
+---------
+.. autofunction:: paddle.v2.fluid.layers.zeros
+ :noindex:
+
+
+increment
+---------
+.. autofunction:: paddle.v2.fluid.layers.increment
+ :noindex:
+
+
+array_write
+-----------
+.. autofunction:: paddle.v2.fluid.layers.array_write
+ :noindex:
+
+
+
+create_array
+------------
+.. autofunction:: paddle.v2.fluid.layers.create_array
+ :noindex:
+
+
+less_than
+---------
+.. autofunction:: paddle.v2.fluid.layers.less_than
+ :noindex:
+
+
+array_read
+----------
+.. autofunction:: paddle.v2.fluid.layers.array_read
+ :noindex:
+
+
+shrink_memory
+-------------
+.. autofunction:: paddle.v2.fluid.layers.shrink_memory
+ :noindex:
+
+
+array_length
+------------
+.. autofunction:: paddle.v2.fluid.layers.array_length
+ :noindex:
+
+
+conv2d_transpose
+----------------
+.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+ :noindex:
+
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2c3d075422de29c96e25458e831133a30270dd39
--- /dev/null
+++ b/doc/api/v2/fluid/nets.rst
@@ -0,0 +1,22 @@
+===========
+Nets
+===========
+
+simple_img_conv_pool
+--------------------
+.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
+ :noindex:
+
+
+img_conv_group
+--------------
+.. autofunction:: paddle.v2.fluid.nets.img_conv_group
+ :noindex:
+
+
+sequence_conv_pool
+------------------
+.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
+ :noindex:
+
+
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..233762fcdfb39e592740adef6721a556fae3feef
--- /dev/null
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -0,0 +1,54 @@
+===========
+Optimizer
+===========
+
+Optimizer
+-----------
+.. automodule:: paddle.v2.fluid.optimizer
+ :members: Optimizer
+ :noindex:
+
+
+SGDOptimizer
+------------
+.. automodule:: paddle.v2.fluid.optimizer
+ :members: SGDOptimizer
+ :noindex:
+
+
+
+MomentumOptimizer
+-----------------
+.. automodule:: paddle.v2.fluid.optimizer
+ :members: MomentumOptimizer
+ :noindex:
+
+
+
+AdagradOptimizer
+----------------
+.. automodule:: paddle.v2.fluid.optimizer
+ :members: AdagradOptimizer
+ :noindex:
+
+
+AdamOptimizer
+-------------
+.. automodule:: paddle.v2.fluid.optimizer
+ :members: AdamOptimizer
+ :noindex:
+
+
+AdamaxOptimizer
+---------------
+.. automodule:: paddle.v2.fluid.optimizer
+ :members: AdamaxOptimizer
+ :noindex:
+
+
+DecayedAdagradOptimizer
+-----------------------
+.. automodule:: paddle.v2.fluid.optimizer
+ :members: DecayedAdagradOptimizer
+ :noindex:
+
diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50
--- /dev/null
+++ b/doc/api/v2/fluid/param_attr.rst
@@ -0,0 +1,11 @@
+===========
+ParamAttr
+===========
+
+
+
+ParamAttr
+-----------
+.. automodule:: paddle.v2.fluid.param_attr
+ :members: ParamAttr
+ :noindex:
diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7d4042d1f41c12c4a551ba6576559d612116872a
--- /dev/null
+++ b/doc/api/v2/fluid/profiler.rst
@@ -0,0 +1,10 @@
+===========
+Profiler
+===========
+
+
+
+Profiler
+-----------
+.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
+ :noindex:
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3af2b07d2ae55d99df705fbf1ad2402eee05c435
--- /dev/null
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -0,0 +1,25 @@
+===========
+Regularizer
+===========
+
+WeightDecayRegularizer
+----------------------
+.. automodule:: paddle.v2.fluid.regularizer
+ :members: WeightDecayRegularizer
+ :noindex:
+
+
+L2DecayRegularizer
+------------------
+.. automodule:: paddle.v2.fluid.regularizer
+ :members: L2DecayRegularizer
+ :noindex:
+
+
+
+L1DecayRegularizer
+------------------
+.. automodule:: paddle.v2.fluid.regularizer
+    :members: L1DecayRegularizer
+    :noindex:
+
+
diff --git a/doc/design/evaluator.md b/doc/design/evaluator.md
index a62d75ffef14962aec8c7587e172d78dfe0cb4be..11cc129d56905a9ee666da92fbe6f8559c6d325a 100644
--- a/doc/design/evaluator.md
+++ b/doc/design/evaluator.md
@@ -1,22 +1,22 @@
## Evaluator Design
-### The Problem
+### Problem Statement
-During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted.
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy and precision. In the operator-based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics over every N passes/batches, as the user requires.
### Evaluator Design
-Currently, every operation is expressed in the graph. we divide the evaluator process into three steps.
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
1. Initialize the metric state and add it into the block.
-2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once.
+2. Calculate the metrics of interest for every mini-batch. A single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for one minibatch of data when run once.
3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
### Implementation
-This design is shown in python API.
-Each metric operator need to caculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass.
+This design is shown in the Python API.
+Each metric operator needs to calculate the metric statistic and return the batch-aware states. The Python side is responsible for accumulating the states for each pass.
```python
diff --git a/doc/design/fluid-compiler.graffle b/doc/design/fluid-compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d
Binary files /dev/null and b/doc/design/fluid-compiler.graffle differ
diff --git a/doc/design/fluid-compiler.png b/doc/design/fluid-compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb
Binary files /dev/null and b/doc/design/fluid-compiler.png differ
diff --git a/doc/design/fluid.md b/doc/design/fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..585dc8ef39c0cfb30f470d79f7b27a59ceb5e940
--- /dev/null
+++ b/doc/design/fluid.md
@@ -0,0 +1,122 @@
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe. However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution, in that it describes the "process" of training or inference rather than a model. In fact, in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in this idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+| Existed since | model as sequence of layers | model as graph of operators | No model |
+|--|--|--|--|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+ m = read_minibatch()
+ forward({input=x, data=m}, minimize=c)
+ backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
+
+This problem of being hard to debug and to iterate quickly on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems. Using PyTorch, we would write the above program as follows:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+ m = read_minibatch()
+ x = m["image"]
+ l = m["label"]
+ f = layer.fc(x, W)
+ s = layer.softmax(f)
+ c = layer.mse(l, s)
+ backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is moving the model configuration part (the first part) into the training loop. This change would allow mistakes in the model configuration to be reported where they actually appear in the programming block. This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also gives Fluid the flexibility to define different non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator. A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+ m = read_minibatch()
+ x = m["sentence"]
+    for t in xrange(x.len()):
+ h[t] = the_step(x[t])
+```
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+ m = read_minibatch()
+ x = m["sentence"]
+ rnn = layers.While(...)
+ with rnn.block():
+ h[t] = the_step(input[t])
+```
+
+An actual Fluid example is described [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine. For a programming language, if it provides if-then-else and loop, it is Turing complete. From the above examples, Fluid seems to be Turing complete; however, it is worth noticing that there is a slight difference between the `if-then-else` of Fluid and that of a programming language: the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. It has not been researched in depth whether this is equivalent to the `if-then-else` in programming languages that makes them Turing complete. Based on a conversation with [Yuan Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
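+
+A hypothetical sketch of this behavior is shown below. Names such as `IfElse`,
+`input` and `output` are illustrative; the design doc linked above describes the
+intended interface, so treat this as the shape of the API rather than exact code:
+
+```python
+import paddle.v2.fluid.layers as layers
+
+x = layers.data(name='x', shape=[1], dtype='float32')
+label = layers.data(name='label', shape=[1], dtype='int64')
+limit = layers.fill_constant_batch_size_like(input=label, shape=[1], dtype='int64', value=5)
+
+cond = layers.less_than(x=label, y=limit)
+ie = layers.IfElse(cond)
+with ie.true_block():
+    # runs on the rows of the mini-batch where cond is True
+    x_true = ie.input(x)
+    ie.output(layers.scale(x=x_true, scale=2.0))
+with ie.false_block():
+    # runs on the remaining rows
+    x_false = ie.input(x)
+    ie.output(x_false)
+out = ie()   # the partial results are merged back into one mini-batch
+```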
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
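+
+A minimal sketch of this interpreter-style execution (illustrative; argument
+names and feeding details may differ slightly across versions):
+
+```python
+import numpy
+import paddle.v2.fluid as fluid
+
+# Building the program fills in a ProgramDesc protobuf message ...
+x = fluid.layers.data(name='x', shape=[1], dtype='float32')
+y = fluid.layers.scale(x=x, scale=2.0)
+
+# ... and the C++ Executor then interprets that ProgramDesc.
+exe = fluid.Executor(fluid.CPUPlace())
+out, = exe.run(fluid.default_main_program(),
+               feed={'x': numpy.ones((1, 1), dtype='float32')},
+               fetch_list=[y])
+```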
+
+Fluid is moving towards the direction of a compiler, which is explained in more detail later in this article.
+
+## Backward Compatibility of Fluid
+
+Despite all the advantages of removing the concept of a *model*, hardware manufacturers might still prefer the existence of such a concept, since it would be easier for them to support multiple frameworks all at once and they could run a trained model during inference. For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph). Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators. The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
+
+## Towards a Deep Learning Language and the Compiler
+
+We can change the `if-then-else` and loop structure a little bit in the above Fluid example programs, to make it into a new programming language, different than Python.
+
+Even if we do not invent a new language, as long as we get the `ProgramDesc` message filled in, we can write a transpiler that translates each invocation of an operator into a C++ call to a kernel function of that operator. For example, a transpiler that weaves in the CUDA kernels outputs an NVIDIA-friendly C++ program, which can be built using `nvcc`. Another transpiler could generate MKL-friendly code that should be built using `icc` from Intel. More interestingly, we can translate a Fluid program into its distributed version of two `ProgramDesc` messages, one for running on the trainer process, and the other one for the parameter server. For more details of the last example, the [concurrent programming design](concurrent_programming.md) document would be a good pointer. The following figure explains the proposed two-stage process:
+
+![](fluid-compiler.png)
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md
index 2b4f921ae93c3b443ed62a28b1fa9fbda14f73ab..d9fe7d6bbb0eeb73fcdca3ee749a4f10bcdda682 100644
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -53,7 +53,7 @@ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the
The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
-[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
+[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
## Distributed Training Architecture
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
index 62ff8f3229bbbb5bc82e4da29259baffc30c2c87..14c081ea84282e52a2e36475c3c0ea755122d154 100644
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -5,8 +5,9 @@ PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Vers
PaddlePaddle每次发新的版本,遵循以下流程:
1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0`
-2. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。
-3. 对这个版本的提交,做如下几个操作:
+1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。
+1. 对这个版本的提交,做如下几个操作:
+ * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
* 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
* 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。
* 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性
@@ -20,9 +21,9 @@ PaddlePaddle每次发新的版本,遵循以下流程:
pip install twine
twine upload dist/[package to upload]
```
-4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面
-6. 协同完成Release Note的书写
+1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
+1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面
+1. 协同完成Release Note的书写
需要注意的是:
@@ -30,7 +31,7 @@ PaddlePaddle每次发新的版本,遵循以下流程:
* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。
* 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
-# PaddlePaddle 分支规范
+## PaddlePaddle 分支规范
PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。
@@ -47,11 +48,11 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。
-# PaddlePaddle回归测试列表
+## PaddlePaddle回归测试列表
本列表说明PaddlePaddle发版之前需要测试的功能点。
-## PaddlePaddle Book中所有章节
+### PaddlePaddle Book中所有章节
PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
new file mode 100644
index 0000000000000000000000000000000000000000..fd23dc211a35fdc9d87bc9233fcf4e90254da748
--- /dev/null
+++ b/doc/design/support_new_device.md
@@ -0,0 +1,248 @@
+# Design Doc: Supporting new Device/Library
+
+## Background
+
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator-specific kernels for each computing library.
+
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+
+So, how to support a new Device/Library in Fluid becomes a challenge.
+
+
+## Basic: Integrate A New Device/Library
+
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+
+There are mainly three parts that we have to consider while integrating a new device/library:
+
+- Place and DeviceContext: indicates the device id and manages hardware resources
+
+- Memory and Tensor: malloc/free data on certain device
+
+- Math Functor and OpKernel: implement computing unit on certain devices/libraries
+
+### Place and DeviceContext
+
+
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`.
+
+```
+        |    CPUPlace   --> MKLDNNPlace
+Place --|    CUDAPlace  --> CUDNNPlace
+        |    FPGAPlace
+```
+
+And `Place` is defined as follows:
+
+```
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+
+#### DeviceContext
+
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources on different hardware, such as the CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+
+
+```
+                /-> CPUDeviceContext   --> MKLDeviceContext
+DeviceContext ----> CUDADeviceContext  --> CUDNNDeviceContext
+                \-> FPGADeviceContext
+```
+
+An example of Nvidia GPU is as follows:
+
+- DeviceContext
+
+
+```
+class DeviceContext {
+ virtual Place GetPlace() const = 0;
+};
+```
+
+
+- CUDADeviceContext
+
+
+```
+class CUDADeviceContext : public DeviceContext {
+ Place GetPlace() const override { return place_; }
+private:
+ CUDAPlace place_;
+ cudaStream_t stream_;
+ cublasHandle_t cublas_handle_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
+};
+```
+
+- CUDNNDeviceContext
+
+```
+class CUDNNDeviceContext : public CUDADeviceContext {
+ private:
+ cudnnHandle_t cudnn_handle_;
+};
+```
+
+
+### Memory and Tensor
+
+
+#### memory module
+
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
+
+```
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+template <typename Place>
+void Free(Place place, void* ptr);
+
+template <typename Place>
+size_t Used(Place place);
+```
+
+To implement these interfaces, we have to implement a MemoryAllocator for each Device.
+
+
+#### Tensor
+
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
+
+```cpp
+class Tensor {
+ public:
+ /*! Return a pointer to mutable memory block. */
+  template <typename T>
+ inline T* data();
+
+ /**
+ * @brief Return a pointer to mutable memory block.
+ * @note If not exist, then allocation.
+ */
+  template <typename T>
+ inline T* mutable_data(platform::Place place);
+
+ /**
+ * @brief Return a pointer to mutable memory block.
+ *
+ * @param[in] dims The dimensions of the memory block.
+ * @param[in] place The place of the memory block.
+ *
+ * @note If not exist, then allocation.
+ */
+  template <typename T>
+ inline T* mutable_data(DDim dims, platform::Place place);
+
+ /*! Resize the dimensions of the memory block. */
+ inline Tensor& Resize(const DDim& dims);
+
+ /*! Return the dimensions of the memory block. */
+ inline const DDim& dims() const;
+
+ private:
+ /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+ /*! points to dimensions of memory block. */
+ DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data<float>(place);
+```
+
+
+
+### Math Functor and OpKernel
+
+Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
+
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
+
+The interface is defined in the header file:
+
+```
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+ void operator()(const DeviceContext& context, const framework::Tensor& input,
+ framework::Tensor* output, int groups);
+};
+```
+
+The CPU implementation is in the .cc file:
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+ public:
+ void operator()(const platform::CPUDeviceContext& context,
+ const framework::Tensor& input, framework::Tensor* output,
+ int groups) {
+ ...
+ }
+};
+```
+
+The CUDA implementation is in the .cu file:
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+ void operator()(const platform::CUDADeviceContext& context,
+ const framework::Tensor& input, framework::Tensor* output,
+ int groups) {
+ ...
+ }
+};
+```
+
+
+We get the computing handle from a concrete DeviceContext and perform computation on tensors.
+
+The implementation of `OpKernel` is similar to the math functors; the extra thing we need to do is to register the OpKernel in a global map.
+
+Fluid provides different registration interfaces in op_registry.h.
+
+
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
+
+In .cc file:
+
+```
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
+```
+
+In .cu file:
+
+```
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
+```
+
+
+## Advanced topics: How to switch between different Device/Library
+
+Generally, we will implement an OpKernel for every Device/Library combination of an Operator. We can easily train a Convolutional Neural Network on a GPU. However, some OpKernels are not suitable for a specific Device. For example, the crf operator can only run on the CPU, whereas most other operators can run on the GPU. To achieve high performance in such circumstances, we have to switch between different Devices/Libraries.
+
+
+We will discuss how to implement an efficient OpKernel switch policy.
+
+- TBD
diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst
index f1677e216f31d79b53ac29a0afbf6fbb886a0dcd..a2bdeead7841393fdfe90c78e5b91d9e61678a24 100644
--- a/doc/faq/build_and_install/index_cn.rst
+++ b/doc/faq/build_and_install/index_cn.rst
@@ -14,7 +14,7 @@
$ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
$ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
- $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
+ $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 `_ 。
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
index 3c525bdad6f6118dcd560e2cb7bfaf89737c1362..c875c807b8ab2e420dec189ef32d41533f58fa6d 100644
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -19,7 +19,7 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+ docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
# 如果不使用Docker编译环境,执行下面的命令
mkdir build
cd build
@@ -30,7 +30,7 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译
.. code-block:: bash
- pip install python/dist/*.whl
+ pip install build/python/dist/*.whl
.. _run_test:
@@ -45,7 +45,7 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+ docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
如果不使用Docker,可以执行ctest命令即可:
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
index 76fbc43de2e83580dd79b874507c103533022436..f194f84ce7c961bb8644d7c077a7c71730220ea2 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -21,7 +21,7 @@ Then run:
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# run the following command to build a CPU-Only binaries if you are using docker
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+ docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
# else run these commands
mkdir build
cd build
@@ -34,7 +34,7 @@ machine or copy it to the target machine.
.. code-block:: bash
- pip install python/dist/*.whl
+ pip install build/python/dist/*.whl
.. _run_test:
@@ -49,7 +49,7 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
.. code-block:: bash
- docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+ docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
If you don't use Docker, just run ctest will start the tests:
@@ -117,7 +117,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_PYTHON", "Build with integrated Python interpreter", "ON"
"WITH_STYLE_CHECK", "Check code style when building", "ON"
"WITH_TESTING", "Build unit tests", "ON"
- "WITH_DOC", "Build documentaions", "OFF"
+ "WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
"WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index f78b1fb0e11aa028a4b7abb5270740b97f8039e9..1eb06e4182d40c3be20d71e37b34009905eaf9d6 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -114,7 +114,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
.. code-block:: bash
- nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+ nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:**
@@ -122,7 +122,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
- docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
**关于AVX:**
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index d7acc7aeb744b19d83acb520d07c8551168dd096..5a46c598f2248c7912169a9e77b16851230c1d2e 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -122,7 +122,7 @@ GPU driver installed before move on.
.. code-block:: bash
- nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash
+ nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
@@ -130,7 +130,7 @@ GPU driver installed before move on.
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
- docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu
+ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
**About AVX:**
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst
index 88c5142ddee994ed0c0dc520195311e97f5a549e..c9ba84c842b530162c92713046e64fdf82bd441b 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式:
pip_install_cn.rst
docker_install_cn.rst
-
+ ../../howto/dev/build_cn.md
编译流程
++++++++
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst
index c8b60d03578ba6a9b73134ec53b440d057e36079..32d66d63dd5b2a30d5de4a088dc80b680830cb84 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -13,6 +13,7 @@ You can choose either pip or Docker to complete your install:
pip_install_en.rst
docker_install_en.rst
+ ../../howto/dev/build_en.md
Build from Source
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst
index b26bf4c95cb18f36408eb75894e8b9b674efc67b..b270e2c2f0b0cbfd6fb4b9b0750d207952f84d76 100644
--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -34,7 +34,7 @@ PaddlePaddle可以使用常用的Python包管理工具
:align: center
.. csv-table:: 各个版本最新的whl包
- :header: "版本说明", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+ :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
@@ -83,4 +83,4 @@ PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 链接中找到。
- 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。
\ No newline at end of file
+ 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
index 113790e4e4ca116e91f11f8a233eae874d9d1b7a..70f601a11c610e0a2b5dcc8b73d2c3ea19e195e1 100644
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -37,7 +37,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
:align: center
.. csv-table:: whl package of each version
- :header: "version", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+ :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md
index 0b911f7b7509da4a147c65954acb7e7c38f489da..4a80a5245102fb992f513a749f6a02e1130188af 100644
--- a/doc/howto/dev/build_cn.md
+++ b/doc/howto/dev/build_cn.md
@@ -1,4 +1,4 @@
-# 编译PaddlePaddle和运行单元测试
+# 用Docker编译和测试PaddlePaddle
## 需要的软硬件
diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md
index d0048e3714a5861a503736879d6c0870e5906c95..91c41ef8ce3abdec5d69a9cbcebbc49b17d8f663 100644
--- a/doc/howto/dev/build_en.md
+++ b/doc/howto/dev/build_en.md
@@ -1,4 +1,4 @@
-# Build PaddlePaddle from Source Code and Run Unit Test
+# Build using Docker
## What Developers Need
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index 699390145226ec2b65fdf5122db187e1d30d669e..3e0bf7b3973079a2063d33b6be4fe8a9dc5c07bb 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a")
## 构建和测试
-编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:dev`来代替。
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。
如要build这个开发镜像,在源码目录树的根目录中运行:
```bash
-➜ docker build -t paddle:dev .
+➜ docker build -t paddle:latest-dev .
```
随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以:
```bash
-➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
```
这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`):
@@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
如果要运行所有的单元测试,可以用如下命令:
```bash
-➜ docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
```
关于构建和测试的更多信息,请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index 6cfc9536f20e88571a9845a50be0341fe4d9f78b..757a5840bca4c8028e362789ec95bb03d261d2c1 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -1,17 +1,18 @@
# 如何写新的Operator
- [概念简介](#概念简介)
- - [实现C++类](#实现C++类)
- - [定义ProtoMaker类](#定义ProtoMaker类)
- - [定义Operator类](#定义Operator类)
- - [定义OpKernel类](#定义OpKernel类)
- - [注册Operator](#注册Operator)
+ - [实现C++类](#实现c类)
+ - [定义ProtoMaker类](#定义protomaker类)
+ - [定义Operator类](#定义operator类)
+ - [定义OpKernel类](#定义opkernel类)
+ - [注册Operator](#注册operator)
- [编译](#编译)
- - [绑定Python](#绑定Python)
+ - [绑定Python](#绑定python)
- [实现单元测试](#实现单元测试)
- - [前向Operator单测](#前向Operator单测)
- - [反向Operator单测](#反向Operator单测)
+ - [前向Operator单测](#前向operator单测)
+ - [反向Operator单测](#反向operator单测)
- [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)
## 概念简介
@@ -30,8 +31,8 @@
-------------- | :----------------------
OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake
Op定义 | `.cc`文件
-Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,GPU 实现在`.cu`文件中。
-注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中
+Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
+注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中
实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
@@ -43,7 +44,7 @@ Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU
## 实现C++类
-### 1. 定义ProtoMaker类
+### 定义ProtoMaker类
矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。
@@ -100,7 +101,7 @@ The equation is: Out = scale*X
- `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。
-### 2. 定义Operator类
+### 定义Operator类
下面的点实现了MulOp的定义:
@@ -149,11 +150,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中
-### 3. 定义OpKernel类
+### 定义OpKernel类
`MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数:
-- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
- `typename T` : 表示数据类型,如`float`, `double`等。
@@ -165,7 +166,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
下面是 `MulKernel` `Compute`的实现:
```cpp
- template <typename Place, typename T>
+ template <typename DeviceContext, typename T>
class MulKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
@@ -173,33 +174,32 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
auto* Y = context.Input("Y");
auto* Z = context.Output("Out");
Z->mutable_data(context.GetPlace());
- auto* device_context =
- const_cast(context.device_context_);
- math::matmul(*X, false, *Y, false, 1, Z, 0, device_context);
+ auto& device_context = context.template device_context();
+ math::matmul(*X, false, *Y, false, 1, Z, 0, device_context);
}
};
```
-需要注意:**不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。**
+需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。**
-`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
-为了使`OpKernel`的计算过程书写更加简单,并且CPU、GPU的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
+为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。
反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
-### 4. 注册Operator
+### 注册Operator
- 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。
```cpp
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
- REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel);
+ REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel);
REGISTER_OP_CPU_KERNEL(mul_grad,
- ops::MulGradKernel);
+ ops::MulGradKernel);
```
在上面的代码中:
@@ -209,20 +209,20 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
- `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。
-- 在 `.cu`文件中注册GPU Kernel。
- - 请注意,如果GPU Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下:
+- 在 `.cu`文件中注册CUDA Kernel。
+ - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下:
```cpp
// if use Eigen unsupported module before include head files
- // #define EIGEN_USE_GPU
+ #define EIGEN_USE_GPU
namespace ops = paddle::operators;
- REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel);
- REGISTER_OP_GPU_KERNEL(mul_grad,
- ops::MulGradKernel);
+ REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel);
+ REGISTER_OP_CUDA_KERNEL(mul_grad,
+ ops::MulGradKernel);
```
-### 5. 编译
+### 编译
运行下面命令可以进行编译:
@@ -236,71 +236,57 @@ make mul_op
## 实现单元测试
-单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
-### 前向Operator单元测试
+### 前向Operator单测
-前向Op单元测试继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator,需要:
+Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要:
1. 在`setUp`函数定义输入、输出,以及相关的属性参数。
2. 生成随机的输入数据。
3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。
+4. 反向计算已经自动集成进测试框架,直接调用相应接口即可。
```python
import unittest
import numpy as np
- from gradient_checker import GradientChecker, create_op
- from op_test_util import OpTestMeta
+ from op_test import OpTest
- class TestMulOp(unittest.TestCase):
- __metaclass__ = OpTestMeta
+ class TestMulOp(OpTest):
def setUp(self):
- self.type = "mul"
+ self.op_type = "mul"
self.inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32")
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
- ```
-
-上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释:
-
-- `self.type = "mul" ` : 定义类型,与operator注册时注册的类型一致。
-- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。
-- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。
+ def test_check_output(self):
+ self.check_output()
-### 反向Operator单元测试
+ def test_check_grad_normal(self):
+ self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-反向Op单元测试继承自`GradientChecker`,而`GradientChecker`继承自`unittest.TestCase`,因此,**反向单元测试函数需要以`test_`开头**。
+ def test_check_grad_ingore_x(self):
+ self.check_grad(
+ ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-```python
-class TestMulGradOp(GradientChecker):
- def setUp(self):
- self.op = create_op("mul")
- self.inputs = {
- 'X': np.random.random((32, 84)).astype("float32"),
- 'Y': np.random.random((84, 100)).astype("float32")
- }
-
- def test_check_grad_normal(self):
- # mul op will enlarge the relative error
- self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+ def test_check_grad_ingore_y(self):
+ self.check_grad(
+ ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+ ```
- def test_check_grad_ingore_x(self):
- self.check_grad(
- ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释:
- def test_check_grad_ingore_y(self):
- self.check_grad(
- ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-```
+- `self.op_type = "mul" ` : 定义类型,与operator注册时注册的类型一致。
+- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。
+- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。
-下面解释代码中一些关键的地方:
+### 反向operator单测
-- 调用`create_op("mul")`创建反向Op对应的前向Op。
+而反向测试中:
- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
- 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
@@ -308,7 +294,7 @@ class TestMulGradOp(GradientChecker):
- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
-### 编译和执行单元测试
+### 编译和执行
`python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
@@ -328,5 +314,5 @@ ctest -R test_mul_op
- 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,这将会导致编译出错。
- 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`等,这将会导致单元测试出错。
-- 如果Op没有实现GPU Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。
+- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。
- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
index 1e88e1f5b4df710f1b69f0305d8d8a2921c4249a..fe86936bc12cc2fb88d653429e250f71a478dfb6 100644
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -1,8 +1,8 @@
# How to write a new operator
- [Background](#background)
- - [Implementing C++ Types](#implementing-c++-types)
- - [Defining ProtoMaker](#defining-protoMaker)
+ - [Implementing C++ Types](#implementing-c-types)
+ - [Defining ProtoMaker](#defining-protomaker)
- [Defining Operator](#defining-operator)
- [Registering Operator](#registering-operator)
- [Compilation](#compilation)
@@ -28,8 +28,8 @@ An operator can be differentiated by whether in has kernel methods. An operator
-------------- | :----------------------
OpProtoMake definition | `.cc`files, Backward Op does not need an OpProtoMake interface.
Op definition | `.cc` files
-Kernel implementation | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files.
-Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation.
+Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
+Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. **
@@ -41,7 +41,7 @@ Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePadd
## Implementing C++ Types
-### 1. Defining Class ProtoMaker
+### Defining ProtoMaker
Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and one output.
@@ -98,7 +98,7 @@ There are two changes in this example:
- `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0.
-### 2. Defining Operator
+### Defining Operator
The following code defines the interface for MulOp:
@@ -147,11 +147,11 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
-### 3. Defining OpKernel
+### Defining OpKernel
`MulKernel` inherits `framework::OpKernel`, which includes the following templates:
-- `typename Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
- `typename T` denotes data type, such as `float` or `double`.
@@ -163,7 +163,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
`MulKernel`'s implementation of `Compute` is as follows:
```cpp
- template <typename Place, typename T>
+ template <typename DeviceContext, typename T>
class MulKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
@@ -171,16 +171,15 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
auto* Y = context.Input("Y");
auto* Z = context.Output("Out");
Z->mutable_data(context.GetPlace());
- auto* device_context =
- const_cast(context.device_context_);
- math::matmul(*X, false, *Y, false, 1, Z, 0, device_context);
+ auto& device_context = context.template device_context();
+ math::matmul(*X, false, *Y, false, 1, Z, 0, device_context);
}
};
```
-Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
+Note that **different devices (CPU, CUDA) share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.**
-`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
@@ -189,16 +188,16 @@ This concludes the forward implementation of an operator. Next its operation and
The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
-### 4. Registering Operator
+### Registering Operator
- In `.cc` files, register forward and backward operator classes and the CPU kernel.
```cpp
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
- REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel);
+ REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel);
REGISTER_OP_CPU_KERNEL(mul_grad,
- ops::MulGradKernel);
+ ops::MulGradKernel);
```
In that code block,
@@ -208,20 +207,20 @@ The definition of its corresponding backward operator, if applicable, is similar
- `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
-- Registering GPU Kernel in `.cu` files
- - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
+- Registering CUDA Kernel in `.cu` files
+ - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as
```cpp
// if use Eigen unsupported module before include head files
#define EIGEN_USE_GPU
namespace ops = paddle::operators;
- REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel);
- REGISTER_OP_GPU_KERNEL(mul_grad,
- ops::MulGradKernel);
+ REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel);
+ REGISTER_OP_CUDA_KERNEL(mul_grad,
+ ops::MulGradKernel);
```
-### 5. Compilation
+### Compilation
Run the following commands to compile.
@@ -253,62 +252,51 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
2. Generating random input data.
-3. Implementing the same computation logic in a Python script:
+3. Implementing the same computation logic in a Python script.
+
+4. Calling the gradient checking functions to test the backward operator.
```python
import unittest
import numpy as np
- from gradient_checker import GradientChecker, create_op
- from op_test_util import OpTestMeta
+ from op_test import OpTest
- class TestMulOp(unittest.TestCase):
- __metaclass__ = OpTestMeta
+ class TestMulOp(OpTest):
def setUp(self):
- self.type = "mul"
+ self.op_type = "mul"
self.inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32")
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+ def test_check_output(self):
+ self.check_output()
+
+ def test_check_grad_normal(self):
+ self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+ def test_check_grad_ingore_x(self):
+ self.check_grad(
+ ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+ def test_check_grad_ingore_y(self):
+ self.check_grad(
+ ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
```
Get its output, and compare it with the forward operator's own output.
The code above first loads required packages. In addition, we have
-- `self.type = "mul" ` defines the type that is identical to what the operator's registered type.
+- `self.op_type = "mul" ` defines the type that is identical to what the operator's registered type.
- `self.inputs` defines input, with type `numpy.array` and initializes it.
- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script.
### Testing Backward Operators
-A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**.
-
-```python
-class TestMulGradOp(GradientChecker):
- def setUp(self):
- self.op = create_op("mul")
- self.inputs = {
- 'X': np.random.random((32, 84)).astype("float32"),
- 'Y': np.random.random((84, 100)).astype("float32")
- }
-
- def test_check_grad_normal(self):
- # mul op will enlarge the relative error
- self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
-
- def test_check_grad_ingore_x(self):
- self.check_grad(
- ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
-
- def test_check_grad_ingore_y(self):
- self.check_grad(
- ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
-```
-
-Some key points in the code above include:
+Some key points in the gradient checking above include:
-- `create_op("mul")` creates the backward operator's corresponding forward operator.
- `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
- The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
- The second variable `"Out"` points to the network's final output target `Out`.
@@ -338,5 +326,5 @@ ctest -R test_mul_op
- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
-- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
+- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index 61f3a223547b352cf7929615cf3682b29b9a738f..1bc947c260d7adb75ee5a2bb10e6b91bc0be2d4c 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -3,12 +3,64 @@
##################
PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
-
+也可以利用PaddlePaddle.org工具来编译文档,这种情况下所有的文件会存放在整理过的文件目录 .ppo_workspace/content 下
如何构建文档
============
-PaddlePaddle的文档构建有两种方式。
+PaddlePaddle的文档构建有三种方式。
+
+
+使用PaddlePaddle.org工具
+--------------
+这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。
+
+该工具使用Docker运行,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后即可用以下命令启动工具
+
+.. code-block:: bash
+
+ mkdir paddlepaddle # Create paddlepaddle working directory
+ cd paddlepaddle
+
+ # Clone the content repositories
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ git clone https://github.com/PaddlePaddle/book.git
+ git clone https://github.com/PaddlePaddle/models.git
+ git clone https://github.com/PaddlePaddle/Mobile.git
+
+ # Please specify the working directory through -v
+ docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+编译后的文件将被存储在工作目录 /.ppo_workspace/content。
+
+如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+
+.. code-block:: bash
+
+ mkdir paddlepaddle # Create paddlepaddle working directory
+ cd paddlepaddle
+
+ # Clone the content repositories and PaddlePaddle.org
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ git clone https://github.com/PaddlePaddle/book.git
+ git clone https://github.com/PaddlePaddle/models.git
+ git clone https://github.com/PaddlePaddle/Mobile.git
+ git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+ # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+ export CONTENT_DIR=
+ export ENV=''
+ cd PaddlePaddle.org/portal/
+ pip install -r requirements.txt
+ python manage.py runserver
+
+工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请将PaddlePaddle的工作目录指定给环境变量 CONTENT_DIR。
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 /.ppo_workspace/content。
+
+想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。
使用Docker构建
--------------
@@ -47,17 +99,12 @@ PaddlePaddle的文档构建有两种方式。
PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。
-如何更新文档主题
-================
-
-PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下,包含所有和前端网页设计相关的文件。
-
-如何更新doc.paddlepaddle.org
+如何更新www.paddlepaddle.org
============================
-更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和
-`英文文档 `_ 。
+更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和
+`英文文档 `_ 。
.. _cmake: https://cmake.org/
diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b3ef07eb1d0012827df8e6a4f27c5fa643649492
--- /dev/null
+++ b/doc/howto/dev/write_docs_en.rst
@@ -0,0 +1,80 @@
+##############################
+Contribute Documentation
+##############################
+
+PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
+Both are compiled by `cmake`_ and `sphinx`_ , and the compiled documentation will be stored under the ``doc`` and ``doc_cn`` directories.
+When using the PaddlePaddle.org tool to compile documentation, the compiled documentation will be stored under a consolidated directory: .ppo_workspace/content
+
+How to Build Documentation
+==============================
+
+We recommend using the PaddlePaddle.org tool to build the documentation.
+
+
+Use the PaddlePaddle.org tool
+------------------------------
+This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+
+The tool uses Docker, so please install Docker on your system first; see the official Docker website for installation instructions. You may then use the following commands to activate the tool:
+
+.. code-block:: bash
+
+ mkdir paddlepaddle # Create paddlepaddle working directory
+ cd paddlepaddle
+
+ # Clone the content repositories. You may only clone the contents you need
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ git clone https://github.com/PaddlePaddle/book.git
+ git clone https://github.com/PaddlePaddle/models.git
+ git clone https://github.com/PaddlePaddle/Mobile.git
+
+ # Please specify the working directory through -v
+ docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command.
+Use a web browser and navigate to http://localhost:8000, then click the buttons to compile the documentation.
+The compiled documentation will be stored in /.ppo_workspace/content.
+
+
+If you don't wish to use Docker, you can also activate the tool through Django. Use the following commands to set it up:
+
+.. code-block:: bash
+
+ mkdir paddlepaddle # Create paddlepaddle working directory
+ cd paddlepaddle
+
+ # Clone the content repositories and PaddlePaddle.org
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ git clone https://github.com/PaddlePaddle/book.git
+ git clone https://github.com/PaddlePaddle/models.git
+ git clone https://github.com/PaddlePaddle/Mobile.git
+ git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+ # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+ export CONTENT_DIR=
+ export ENV=''
+ cd PaddlePaddle.org/portal/
+ pip install -r requirements.txt
+ python manage.py runserver
+
+Use a web browser and navigate to http://localhost:8000, then click the buttons to compile the documentation.
+The compiled documentation will be stored in /.ppo_workspace/content.
+
+If you want to learn more about PaddlePaddle.org, please `click here `_ .
+
+How to Write Documentation
+==============================
+
+PaddlePaddle uses `sphinx`_ to compile the documentation. Please check the sphinx official website for more detail.
+
+
+How to update www.paddlepaddle.org
+====================================
+
+Please create PRs and submit them to GitHub; for details, please check `Contribute Code `_ .
+The PaddlePaddle develop branch will update the documentation once the PR is merged. Users may check the latest `Chinese Docs `_ and
+`English Docs `_ .
+
+.. _cmake: https://cmake.org/
+.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index eb95356c67c5df22e4f543f958eb31d79f2c6195..991b9e2596a3b499846b963152c838d66260265d 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,6 +19,7 @@
.. toctree::
:maxdepth: 1
+ dev/contribute_to_paddle_cn.md
dev/write_docs_cn.rst
模型配置
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 1fbfcd260b912078f00ed5b720ed607db725c4e2..61bf25ccd12eeedffc747fdd4ce84fa4adde07ee 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -20,6 +20,7 @@ Development
dev/new_layer_en.rst
dev/contribute_to_paddle_en.md
+ dev/write_docs_en.rst
Configuration
-------------
diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md
new file mode 100644
index 0000000000000000000000000000000000000000..383acb0c8251043c3c6bbf309d2e07bf0074cd4f
--- /dev/null
+++ b/doc/howto/read_source.md
@@ -0,0 +1,67 @@
+# PaddlePaddle Fluid Source Code Overview
+
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
+
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
+
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
+
+Optimizer: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer
+
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
+
+# Compile Time
+
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
+
+```python
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+```
+
+- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py)
+ - Every Layer has one or more operators and variables/parameters
+ - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other files worth looking at:
+ - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
+ - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h)
+ - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
+- Optimizer: `fluid.optimizer.SGD`. It does the following
+ - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)]
+ - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)]
+
+# Run Time
+
+The following **evaluates** the NN. It instantiates all the variables and operators.
+
+```python
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+# Allocate memory. Initialize Parameter.
+exe.run(fluid.default_startup_program())
+
+# Allocate memory. Do computation.
+exe.run(fluid.default_main_program(),
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+```
+
+- Place: `place`. One of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
+ - The device handles are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
+ - Feeds the data: `feed=feeder.feed(data)`
+ - Evaluates all the operators
+ - Fetches the result: `fetch_list=[avg_cost]`
+- Other files worth looking at:
+ - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
+ - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
+ - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index d6b8464100d4497876aa3f6f7cbc666aafae4bfc..cf84568ecdf1227b0d0ed3606a4a9a6e5186af72 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -25,8 +25,18 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
+SET(SWIG_NEED_FLAGS
+ -ftls-model=global-dynamic
+ -Wno-parentheses-equality
+ -Wno-self-assign
+ -Wno-maybe-uninitialized
+ -Wno-missing-field-initializers)
+ FOREACH(flag ${SWIG_NEED_FLAGS})
+ safe_set_cxxflag(SWIG_CXX_FLAGS ${flag})
+ENDFOREACH()
+
SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}")
SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
paddle_parameter
diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
index bb8249a5511c089ec2f2263ff4cc290f0a5a8fce..c038789340033fcf6dcc07a41b033a50e980c965 100644
--- a/paddle/capi/Main.cpp
+++ b/paddle/capi/Main.cpp
@@ -43,4 +43,11 @@ paddle_error paddle_init(int argc, char** argv) {
isInit = true;
return kPD_NO_ERROR;
}
+
+paddle_error paddle_init_thread() {
+ if (FLAGS_use_gpu) {
+ hl_init(FLAGS_gpu_id);
+ }
+ return kPD_NO_ERROR;
+}
}
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 30f3a766f0c65187c8f2dd4603e3d26c9b9a6a3d..cbacd1fb71c14f490ff548db714e728772292b4b 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -40,7 +40,7 @@ paddle_error paddle_matrix_destroy(paddle_matrix mat) {
paddle_error paddle_matrix_set_row(paddle_matrix mat,
uint64_t rowID,
paddle_real* rowArray) {
- if (mat == nullptr) return kPD_NULLPTR;
+ if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
auto ptr = cast(mat);
if (ptr->mat == nullptr) return kPD_NULLPTR;
if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
diff --git a/paddle/capi/error.cpp b/paddle/capi/error.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..169b65f92104336d9ec12e2a5a6778db25080270
--- /dev/null
+++ b/paddle/capi/error.cpp
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "error.h"
+
+const char* paddle_error_string(paddle_error err) {
+ switch (err) {
+ case kPD_NULLPTR:
+ return "nullptr error";
+ case kPD_OUT_OF_RANGE:
+ return "out of range error";
+ case kPD_PROTOBUF_ERROR:
+ return "protobuf error";
+ case kPD_NOT_SUPPORTED:
+ return "not supported error";
+ case kPD_UNDEFINED_ERROR:
+ return "undefined error";
+ default:
+ return "";
+ }
+}
diff --git a/paddle/capi/error.h b/paddle/capi/error.h
index 44d8c2040d1aad698398089baeee6f13c3deeb55..9d9d0ed63a5276c6b9a8747e1ee1fce6872bdc9e 100644
--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
@@ -15,6 +15,8 @@ limitations under the License. */
#ifndef __PADDLE_CAPI_ERROR_H__
#define __PADDLE_CAPI_ERROR_H__
+#include "config.h"
+
/**
* Error Type for Paddle API.
*/
@@ -27,4 +29,9 @@ typedef enum {
kPD_UNDEFINED_ERROR = -1,
} paddle_error;
+/**
+ * Error string for Paddle API.
+ */
+PD_API const char* paddle_error_string(paddle_error err);
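+
+/* Usage sketch (hypothetical caller code, for illustration only):
+ *
+ *   paddle_error err = paddle_matrix_set_row(mat, 0, row_buf);
+ *   if (err != kPD_NO_ERROR) {
+ *     fprintf(stderr, "paddle: %s\n", paddle_error_string(err));
+ *   }
+ */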
+
#endif
diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
index 98e411ddc02a46034e8f6ceb00657622d998c9f3..2fc8debddedeab6ae982b0df49ec2b73bc0f85f5 100644
--- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
+++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
@@ -1,8 +1,29 @@
project(multi_thread)
cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
+
find_package (Threads)
+
+if(NOT PADDLE_ROOT)
+ set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path")
+endif()
+if(PADDLE_ROOT)
+ include_directories(${PADDLE_ROOT}/include)
+ link_directories(${PADDLE_ROOT}/lib)
+endif()
+
+set(CPU_SRCS main.c)
+add_executable(${PROJECT_NAME} ${CPU_SRCS})
set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared
- ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${PROJECT_NAME}
+ -lpaddle_capi_shared
+ ${CMAKE_THREAD_LIBS_INIT})
+
+find_package(CUDA QUIET)
+if(CUDA_FOUND)
+ set(GPU_SRCS main_gpu.c)
+ cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS})
+ set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99)
+ target_link_libraries(${PROJECT_NAME}_gpu
+ -lpaddle_capi_shared
+ ${CMAKE_THREAD_LIBS_INIT})
+endif(CUDA_FOUND)
diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
new file mode 100644
index 0000000000000000000000000000000000000000..6fd376e0d1a2fee4f9a0f676b53c6f2891795cab
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
@@ -0,0 +1,113 @@
+#include
+#include
+#include
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+#define NUM_THREAD 4
+#define NUM_ITER 1000
+
+pthread_mutex_t mutex;
+
+/*
+ * @brief A simple inference example that runs multiple threads on a GPU.
+ * Each thread holds its own local gradient_machine but shares the same
+ * parameters.
+ * If you want to run on different GPUs, you need to launch
+ * multi-processes or set trainer_count > 1.
+ */
+void* thread_main(void* gm_ptr) {
+ // Initialize the thread environment of Paddle.
+ CHECK(paddle_init_thread());
+
+ paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
+ // Create input arguments.
+ paddle_arguments in_args = paddle_arguments_create_none();
+ // Create input matrix.
+ paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+ /* size */ 784,
+ /* useGPU */ true);
+ // Create output arguments.
+ paddle_arguments out_args = paddle_arguments_create_none();
+ // Create output matrix.
+ paddle_matrix prob = paddle_matrix_create_none();
+
+ // CPU buffer to cache the input and output.
+ paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real));
+ paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real));
+ for (int iter = 0; iter < NUM_ITER; ++iter) {
+ // There is only one input layer of this network.
+ CHECK(paddle_arguments_resize(in_args, 1));
+ CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+ for (int i = 0; i < 784; ++i) {
+ cpu_input[i] = rand() / ((float)RAND_MAX);
+ }
+ CHECK(paddle_matrix_set_value(mat, cpu_input));
+
+ CHECK(paddle_gradient_machine_forward(machine,
+ in_args,
+ out_args,
+ /* isTrain */ false));
+
+ CHECK(paddle_arguments_get_value(out_args, 0, prob));
+ CHECK(paddle_matrix_get_value(prob, cpu_output));
+
+ pthread_mutex_lock(&mutex);
+ printf("Prob: ");
+ for (int i = 0; i < 10; ++i) {
+ printf("%.2f ", cpu_output[i]);
+ }
+ printf("\n");
+ pthread_mutex_unlock(&mutex);
+ }
+
+ CHECK(paddle_matrix_destroy(prob));
+ CHECK(paddle_arguments_destroy(out_args));
+ CHECK(paddle_matrix_destroy(mat));
+ CHECK(paddle_arguments_destroy(in_args));
+ CHECK(paddle_gradient_machine_destroy(machine));
+
+ free(cpu_input);
+ free(cpu_output);
+
+ return NULL;
+}
+
+int main() {
+ // Initialize Paddle
+ char* argv[] = {"--use_gpu=True"};
+ CHECK(paddle_init(1, (char**)argv));
+
+ // Reading config binary file. It is generated by `convert_protobin.sh`
+ long size;
+ void* buf = read_config(CONFIG_BIN, &size);
+
+ // Create a gradient machine for inference.
+ paddle_gradient_machine machine;
+ CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+ CHECK(paddle_gradient_machine_randomize_param(machine));
+
+ // Loading parameter. Uncomment the following line and change the directory.
+ // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+ // "./some_where_to_params"));
+ srand(time(0));
+ pthread_mutex_init(&mutex, NULL);
+
+ pthread_t threads[NUM_THREAD];
+
+ for (int i = 0; i < NUM_THREAD; ++i) {
+ paddle_gradient_machine thread_local_machine;
+ CHECK(paddle_gradient_machine_create_shared_param(
+ machine, buf, size, &thread_local_machine));
+ pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
+ }
+
+ for (int i = 0; i < NUM_THREAD; ++i) {
+ pthread_join(threads[i], NULL);
+ }
+
+ pthread_mutex_destroy(&mutex);
+
+ return 0;
+}
diff --git a/paddle/capi/main.h b/paddle/capi/main.h
index 893ebcbd58dd24cf835fb2005865c94c9ba2a810..99c4e8428dbaa14d36dc2d36b2a4f16c9ec3e0d1 100644
--- a/paddle/capi/main.h
+++ b/paddle/capi/main.h
@@ -26,6 +26,13 @@ extern "C" {
*/
PD_API paddle_error paddle_init(int argc, char** argv);
+/**
+ * Initialize the thread environment of Paddle.
+ * @note It is required for GPU runs but optional for CPU runs.
+ * For GPU runs, all threads will run on the same GPU device.
+ */
+PD_API paddle_error paddle_init_thread();
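+
+/* Usage sketch (illustrative): call at the top of each worker thread before
+ * any forward pass, as examples/model_inference/multi_thread/main_gpu.c does:
+ *   CHECK(paddle_init_thread());
+ */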
+
#ifdef __cplusplus
}
#endif
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 89c1f48edacbe0a4432957fe066481412db7e6e1..88418062927cd0f7714e992cc2495109da45d32f 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -116,6 +116,7 @@ extern void hl_maxpool_backward(const int frameCnt,
* @param[in] paddingW padding width.
* @param[out] tgtData output data.
* @param[in] tgtStride stride between output data samples.
+ * @param[in] excludeMode whether to exclude the padding area when computing the pooling window size.
*
*/
extern void hl_avgpool_forward(const int frameCnt,
@@ -132,7 +133,8 @@ extern void hl_avgpool_forward(const int frameCnt,
const int paddingH,
const int paddingW,
real* tgtData,
- const int tgtStride);
+ const int tgtStride,
+ bool excludeMode);
/**
* @brief Maximum pool backward.
@@ -154,6 +156,7 @@ extern void hl_avgpool_forward(const int frameCnt,
* @param[in] scaleB scale.
* @param[out] backGrad output grad.
* @param[in] outStride stride between output data samples.
+ * @param[in] excludeMode whether to exclude the padding area when computing the pooling window size.
*
*/
extern void hl_avgpool_backward(const int frameCnt,
@@ -172,7 +175,8 @@ extern void hl_avgpool_backward(const int frameCnt,
real scaleA,
real scaleB,
real* backGrad,
- const int outStride);
+ const int outStride,
+ bool excludeMode);
extern void hl_maxpool3D_forward(const int frameCnt,
const real* inputData,
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 968ed4840ffb0623b57bd6e6d839973e109394de..706cc59a8e394b109d2b290425f4b5f51d987f28 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -68,7 +68,8 @@ inline void hl_avgpool_forward(const int frameCnt,
const int paddingH,
const int paddingW,
real* tgtData,
- const int tgtStride) {}
+ const int tgtStride,
+ const bool excludeMode) {}
inline void hl_avgpool_backward(const int frameCnt,
const real* outGrad,
@@ -86,7 +87,8 @@ inline void hl_avgpool_backward(const int frameCnt,
real scaleA,
real scaleB,
real* backGrad,
- const int outStride) {}
+ const int outStride,
+ const bool excludeMode) {}
inline void hl_maxpool3D_forward(const int frameCnt,
const real* inputData,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index 3699b1e8ae9d8f813439eaeaa760c4a9f6e100a0..2d1bc4f6d55fac4b74f4e58d40fe56aa61d19cf9 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -210,7 +210,8 @@ __global__ void KeAvgPoolForward(const int nthreads,
const int padH,
const int padW,
real* tgtData,
- const int tgtStride) {
+ const int tgtStride,
+ const bool excludeMode) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) {
int pw = index % pooledW;
@@ -224,7 +225,8 @@ __global__ void KeAvgPoolForward(const int nthreads,
int wend = min(wstart + sizeX, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
- int pool_size = (hend - hstart) * (wend - wstart);
+ int poolSize =
+ excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
real aveval = 0;
inputData += (frameNum * channels + c) * height * width;
@@ -235,7 +237,7 @@ __global__ void KeAvgPoolForward(const int nthreads,
}
int tgtIndex =
index % (pooledW * pooledH * channels) + frameNum * tgtStride;
- tgtData[tgtIndex] = aveval / pool_size;
+ tgtData[tgtIndex] = aveval / poolSize;
}
}
@@ -253,7 +255,8 @@ void hl_avgpool_forward(const int frameCnt,
const int paddingH,
const int paddingW,
real* tgtData,
- const int tgtStride) {
+ const int tgtStride,
+ const bool excludeMode) {
int num_kernels = pooledH * pooledW * channels * frameCnt;
int blocks = (num_kernels + 1024 - 1) / 1024;
KeAvgPoolForward<<>>(num_kernels,
@@ -270,7 +273,8 @@ void hl_avgpool_forward(const int frameCnt,
paddingH,
paddingW,
tgtData,
- tgtStride);
+ tgtStride,
+ excludeMode);
CHECK_SYNC("hl_avgpool_forward failed");
}
@@ -290,7 +294,8 @@ __global__ void KeAvgPoolBackward(const int nthreads,
real scaleA,
real scaleB,
real* tgtGrad,
- const int outStride) {
+ const int outStride,
+ const bool excludeMode) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) {
int offsetW = index % width + padW;
@@ -314,8 +319,9 @@ __global__ void KeAvgPoolBackward(const int nthreads,
int wstart = pw * strideW - padW;
int wend = min(wstart + sizeX, width);
wstart = max(wstart, 0);
- int poolsize = (hend - hstart) * (wend - wstart);
- gradient += outGrad[ph * pooledW + pw] / poolsize;
+ int poolSize =
+ excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+ gradient += outGrad[ph * pooledW + pw] / poolSize;
}
}
tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
@@ -338,7 +344,8 @@ void hl_avgpool_backward(const int frameCnt,
real scaleA,
real scaleB,
real* backGrad,
- const int outStride) {
+ const int outStride,
+ const bool excludeMode) {
int num_kernels = height * width * channels * frameCnt;
int blocks = (num_kernels + 1024 - 1) / 1024;
@@ -358,7 +365,8 @@ void hl_avgpool_backward(const int frameCnt,
scaleA,
scaleB,
backGrad,
- outStride);
+ outStride,
+ excludeMode);
CHECK_SYNC("hl_avgpool_backward failed");
}
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 8fd2906107c490eee129fc10262df28bfa67800b..a17036c6527da3a4a32f021a57542b6b6d68a395 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -27,6 +27,18 @@
namespace paddle {
namespace framework {
+static std::unordered_set* g_ctrl_flow_ops_ = nullptr;
+// The backward pass of control flow operators is significantly different from
+// that of computational operators. Hack code here.
+// We should design a better way to generate the backward pass for CtrlFlowOps.
+static std::unordered_set& CtrlFlowOps() {
+ if (g_ctrl_flow_ops_ == nullptr) {
+ g_ctrl_flow_ops_ = new std::unordered_set{
+ "increment", "lod_rank_table", "less_than"};
+ }
+ return *g_ctrl_flow_ops_;
+}
+
static inline std::unique_ptr<OperatorBase> CreateGradOp(
const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var) {
@@ -178,8 +190,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
// collect all the offset for each alias,
// insert a sum operator to add all aliases to output
insert_position.push_back(
- {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}},
- {{"Out", {name}}}, {})});
+ {dup_op.back(),
+ OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}},
+ AttributeMap{})});
}
// make sure the inserted `sum` ops follow the BFS order.
@@ -204,7 +217,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
// If part of input gradient of that operator is not calculated, fill
// zero variables to that input gradient.
net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
- {{"Y", {grad_input}}}, {}));
+ {{"Y", {grad_input}}},
+ AttributeMap{}));
}
return false;
});
@@ -288,12 +302,24 @@ static void CreateGradVarInBlock(
for (size_t op_index = grad_op_start_index; op_index < ops.size();
++op_index) {
std::unordered_set<std::string> new_vars;
+ auto& ctrl_flow_ops = CtrlFlowOps();
ForEachVarName(ops[op_index]->Outputs(),
[&](const std::string& grad_var_name) {
- if (block_desc->HasVar(grad_var_name)) {
+ if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
+ ctrl_flow_ops.end()) {
+ if (block_desc->HasVarRecursive(grad_var_name)) {
+ return false;
+ }
+ } else {
+ if (block_desc->HasVar(grad_var_name)) {
+ return false;
+ }
+ }
+ if (grad_var_name == framework::kEmptyVarName) {
return false;
}
auto var = block_desc->Var(grad_var_name);
+ VLOG(10) << "Creating Variable " << grad_var_name;
new_vars.insert(var->Name());
auto it = param_name_map.find(grad_var_name);
if (it == param_name_map.end()) {
@@ -333,14 +359,25 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
// All input gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
if (AllGradInSet(inputs, *no_grad_vars)) {
+ VLOG(10) << "Drop operator " << op_desc->Type();
return grad_op_descs; // empty vector
}
+
// All output gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
+
if (AllGradInSet(outputs, *no_grad_vars)) {
- for (const std::string& name : inputs) {
- no_grad_vars->insert(GradVarName(name));
+ VLOG(10) << "Drop operator " << op_desc->Type();
+ // FIXME: Hack code here
+ auto& ctrl_flow_ops = CtrlFlowOps();
+ if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
+ // Only computational op need drop input's gradient.
+ for (const std::string& name : inputs) {
+ no_grad_vars->insert(GradVarName(name));
+ VLOG(10) << " Also drop " << GradVarName(name);
+ }
}
+
return grad_op_descs; // empty vector
}
@@ -357,8 +394,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
std::string new_name = prefix + kZeroVarSuffix;
desc->Rename(in_name, new_name);
- std::unique_ptr<OpDescBind> fill_zeros_op(new OpDescBind(
- "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {}));
+ std::unique_ptr<OpDescBind> fill_zeros_op(
+ new OpDescBind("fill_zeros_like", {{"X", {prefix}}},
+ {{"Y", {new_name}}}, AttributeMap{}));
pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
}
}
@@ -448,8 +486,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
sum_op_inputs.emplace_back(new_name);
next_g_name = sum_op_inputs.back();
}
- std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
- "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
+ std::unique_ptr<OpDescBind> sum_op(
+ new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}},
+ AttributeMap{}));
pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
}
}
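
The repeated `{}` -> `AttributeMap{}` (and `std::unordered_set<std::string>{}`) changes in this file and in the tests below spell out the argument's type instead of relying on an empty braced-init-list. One common motivation for this kind of change is that a bare `{}` cannot be deduced when the parameter is (or becomes) a template; the toy sketch below only illustrates that failure mode and is not Paddle code:

```cpp
#include <map>
#include <string>

// Illustrative stand-in; Paddle's real AttributeMap maps names to Attribute values.
using AttributeMap = std::map<std::string, int>;

// A forwarding factory cannot deduce a type for a bare "{}" argument.
template <typename Attrs>
void CreateOp(const std::string& type, Attrs&& attrs) {
  (void)type;
  (void)attrs;
}

int main() {
  // CreateOp("sum", {});           // error: cannot deduce 'Attrs' from an empty braced list
  CreateOp("sum", AttributeMap{});  // OK: the argument's type is stated explicitly
  return 0;
}
```
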
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 2b858f5ea0874d7bf1a9cf38529f5d0d70cca7f2..9fe49881d5b740655432f6e83a7886878ceb17e8 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -106,15 +106,15 @@ class FcOp : public operators::NetOp {
FcOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
- AppendOp(OpRegistry::CreateOp("mul",
- {{"X", {Input("X")}}, {"Y", {Input("W")}}},
- {{"Out", {Output("mul_result")}}}, {}));
+ AppendOp(OpRegistry::CreateOp(
+ "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}},
+ {{"Out", {Output("mul_result")}}}, AttributeMap{}));
auto input_b = Inputs("b");
std::string before_act = "mul_result";
if (input_b.size() != 0) {
AppendOp(OpRegistry::CreateOp(
"rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
- {{"Out", {Output("add_result")}}}, {}));
+ {{"Out", {Output("add_result")}}}, AttributeMap{}));
before_act = "add_result";
} else {
auto out_varname = Output("add_result");
@@ -124,7 +124,7 @@ class FcOp : public operators::NetOp {
}
AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
- {{"Out", {Output("Out")}}}, {}));
+ {{"Out", {Output("Out")}}}, AttributeMap{}));
CompleteAddOp(false);
}
};
@@ -278,8 +278,9 @@ REGISTER_OPERATOR(scale, f::NoneOp);
REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel);
TEST(Backward, simple_op_not_need_grad) {
- auto fwd = f::OpRegistry::CreateOp(
- "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
+ auto fwd =
+ f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
+ {{"Out", {"out"}}}, f::AttributeMap{});
ASSERT_NE(fwd, nullptr);
auto gop = f::Backward(*fwd, {"x"});
ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName);
@@ -296,9 +297,10 @@ TEST(Backward, net_fc_backward_normal) {
{{"mul_result", {"mul_res"}},
{"add_result", {"add_re"}},
{"Out", {"out"}}},
- {});
+ f::AttributeMap{});
ASSERT_NE(fwd, nullptr);
- std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+ std::shared_ptr<f::OperatorBase> gop =
+ f::Backward(*fwd, std::unordered_set<std::string>{});
ASSERT_TRUE(gop->IsNetOp());
auto net = static_cast<ops::NetOp*>(gop.get());
@@ -322,9 +324,10 @@ TEST(Backward, net_fc_backward_not_have_b) {
{{"mul_result", {"mul_res"}},
{"add_result", {"add_res"}},
{"Out", {"tmp"}}},
- {});
+ f::AttributeMap{});
ASSERT_NE(fwd, nullptr);
- std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+ std::shared_ptr<f::OperatorBase> gop =
+ f::Backward(*fwd, std::unordered_set<std::string>{});
ASSERT_TRUE(gop->IsNetOp());
auto net = static_cast<ops::NetOp*>(gop.get());
@@ -346,13 +349,13 @@ TEST(Backward, net_input_of_network_not_need_grad) {
{{"mul_result", {"mul_tmp_0"}},
{"add_result", {"add_tmp_0"}},
{"Out", {"hidden0"}}},
- {}));
+ f::AttributeMap{}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_tmp_1"}},
{"add_result", {"add_tmp_1"}},
{"Out", {"hidden1"}}},
- {}));
+ f::AttributeMap{}));
net.CompleteAddOp();
auto bwd = Backward(net, {"x"}); // x@GRAD is not need.
ASSERT_TRUE(bwd->IsNetOp());
@@ -381,12 +384,13 @@ TEST(Backward, net_input_of_network_not_need_grad) {
TEST(Backward, net_shared_weight) {
ops::NetOp net;
net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
- {{"Out", {"out"}}}, {}));
+ {{"Out", {"out"}}}, f::AttributeMap{}));
net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
- {{"Out", {"FinalOut"}}}, {}));
+ {{"Out", {"FinalOut"}}},
+ f::AttributeMap{}));
net.CompleteAddOp();
- auto bwd = f::Backward(net, {});
+ auto bwd = f::Backward(net, std::unordered_set<std::string>{});
ASSERT_TRUE(bwd->IsNetOp());
auto bwd_net = static_cast<ops::NetOp*>(bwd.get());
ASSERT_EQ(3UL, bwd_net->ops_.size());
@@ -394,8 +398,9 @@ TEST(Backward, net_shared_weight) {
}
TEST(Backward, op_all_input_are_not_need) {
- auto fwd = f::OpRegistry::CreateOp(
- "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
+ auto fwd =
+ f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
+ {{"Out", {"out"}}}, f::AttributeMap{});
auto backward = f::Backward(*fwd, {"x", "b"});
ASSERT_TRUE(backward->IsNetOp());
auto net = static_cast<ops::NetOp*>(backward.get());
@@ -403,8 +408,9 @@ TEST(Backward, op_all_input_are_not_need) {
}
TEST(Backward, op_all_output_are_not_need) {
- auto fwd = f::OpRegistry::CreateOp(
- "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
+ auto fwd =
+ f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
+ {{"Out", {"out"}}}, f::AttributeMap{});
auto backward = f::Backward(*fwd, {"out"});
ASSERT_TRUE(backward->IsNetOp());
auto net = static_cast<ops::NetOp*>(backward.get());
@@ -412,8 +418,9 @@ TEST(Backward, op_all_output_are_not_need) {
}
TEST(Backward, op_part_of_output_are_not_need) {
- auto fwd = f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
- {{"y", {"Y"}}, {"z", {"Z"}}}, {});
+ auto fwd =
+ f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
+ {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{});
auto backward = f::Backward(*fwd, {"Z"});
ASSERT_TRUE(backward->IsNetOp());
auto net = static_cast<ops::NetOp*>(backward.get());
@@ -437,7 +444,7 @@ TEST(Backward, op_part_of_output_are_not_need) {
TEST(Backward, op_part_of_input_are_not_need) {
auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}},
- {{"Out", {"out"}}}, {});
+ {{"Out", {"out"}}}, f::AttributeMap{});
auto backward = f::Backward(*fwd, {"a"});
auto &grad_mul = *backward;
ASSERT_EQ(grad_mul.Type(), "mul_grad");
@@ -458,19 +465,19 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
{{"mul_result", {"mul_out1"}},
{"add_result", {"add_out1"}},
{"Out", {"out1"}}},
- {}));
+ f::AttributeMap{}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_out2"}},
{"add_result", {"tmp_out2"}},
{"Out", {"out2"}}},
- {}));
+ f::AttributeMap{}));
net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
{{"mul_result", {"mul_out3"}},
{"add_result", {"tmp_out3"}},
{"Out", {"out3"}}},
- {}));
+ f::AttributeMap{}));
net.CompleteAddOp();
auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
@@ -509,7 +516,8 @@ TEST(Backward, simple_single_op) {
auto target = f::VarDescBind("out");
target.SetShape({1});
- auto var_to_grad = AppendBackward(program, target, {});
+ auto var_to_grad =
+ AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 3UL);
f::OpDescBind *fill_op = block->AllOps()[1];
@@ -546,7 +554,7 @@ TEST(Backward, default_attribute) {
auto target = f::VarDescBind("out");
target.SetShape({1});
- AppendBackward(program, target, {});
+ AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 3UL);
EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
@@ -585,7 +593,8 @@ TEST(Backward, simple_mult_op) {
auto target = f::VarDescBind("out3");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
- auto var_to_grad = AppendBackward(program, target, {});
+ auto var_to_grad =
+ AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 6UL + 1);
f::OpDescBind *fill_op = block->AllOps()[forward_len];
@@ -817,7 +826,8 @@ TEST(Backward, shared_var) {
auto target = f::VarDescBind("out3");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
- auto var_to_grad = AppendBackward(program, target, {});
+ auto var_to_grad =
+ AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 8UL);
f::OpDescBind *fill_op = block->AllOps()[forward_len];
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 11764810e1d40e5e6eb3cd0d8e9b4b63a79855b4..6a7a07d5cf471a32822cdccf5c616d8748fd1bd7 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/block_desc.h"
+#include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h"
namespace paddle {
@@ -42,6 +43,8 @@ bool BlockDescBind::HasVar(const std::string &name) const {
}
VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
+ if (name == kEmptyVarName) return nullptr;
+
auto it = vars_.find(name);
if (it == vars_.end()) {
return Parent() == kNoneBlockIndex ? nullptr
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 2ffb5b7dbb27b561092856eac0de23d0c3788f75..83aa927c293676c3800ed945c175e4f3dc5629d6 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -97,6 +97,10 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
if (create_local_scope) {
local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) {
+ if (var->Name() == framework::kEmptyVarName) {
+ continue;
+ }
+
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 02a825324328fa5cfd3a4d23a8c64488cc88aeec..7ba1e3e4e3270f4cd88e41e245f24c3cfc8aaab7 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -59,7 +59,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
if (in_var->GetType() != VarDesc::LOD_TENSOR) {
- VLOG(3) << "input " << in << "is not LodTensor";
+ VLOG(3) << "input " << in << " is not LodTensor";
return;
}
PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
@@ -316,8 +316,8 @@ static void InitInferShapeFuncs() {
for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) {
auto op_type = kern_pair.first;
auto &op_info = info_map.at(op_type);
- auto op =
- static_cast<OperatorWithKernel*>(op_info.Creator()("", {}, {}, {}));
+ auto op = static_cast<OperatorWithKernel*>(op_info.Creator()(
+ "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
if (op_info.infer_shape_) { // infer_shape has been registered.
continue;
}
@@ -466,7 +466,12 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
try {
- return framework::make_ddim(var->Shape());
+ auto shape = var->Shape();
+ if (shape.empty()) {
+ return framework::make_ddim({0UL});
+ } else {
+ return framework::make_ddim(var->Shape());
+ }
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index daade439e5232f06be72bc5bb1e2285124f2c3a4..b29238432b05d81e984e1f4c269a00b01a4229cc 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar {
return 0; \
}
-#define REGISTER_OP_GPU_KERNEL(op_type, ...) \
- REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
+#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
+ REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar {
#else
#define USE_OP_KERNEL(op_type) \
USE_OP_DEVICE_KERNEL(op_type, CPU); \
- USE_OP_DEVICE_KERNEL(op_type, GPU)
+ USE_OP_DEVICE_KERNEL(op_type, CUDA)
#endif
#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
@@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar {
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CPU);
-#define USE_GPU_ONLY_OP(op_type) \
- USE_OP_ITSELF(op_type); \
- USE_OP_DEVICE_KERNEL(op_type, GPU)
+#define USE_CUDA_ONLY_OP(op_type) \
+ USE_OP_ITSELF(op_type); \
+ USE_OP_DEVICE_KERNEL(op_type, CUDA)
#define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 93467ab8ac796277b47a861a427de2837fb2d3d4..e83d7547831744333d6a9c36e842d840a2a0dc03 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,20 +22,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
-template <>
-Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
- platform::CPUPlace, Eigen::DefaultDevice>() const {
- return *device_context_.GetEigenDevice();
-}
-
-#ifdef PADDLE_WITH_CUDA
-template <>
-Eigen::GpuDevice&
-ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
- return *device_context_.GetEigenDevice();
-}
-#endif
-
std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -426,13 +412,10 @@ void OperatorWithKernel::Run(const Scope& scope,
}
kernel_iter->second->Compute(ctx);
-
- // throws errors if have.
- dev_ctx.Finish();
}
OpKernelType OperatorWithKernel::GetKernelType(
const ExecutionContext& ctx) const {
- return OpKernelType(IndicateDataType(ctx), ctx.device_context());
+ return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
}
DataType OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 60861d92933dd100f877bec8d43f9b924f951e60..e60dbfc313f732120f6879fd6fd19ca8abc06813 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -276,17 +276,25 @@ class ExecutionContext {
out_tensor->set_lod(in_tensor.lod());
}
- template ::EigenDeviceType>
- DeviceType& GetEigenDevice() const;
-
platform::Place GetPlace() const { return device_context_.GetPlace(); }
+ template <typename DeviceContextType>
+ const DeviceContextType& device_context() const {
+ return *reinterpret_cast<const DeviceContextType*>(&device_context_);
+ }
+
const platform::DeviceContext& device_context() const {
return device_context_;
}
+#ifdef PADDLE_WITH_CUDA
+ const inline platform::CUDADeviceContext& cuda_device_context() const {
+ PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
+ return *reinterpret_cast<const platform::CUDADeviceContext*>(
+ &device_context_);
+ }
+#endif
+
//! Get actual name vector for this input.
const std::vector& Inputs(const std::string& name) const {
return op_.Inputs(name);
@@ -297,14 +305,6 @@ class ExecutionContext {
return op_.Outputs(name);
}
-#ifdef PADDLE_WITH_CUDA
- const inline platform::CUDADeviceContext& cuda_device_context() const {
- PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
- return *reinterpret_cast<const platform::CUDADeviceContext*>(
- &device_context_);
- }
-#endif
-
private:
const OperatorBase& op_;
const Scope& scope_;
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 1e19f82b341768142258ba4a5dfa246d87ba4c43..b678178454ff63e4217f0be7a9938a9ba183cda4 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
- return OpKernelType(DataType::FP32, ctx.device_context());
+ return OpKernelType(DataType::FP32, ctx.GetPlace());
}
};
@@ -261,7 +261,9 @@ class OperatorClone : public paddle::framework::OperatorBase {
};
TEST(Operator, Clone) {
- OperatorClone a("ABC", {}, {}, {});
+ OperatorClone a("ABC", paddle::framework::VariableNameMap{},
+ paddle::framework::VariableNameMap{},
+ paddle::framework::AttributeMap{});
auto b = a.Clone();
ASSERT_EQ(a.Type(), b->Type());
}
diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc
index 5988874809f51c09b3d3d279be6c1e8d43d7a782..f21df37a292fd1e039ee8f8fa26244e26c978cae 100644
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
@@ -54,7 +54,8 @@ TEST(Prune, one_operator) {
f::ProgramDescBind program;
f::BlockDescBind *block = program.MutableBlock(0);
- AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block);
+ AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
+ block);
f::ProgramDesc *pdesc = program.Proto();
f::ProgramDesc pruned;
@@ -71,10 +72,14 @@ TEST(Prune, forward) {
f::ProgramDescBind program;
f::BlockDescBind *block = program.MutableBlock(0);
- AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block);
- AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, {}, block);
- AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, {}, block);
- AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, {}, block);
+ AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
+ block);
+ AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{},
+ block);
+ AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{},
+ block);
+ AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{},
+ block);
f::ProgramDesc *pdesc = program.Proto();
@@ -90,11 +95,14 @@ TEST(Prune, multi_input_op) {
f::ProgramDescBind program;
f::BlockDescBind *block = program.MutableBlock(0);
- AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, {}, block);
- AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, {}, block);
- AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, {}, block);
- AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, {},
+ AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{},
+ block);
+ AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
block);
+ AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{},
+ block);
+ AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}},
+ f::AttributeMap{}, block);
f::ProgramDesc *pdesc = program.Proto();
pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true);
@@ -108,9 +116,12 @@ TEST(Prune, multi_output_op) {
f::ProgramDescBind program;
f::BlockDescBind *block = program.MutableBlock(0);
- AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block);
- AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block);
- AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block);
+ AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
+ f::AttributeMap{}, block);
+ AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
+ block);
+ AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
+ block);
f::ProgramDesc *pdesc = program.Proto();
pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
@@ -124,9 +135,12 @@ TEST(Prune, multi_target) {
f::ProgramDescBind program;
f::BlockDescBind *block = program.MutableBlock(0);
- AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block);
- AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block);
- AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block);
+ AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
+ f::AttributeMap{}, block);
+ AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
+ block);
+ AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
+ block);
f::ProgramDesc *pdesc = program.Proto();
pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 9ad6272c99dd6a85520ae44c1331ac232bc6a9a2..656736e23846c8de50553a608c54a0bdd3272cb1 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -36,12 +36,9 @@ Scope& Scope::NewScope() const {
}
Variable* Scope::Var(const std::string& name) {
- auto iter = vars_.find(name);
- if (iter != vars_.end()) {
- VLOG(3) << "Get existing variable " << name;
- return iter->second;
- }
- Variable* v = new Variable();
+ auto* v = FindVarLocally(name);
+ if (v != nullptr) return v;
+ v = new Variable();
vars_[name] = v;
VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first);
@@ -57,8 +54,10 @@ Variable* Scope::Var(std::string* name) {
}
Variable* Scope::FindVar(const std::string& name) const {
- auto it = vars_.find(name);
- if (it != vars_.end()) return it->second;
+ auto var = FindVarLocally(name);
+ if (var != nullptr) {
+ return var;
+ }
return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
}
@@ -116,6 +115,11 @@ std::string Scope::Rename(const std::string& origin_name) const {
Rename(origin_name, var_name);
return var_name;
}
+Variable* Scope::FindVarLocally(const std::string& name) const {
+ auto it = vars_.find(name);
+ if (it != vars_.end()) return it->second;
+ return nullptr;
+}
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index c2aafb6ad825f9bd9ffef754923a15afdeaa8e5c..56e815db54b6385c4e4d87f456ed5d59113ca77b 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -76,6 +76,8 @@ class Scope {
std::string Rename(const std::string& origin_name) const;
private:
+ Variable* FindVarLocally(const std::string& name) const;
+
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const* parent) : parent_(parent) {}
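
The Scope change above factors the map lookup into a private FindVarLocally(), so Var() can reuse an existing local variable and FindVar() still falls back to the parent scope. A simplified standalone sketch of that lookup order (toy types, not the real Scope class):

```cpp
#include <string>
#include <unordered_map>

// Toy model of the lookup order used by paddle::framework::Scope.
struct ToyScope {
  const ToyScope* parent = nullptr;
  std::unordered_map<std::string, int> vars;

  // Local-only lookup; shared by Var() and FindVar() in the real code.
  const int* FindVarLocally(const std::string& name) const {
    auto it = vars.find(name);
    return it == vars.end() ? nullptr : &it->second;
  }

  // Full lookup: current scope first, then walk up the parent chain.
  const int* FindVar(const std::string& name) const {
    if (const int* v = FindVarLocally(name)) return v;
    return parent ? parent->FindVar(name) : nullptr;
  }
};
```
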
diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index 2298507471c54c5b7751beff900466737eea36d4..7dac1cfd5ee0c320c67bc0b2448417d258d6862b 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -12,6 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/shape_inference.h"
+#include "grad_op_desc_maker.h"
+#include "paddle/framework/operator.h"
namespace paddle {
namespace framework {
@@ -49,6 +51,9 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
size_t length = names.size();
PADDLE_ENFORCE_EQ(length, dims.size());
for (size_t i = 0; i < length; ++i) {
+ if (names[i] == framework::kEmptyVarName) {
+ continue;
+ }
SetDim(names[i], dims[i]);
}
}
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index f5a41b66bf09a4abc5ae7b64f227ca52461408f5..57c890e4884da38e2087d89dc199e20af51495ea 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -24,7 +24,7 @@ limitations under the License. */
#include "paddle/utils/ClassRegistrar.h"
#include "paddle/utils/Logging.h"
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
#include "MKLDNNActivation.h"
#endif
@@ -490,7 +490,7 @@ Error __must_check backward(Argument& act) {
END_DEFINE_ACTIVATION(log)
ActivationFunction* ActivationFunction::create(const std::string& type) {
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
return MKLDNNActivation::create(type);
}
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index be112b41239cace3fa9b9ee97923f8c3c7a9a98f..68bf37d59db65ddc8096e2db3391be25c37b57e6 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
#include "paddle/gserver/layers/MKLDNNLayer.h"
#endif
@@ -307,7 +307,7 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
}
void NeuralNetwork::finish() {
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
FOR_EACH_R(layer, layers_) {
MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
if (dnnLayer) {
diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp
index 48132a3ce4cc4b50fea6d755d84d7254d2055bec..e7f081c0232d185c223fc2f48ca79dc84c7f721d 100644
--- a/paddle/gserver/layers/ConvTransProjection.cpp
+++ b/paddle/gserver/layers/ConvTransProjection.cpp
@@ -24,13 +24,13 @@ size_t ConvTransProjection::calOutputSize() {
if (outputH_ == 0) outputH_ = configOutH_;
if (outputW_ == 0) outputW_ = configOutW_;
imageH_ = imageSize(outputH_,
- filterH_,
+ (filterH_ - 1) * dilationH_ + 1,
paddingH_,
strideH_,
/* caffeMode */ true);
imageW_ = imageSize(outputW_,
- filterW_,
+ (filterW_ - 1) * dilationW_ + 1,
paddingW_,
strideW_,
/* caffeMode */ true);
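
The ConvTransProjection fix above replaces the raw filter size with its dilated extent when computing the deconvolution image size: a kernel of size k with dilation d covers (k - 1) * d + 1 input positions per axis. A one-line sketch with a worked number (illustrative only):

```cpp
// Effective kernel extent under dilation, as used in calOutputSize() above.
inline int dilatedFilterSize(int filterSize, int dilation) {
  return (filterSize - 1) * dilation + 1;
}
// Example: a 3x3 filter with dilation 2 spans dilatedFilterSize(3, 2) == 5 pixels.
```
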
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..741984bb68d3881f6ac26eaca7790190ed6e572a
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
@@ -0,0 +1,163 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNLRNLayer.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn; // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer);
+
+bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+ return false;
+ }
+
+ /* the size of inputs for norm-layer is 1 */
+ CHECK_EQ(config_.inputs_size(), 1UL);
+ const NormConfig& conf = config_.inputs(0).norm_conf();
+ localSize_ = conf.size();
+ alpha_ = conf.scale();
+ beta_ = conf.pow();
+
+ ic_ = conf.channels();
+ oc_ = ic_;
+ iw_ = conf.img_size();
+ ow_ = conf.output_x();
+ ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+ oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+ CHECK_EQ(iw_, ow_);
+ CHECK_EQ(ih_, oh_);
+ return true;
+}
+
+void MKLDNNLRNLayer::reshape(
+ int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+ CHECK_EQ(inputLayers_.size(), 1UL);
+ reshapeInput(bs, ih, iw);
+ // ic_ and oc can not be changed
+ CHECK_EQ((size_t)ic,
+ inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
+ << "Input channel can not be changed";
+ oh = ih;
+ ow = iw;
+ reshapeOutput(oh, ow);
+ resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNLRNLayer::resetFwd(std::vector<primitive>& pipeline,
+ std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& out) {
+ resetFwdBuffers(inputs[0], out);
+
+ resetFwdPD(fwdPD_, inputs[0], out);
+
+ resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetBwd(std::vector<primitive>& pipeline,
+ std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& out) {
+ std::shared_ptr<lrn_bwd::primitive_desc> pd;
+
+ resetBwdBuffers(inputs[0], out);
+
+ resetBwdPD(pd, inputs[0], out);
+
+ resetBwdPipeline(pipeline, pd, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& out) {
+ resetInValue(in);
+ CHECK(in);
+ resetOutValue(out, in->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+ MKLDNNMatrixPtr in,
+ MKLDNNMatrixPtr out) {
+ prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+ : prop_kind::forward_training;
+ auto fwdDesc = lrn_fwd::desc(pk,
+ algorithm::lrn_across_channels,
+ in->getMemoryDesc(),
+ localSize_,
+ alpha_,
+ beta_,
+ 1.0f);
+ pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_));
+ // prepare workspace if necessary
+ workspace_ =
+ passType_ != PASS_TEST
+ ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+ : nullptr;
+}
+
+void MKLDNNLRNLayer::resetFwdPipeline(
+ std::vector<primitive>& pipeline,
+ std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& out) {
+ fwd_ = workspace_
+ ? std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *workspace_, *out))
+ : std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *out));
+ pipeline.push_back(*fwd_);
+}
+
+void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& out) {
+ CHECK(inVals_[0] && outVal_);
+ resetOutGrad(out, outVal_->getPrimitiveDesc());
+ resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& out) {
+ pd = nullptr;
+ if (in == nullptr) {
+ return;
+ }
+ CHECK(out);
+ auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels,
+ in->getMemoryDesc(),
+ out->getMemoryDesc(),
+ localSize_,
+ alpha_,
+ beta_,
+ 1.0f);
+ pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNLRNLayer::resetBwdPipeline(
+ std::vector<primitive>& pipeline,
+ std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& out) {
+ if (pd == nullptr) {
+ return;
+ }
+ CHECK(inVals_[0]);
+ CHECK(workspace_);
+ bwdData_ = std::make_shared<lrn_bwd>(
+ lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in));
+ pipeline.push_back(*bwdData_);
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfe5621252c71a1de9a0a42a2a88e221e3e56972
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::lrn_forward lrn_fwd;
+typedef mkldnn::lrn_backward lrn_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer.
+ *
+ * The config file api is mkldnn_lrn
+ */
+class MKLDNNLRNLayer : public MKLDNNLayer {
+protected:
+ // save forward primitive_desc, which can be used in backward
+ std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
+ // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+ // test_lrn_backward.cpp, lrn need workspace for backward
+ std::shared_ptr<mkldnn::memory> workspace_;
+
+ int localSize_;
+ float alpha_, beta_; // scale and pow in paddle
+
+public:
+ explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+ ~MKLDNNLRNLayer() {}
+
+ bool init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) override;
+
+ void reshape(
+ int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+ void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+ std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& out) override;
+
+ void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+ std::vector<MKLDNNMatrixPtr>& inputs,
+ MKLDNNMatrixPtr& out) override;
+
+protected:
+ void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+ void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+ MKLDNNMatrixPtr in,
+ MKLDNNMatrixPtr out);
+ void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+ std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& out);
+ void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+ void resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& out);
+ void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+ std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+ MKLDNNMatrixPtr& in,
+ MKLDNNMatrixPtr& out);
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 87613a96c5b3c2da212f63e9e678bcd22308b08e..fceb389d06d8d2cb0357186bf83edda9957c6c19 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -45,6 +45,8 @@ bool PoolLayer::init(const LayerMap& layerMap,
strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+
+ excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
return true;
}
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index d43292ad2d4bbe1229ca59ca21bee92c9ec006a3..9df672a935868e9c61f4dd1fd47a9c309b214f12 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -38,6 +38,8 @@ protected:
std::string poolType_;
+ bool excludeMode_;
+
public:
explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp
index d90b438448eb72e72e22e9a91a3cbcd84ac7e6cb..6a9de394cee3769784a38f5512b15f52b1ed6fa1 100644
--- a/paddle/gserver/layers/PoolProjection.cpp
+++ b/paddle/gserver/layers/PoolProjection.cpp
@@ -36,6 +36,8 @@ PoolProjection::PoolProjection(const ProjectionConfig& config,
strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+
+ excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
}
size_t PoolProjection::getSize() {
@@ -141,7 +143,8 @@ void AvgPoolProjection::forward() {
outputY_,
outputX_,
confPaddingY_,
- confPadding_);
+ confPadding_,
+ excludeMode_);
}
void AvgPoolProjection::backward(const UpdateCallback& callback) {
@@ -166,6 +169,7 @@ void AvgPoolProjection::backward(const UpdateCallback& callback) {
1,
1,
confPaddingY_,
- confPadding_);
+ confPadding_,
+ excludeMode_);
}
} // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
index 9a75f465f6fbb2f2a928b0e36fcfbe0e510d7b3a..a0412714bca7a273e999e4d6bd552e833d20d69c 100644
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
@@ -28,6 +28,7 @@ protected:
int confPaddingY_, confPadding_;
size_t channels_;
std::string poolType_;
+ bool excludeMode_;
public:
PoolProjection(const ProjectionConfig& config,
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 24e6cae8e69557c42ed5d437edce101709ca3983..b578a906c2027a1169a0098b93f8d0742920f99d 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -1,5 +1,4 @@
# gserver package unittests
-
add_simple_unittest(test_LinearChainCRF)
add_simple_unittest(test_RecurrentLayer)
@@ -29,6 +28,26 @@ gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
+set(PYTHON_PATH
+ ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+ ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
+function(gserver_test_with_python TARGET)
+ add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+ add_test(NAME ${TARGET}
+ COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+ WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
+
+gserver_test_with_python(test_PyDataProvider2)
+if(WITH_PYTHON)
+ gserver_test_with_python(test_PyDataProvider)
+endif()
+if(NOT MOBILE_INFERENCE)
+ gserver_test_with_python(test_CompareTwoNets)
+ # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
+ gserver_test_with_python(test_RecurrentGradientMachine)
+endif()
+
########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN)
add_unittest_without_exec(test_MKLDNN
@@ -36,86 +55,43 @@ if(WITH_MKLDNN)
MKLDNNTester.cpp
LayerGradUtil.cpp)
add_test(NAME test_MKLDNN
- COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
- ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
+ COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif()
-############## test_PyDataProvider ########################
-if(WITH_PYTHON)
- add_unittest_without_exec(test_PyDataProvider
- test_PyDataProvider.cpp)
-
- add_test(NAME test_PyDataProvider
- COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
- WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-endif()
-
############### test_WarpCTCLayer #######################
if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
add_unittest_without_exec(test_WarpCTCLayer
test_WarpCTCLayer.cpp)
-
add_test(NAME test_WarpCTCLayer
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
endif()
if(NOT MOBILE_INFERENCE)
- ################## test_Evaluator #######################
+ ################## test_Evaluator #############
add_unittest(test_Evaluator
test_Evaluator.cpp)
- ############### test_RecurrentGradientMachine ###############
- # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
- # I will fix it.
- add_unittest_without_exec(test_RecurrentGradientMachine
- test_RecurrentGradientMachine.cpp)
- add_test(NAME test_RecurrentGradientMachine
- COMMAND .set_python_path.sh -d
- ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
- ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
- WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-
- ############### test_NetworkCompare ###############
+ ########### test_NetworkCompare ###############
add_unittest_without_exec(test_NetworkCompare
test_NetworkCompare.cpp)
if(WITH_GPU)
- add_test(NAME test_NetworkCompare
- COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
- WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+ set(use_gpu true)
else()
- add_test(NAME test_NetworkCompare
- COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
- WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+ set(use_gpu false)
endif()
+ add_test(NAME test_NetworkCompare
+ COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
+ WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
- ################# test_CompareSparse ##################
+ ############ test_CompareSparse ################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
- COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
- ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
- ./.set_port.sh -p port -n 6
- ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+ COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
+ ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
-
- ################ test_CompareTwoNets ######################
- add_unittest_without_exec(test_CompareTwoNets
- test_CompareTwoNets.cpp)
- add_test(NAME test_CompareTwoNets
- COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
- ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
- ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
- WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
-
-################ test_PyDataProvider2 ######################
-add_unittest_without_exec(test_PyDataProvider2
- test_PyDataProvider2.cpp)
-add_test(NAME test_PyDataProvider2
- COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
- WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
-)
diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/gserver/tests/mkldnn_simple_net.conf
index 8bbe91e56d0ba6da06475ad16f3162ee1103ee02..0e9d6b31fa8776136b4eee29311383ae6bb21644 100644
--- a/paddle/gserver/tests/mkldnn_simple_net.conf
+++ b/paddle/gserver/tests/mkldnn_simple_net.conf
@@ -51,6 +51,8 @@ tmp = img_pool_layer(input=tmp,
padding=1,
pool_type=MaxPooling())
+tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75)
+
tmp = fc_layer(input=tmp,
size=channels,
bias_attr=False,
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index c5359f272b4bed4d4d2483bf19d7ae482b0d33dd..a2f07937b8834e3f3fa7a6bf2ae10f29a8d84f29 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -238,9 +238,24 @@ void testProjectionConv(size_t groups, bool isDeconv) {
/* caffeMode */ true);
conv->set_output_x(output_x);
conv->set_output_y(output_y);
+ LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x
+ << "; output_y: " << output_y;
if (isDeconv) {
+ int deconv_image_x = imageSize(output_x,
+ (conv->filter_size() - 1) * DILATION + 1,
+ conv->padding(),
+ conv->stride(),
+ /* caffeMode */ true);
+ int deconv_image_y = imageSize(output_y,
+ (conv->filter_size_y() - 1) * DILATION + 1,
+ conv->padding_y(),
+ conv->stride_y(),
+ /* caffeMode */ true);
+
+ LOG(INFO) << " deconv_image_x: " << deconv_image_x
+ << "; deconv_image_y: " << deconv_image_y;
conf.set_input_size(output_x * output_y * CHANNELS);
- conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS);
+ conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS);
} else {
conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
conf.set_output_size(output_x * output_y * NUM_FILTERS);
@@ -1211,7 +1226,10 @@ void setPoolConfig(TestConfig* config,
pool->set_output_y(oh);
}
-void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
+void testPoolLayer(const string& poolType,
+ bool trans,
+ bool useGpu,
+ bool excludeMode = true) {
TestConfig config;
config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
LayerInputConfig* input = config.layerConfig.add_inputs();
@@ -1219,6 +1237,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
pool->set_img_size(14);
pool->set_img_size_y(14);
+ pool->set_exclude_mode(excludeMode);
setPoolConfig(&config, pool, poolType);
config.layerConfig.set_size(pool->output_x() * pool->output_y() *
pool->channels());
@@ -1250,16 +1269,26 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
TEST(Layer, PoolLayer) {
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
+ testPoolLayer("avg-projection",
+ /* trans= */ false,
+ /* useGpu= */ false,
+ /* excludeMode= */ false);
testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
#ifdef PADDLE_WITH_CUDA
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
+ testPoolLayer("avg-projection",
+ /* trans= */ false,
+ /* useGpu= */ true,
+ /* excludeMode= */ false);
testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+ testPoolLayer2(
+ "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
#endif
}
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 56b523f220c2a405851b89db5f63e9aa50bfaaf7..ad1dbc3ee2bfd00a94de06f1e1b2ffe64f19b417 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -272,6 +272,51 @@ TEST(MKLDNNLayer, BatchNormLayer) {
testBatchNormLayer({4, 16, 8, 10});
}
+struct testLRNDesc {
+ int bs, ic, ih, iw;
+ float scale, pow;
+ int localSize;
+};
+
+void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) {
+ cfg.layerConfig.set_type("mkldnn_lrn");
+ cfg.layerConfig.set_active_type("relu");
+ size_t layerSize = pm.ic * pm.ih * pm.iw;
+ cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
+ LayerInputConfig* input = cfg.layerConfig.add_inputs();
+ NormConfig* norm = input->mutable_norm_conf();
+ norm->set_channels(pm.ic);
+ norm->set_size(pm.localSize);
+ norm->set_scale(pm.scale);
+ norm->set_pow(pm.pow);
+ norm->set_blocked(0);
+ norm->set_img_size(pm.iw);
+ norm->set_img_size_y(pm.ih);
+ norm->set_output_x(norm->img_size());
+ norm->set_output_y(norm->img_size_y());
+ cfg.layerConfig.set_size(layerSize);
+ cfg.biasSize = 0;
+}
+
+void testLRNLayer(const testLRNDesc& pm) {
+ TestConfig dnnConfig;
+ getMKLDNNLRNConfig(dnnConfig, pm);
+ // mkldnn_lrn <==> norm with cmrnorm-projection type
+ TestConfig refConfig = dnnConfig;
+ refConfig.layerConfig.set_type("norm");
+ LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0);
+ NormConfig* norm = input->mutable_norm_conf();
+ norm->set_norm_type("cmrnorm-projection");
+ norm->set_scale(norm->scale() / norm->size());
+ RUN_MKLDNN_TEST(dnnConfig, refConfig, pm)
+}
+
+TEST(MKLDNNLayer, LRNLayer) {
+ testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5});
+ testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5});
+ testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5});
+}
+
struct testImageDesc {
int bs, ic, ih, iw;
};
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index 94ef561f066a127496e2849a419835e175c526d7..17563bf5e1649361b83b896bf864b922296a5487 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,7 @@ public:
*/
virtual void* alloc(size_t size) {
void* ptr;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
// refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
// memory alignment
CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index ba86eacbb5d53ee43a60d2cd1dd922333a5d48f0..28ab54b450c96b4bdefdf36813595766162b1434 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -206,7 +206,7 @@ double dotProduct(const int n, const double* x, const double* y) {
}
#endif
-#if defined(PADDLE_USE_MKLML)
+#if defined(PADDLE_WITH_MKLML)
template <>
void vExp(const int n, const float* a, float* r) {
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index f6e77029bdd75a602f88b688ca810f47ba4ee615..29fe36e3a4bd5e5d372480950a03142822262d41 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -15,7 +15,7 @@ limitations under the License. */
#ifndef MATHFUNCTIONS_H_
#define MATHFUNCTIONS_H_
-#ifdef PADDLE_USE_MKLML
+#ifdef PADDLE_WITH_MKLML
#include
#include
#include
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 88e9180690606c92cf46c5b295d80f14e5d64567..1ec4336cabbc7d3073b7638b7484bf61e83a2dc5 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -28,6 +28,7 @@ limitations under the License. */
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"
+#include "NEONFunctions.h"
#include "paddle/function/GemmFunctor.h"
#include "paddle/utils/ThreadLocal.h"
@@ -1130,7 +1131,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
size_t outputH,
size_t outputW,
size_t paddingH,
- size_t paddingW) {
+ size_t paddingW,
+ bool excludeMode) {
CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
real* inputData = inputMat.getData();
@@ -1153,7 +1155,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
paddingH,
paddingW,
data_,
- getStride());
+ getStride(),
+ excludeMode);
}
void GpuMatrix::avgPoolBackward(Matrix& outGrad,
@@ -1168,7 +1171,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
real scaleTargets,
real scaleOutput,
size_t paddingH,
- size_t paddingW) {
+ size_t paddingW,
+ bool excludeMode) {
CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
real* outDiff = outGrad.getData();
@@ -1194,7 +1198,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
scaleTargets,
scaleOutput,
data_,
- outGrad.getStride());
+ outGrad.getStride(),
+ excludeMode);
}
void GpuMatrix::maxPool3DForward(Matrix& inputMat,
@@ -2136,7 +2141,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
size_t outputH,
size_t outputW,
size_t paddingH,
- size_t paddingW) {
+ size_t paddingW,
+ bool excludeMode) {
// The main loop
size_t num = input.getHeight();
size_t inLength = imgSizeH * imgSizeW;
@@ -2165,7 +2171,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
}
}
- int poolSize = (hend - hstart) * (wend - wstart);
+ int poolSize =
+ excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
CHECK(poolSize);
tgtData[ph * outputW + pw] /= poolSize;
}
@@ -2189,7 +2196,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
real scaleTargets,
real scaleOutput,
size_t paddingH,
- size_t paddingW) {
+ size_t paddingW,
+ bool excludeMode) {
size_t num = input.getHeight();
size_t channels = input.getWidth() / outputH / outputW;
size_t inLength = imgSizeH * imgSizeW;
@@ -2211,7 +2219,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
int wstart = pw * strideW - paddingW;
int wend = std::min(wstart + sizeX, imgSizeW);
wstart = std::max(wstart, 0);
- int poolSize = (hend - hstart) * (wend - wstart);
+ int poolSize =
+ excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
CHECK(poolSize);
for (int h = hstart; h < hend; ++h) {
@@ -4157,16 +4166,36 @@ void CpuMatrix::print(std::ostream& os) const {
void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
real* input = data.getData();
real* w = W.getData();
+ real* output = data_;
size_t numElements = data.getWidth();
size_t numSamples = data.getHeight();
size_t paraSize = W.getHeight() * W.getWidth();
CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init
+
size_t partial_sum = numElements / paraSize;
+ if (paraSize == numElements) {
+ for (size_t n = 0; n < numSamples * numElements; ++n) {
+ output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements];
+ }
+ return;
+ }
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+ for (size_t n = 0; n < numSamples; ++n) {
+ for (size_t i = 0; i < paraSize; i++) {
+ neon::prelu(
+ input + i * partial_sum, w[i], output + i * partial_sum, partial_sum);
+ }
+ input = input + numElements;
+ output = output + numElements;
+ }
+#else
for (size_t n = 0, k = 0; n < numSamples; ++n) {
for (size_t i = 0; i < numElements; ++i, ++k) {
- data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
+ output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
}
}
+#endif
}
void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
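
In paramReluForward above, each of the paraSize weights is shared by a contiguous run of partial_sum = numElements / paraSize activations, and paraSize == numElements degenerates to element-wise PReLU; the NEON branch simply vectorizes the shared-weight case. A scalar reference of that indexing, useful for cross-checking the NEON path (sketch only, simplified types):

```cpp
// Scalar reference: weight i applies to elements [i * partialSum, (i + 1) * partialSum)
// of every sample, matching the non-NEON branch above.
void paramReluForwardRef(const float* in, const float* w, float* out,
                         int numSamples, int numElements, int paraSize) {
  int partialSum = numElements / paraSize;
  for (int n = 0; n < numSamples; ++n) {
    for (int i = 0; i < numElements; ++i) {
      int k = n * numElements + i;
      out[k] = in[k] > 0.0f ? in[k] : in[k] * w[i / partialSum];
    }
  }
}
```
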
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index e273f1123690e31984c97185c5a8bc5e7b92c38c..c8e690e6421668bdade4e50a61882c915b2ddc7c 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -911,7 +911,8 @@ public:
size_t outputH,
size_t outputW,
size_t paddingH,
- size_t paddingW) {
+ size_t paddingW,
+ bool excludeMode = true) {
LOG(FATAL) << "Not implemeted";
}
@@ -927,9 +928,11 @@ public:
real scaleTargets,
real scaleOutput,
size_t paddingH,
- size_t paddingW) {
+ size_t paddingW,
+ bool excludeMode = true) {
LOG(FATAL) << "Not implemeted";
}
+
/**
* Pooling 3D forward operation, pick out the largest element
* in the sizeX of value
@@ -1458,7 +1461,8 @@ public:
size_t outputH,
size_t outputW,
size_t paddingH,
- size_t paddingW);
+ size_t paddingW,
+ bool excludeMode = true);
void avgPoolBackward(Matrix& input,
size_t imgSizeH,
@@ -1472,7 +1476,8 @@ public:
real scaleTargets,
real scaleOutput,
size_t paddingH,
- size_t paddingW);
+ size_t paddingW,
+ bool excludeMode = true);
void maxPool3DForward(Matrix& inputMat,
Matrix& maxPoolIdx,
@@ -1730,7 +1735,8 @@ public:
size_t outputH,
size_t outputW,
size_t paddingH,
- size_t paddingW);
+ size_t paddingW,
+ bool excludeMode = true);
void avgPoolBackward(Matrix& input,
size_t imgSizeH,
@@ -1744,7 +1750,8 @@ public:
real scaleTargets,
real scaleOutput,
size_t paddingH,
- size_t paddingW);
+ size_t paddingW,
+ bool excludeMode = true);
void maxPool3DForward(Matrix& inputMat,
Matrix& maxPoolIdx,
diff --git a/paddle/math/NEONFunctions.cpp b/paddle/math/NEONFunctions.cpp
index 3bf47901f1069ac228fa1b877e29848d8cc130e8..0f8314942290a71dd327437b8a6da2d64fe48444 100644
--- a/paddle/math/NEONFunctions.cpp
+++ b/paddle/math/NEONFunctions.cpp
@@ -49,6 +49,46 @@ void relu(const float* a, float* b, int len) {
}
}
+// b[i] = a[i] > 0.0f ? a[i] : a[i] * w
+void prelu(const float* a, float w, float* b, int len) {
+ int offset = len % 16;
+ float32x4_t ma0, ma1, ma2, ma3;
+
+ float32x4_t zero = vdupq_n_f32(0.f);
+ float32x4_t vw = vdupq_n_f32(w);
+
+ for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
+ ma0 = vld1q_f32(a);
+ ma1 = vld1q_f32(a + 4);
+ ma2 = vld1q_f32(a + 8);
+ ma3 = vld1q_f32(a + 12);
+
+ uint32x4_t flag0 = vcgtq_f32(ma0, zero);
+ uint32x4_t flag1 = vcgtq_f32(ma1, zero);
+ uint32x4_t flag2 = vcgtq_f32(ma2, zero);
+ uint32x4_t flag3 = vcgtq_f32(ma3, zero);
+
+ float32x4_t mul0 = vmulq_f32(ma0, vw);
+ float32x4_t mul1 = vmulq_f32(ma1, vw);
+ float32x4_t mul2 = vmulq_f32(ma2, vw);
+ float32x4_t mul3 = vmulq_f32(ma3, vw);
+
+ ma0 = vbslq_f32(flag0, ma0, mul0);
+ ma1 = vbslq_f32(flag1, ma1, mul1);
+ ma2 = vbslq_f32(flag2, ma2, mul2);
+ ma3 = vbslq_f32(flag3, ma3, mul3);
+
+ vst1q_f32(b, ma0);
+ vst1q_f32(b + 4, ma1);
+ vst1q_f32(b + 8, ma2);
+ vst1q_f32(b + 12, ma3);
+ }
+
+ for (int i = 0; i < offset; i++) {
+ b[i] = a[i] > 0.0f ? a[i] : a[i] * w;
+ }
+}
+
} // namespace neon
} // namespace paddle
diff --git a/paddle/math/NEONFunctions.h b/paddle/math/NEONFunctions.h
index 69085e333547a31a341fbfde247f1e30adb957ee..d67b2f47a85a963949d23415e4f6881658203bb7 100644
--- a/paddle/math/NEONFunctions.h
+++ b/paddle/math/NEONFunctions.h
@@ -18,6 +18,7 @@ namespace paddle {
namespace neon {
void relu(const float* a, float* b, int len);
+void prelu(const float* a, float w, float* b, int len);
} // namespace neon
} // namespace paddle
diff --git a/paddle/math/float16.h b/paddle/math/float16.h
new file mode 100644
index 0000000000000000000000000000000000000000..76ad3a01239e409caeefc36a3d562ed5e388dc92
--- /dev/null
+++ b/paddle/math/float16.h
@@ -0,0 +1,739 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#endif // PADDLE_WITH_CUDA
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+#include "paddle/platform/hostdevice.h"
+
+#ifdef __GNUC__
+#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
+#else
+#define PADDLE_GNUC_VER 0
+#endif // __GNUC__
+
+#ifdef __clang__
+#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__)
+#else
+#define PADDLE_CLANG_VER 0
+#endif // __clang__
+
+#if defined(__CUDACC__) && CUDA_VERSION >= 7050
+#define PADDLE_CUDA_FP16
+#include <cuda_fp16.h>
+#endif
+
+#if defined(__arm__) || defined(__aarch64__)
+#define PADDLE_ARM
+#endif
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#define PADDLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
+ (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37)
+#define PADDLE_WITH_NATIVE_FP16
+#endif
+
+#ifndef PADDLE_ARM
+#include <immintrin.h>
+#endif // PADDLE_ARM
+
+#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
+
+namespace paddle {
+
+// Use PADDLE_ALIGN(2) to ensure that each float16 will be allocated
+// and aligned at least on a 2-byte boundary, which leads to efficient
+// memory access of float16 struct and also makes float16 compatible
+// with CUDA half, ARM float16_t, and Eigen::half data types.
+struct PADDLE_ALIGN(2) float16 {
+public:
+ uint16_t x;
+
+ // Constructors
+ HOSTDEVICE inline float16() : x(0) {}
+
+ HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
+
+#ifdef PADDLE_CUDA_FP16
+ HOSTDEVICE inline explicit float16(const half& h) {
+#if CUDA_VERSION >= 9000
+ x = reinterpret_cast<__half_raw*>(&h)->x;
+#else
+ x = h.x;
+#endif // CUDA_VERSION >= 9000
+ }
+#endif // PADDLE_CUDA_FP16
+
+ HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {}
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+ // __fp16 is a native half precision data type for arm cpu,
+ // float16_t is an alias for __fp16
+ HOSTDEVICE inline explicit float16(const float16_t& h) {
+    x = *reinterpret_cast<const uint16_t*>(&h);
+ }
+#endif
+
+ HOSTDEVICE inline explicit float16(float val) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ half tmp = __float2half(val);
+  x = *reinterpret_cast<uint16_t*>(&tmp);
+
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+ float32x4_t tmp = vld1q_dup_f32(&val);
+ float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
+  x = *reinterpret_cast<uint16_t*>(&res);
+
+#elif defined(__F16C__)
+ x = _cvtss_sh(val, 0);
+
+#else
+ // Conversion routine adapted from
+ // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+ Bits v, s;
+ v.f = val;
+ uint32_t sign = v.si & sigN;
+ v.si ^= sign;
+ sign >>= shiftSign; // logical shift
+ s.si = mulN;
+ s.si = s.f * v.f; // correct subnormals
+ v.si ^= (s.si ^ v.si) & -(minN > v.si);
+ v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
+ v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
+ v.ui >>= shift; // logical shift
+ v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
+ v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
+ x = v.ui | sign;
+
+#endif
+ }
+
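+  // 0x3c00 is the IEEE 754 binary16 bit pattern for 1.0.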
+ HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
+
+  template <class T>
+  HOSTDEVICE inline explicit float16(const T& val)
+      : x(float16(static_cast<float>(val)).x) {}
+
+ HOSTDEVICE inline float16& operator=(const float16& rhs) {
+ x = rhs.x;
+ return *this;
+ }
+
+// Assignment operators
+#ifdef PADDLE_CUDA_FP16
+ HOSTDEVICE inline float16& operator=(const half& rhs) {
+#if CUDA_VERSION >= 9000
+ x = reinterpret_cast<__half_raw*>(&rhs)->x;
+#else
+ x = rhs.x;
+#endif
+ return *this;
+ }
+#endif
+
+ HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
+ x = rhs.x;
+ return *this;
+ }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+ HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
+    x = *reinterpret_cast<const uint16_t*>(&rhs);
+ return *this;
+ }
+#endif
+
+ HOSTDEVICE inline float16& operator=(bool b) {
+ x = b ? 0x3c00 : 0;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(int8_t val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(uint8_t val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(int16_t val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(uint16_t val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(int32_t val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(uint32_t val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(int64_t val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(uint64_t val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(float val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+ HOSTDEVICE inline float16& operator=(double val) {
+ x = float16(val).x;
+ return *this;
+ }
+
+// Conversion operators
+#ifdef PADDLE_CUDA_FP16
+ HOSTDEVICE inline explicit operator half() const {
+#if CUDA_VERSION >= 9000
+ __half_raw h;
+ h.x = x;
+ return half(h);
+#else
+ half h;
+ h.x = x;
+ return h;
+#endif // CUDA_VERSION >= 9000
+ }
+#endif // PADDLE_CUDA_FP16
+
+ HOSTDEVICE inline explicit operator Eigen::half() const {
+ Eigen::half h;
+ h.x = x;
+ return h;
+ }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+ HOSTDEVICE inline explicit operator float16_t() const {
+    return *reinterpret_cast<const float16_t*>(this);
+ }
+#endif
+
+ HOSTDEVICE inline explicit operator float() const {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = *reinterpret_cast<const half*>(this);
+ return __half2float(tmp);
+
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
+ return vgetq_lane_f32(vcvt_f32_f16(res), 0);
+
+#elif defined(__F16C__)
+ return _cvtsh_ss(this->x);
+
+#else
+ // Conversion routine adapted from
+ // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+ Bits v;
+ v.ui = this->x;
+ int32_t sign = v.si & sigC;
+ v.si ^= sign;
+ sign <<= shiftSign;
+ v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
+ v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
+ Bits s;
+ s.si = mulC;
+ s.f *= v.si;
+ int32_t mask = -(norC > v.si);
+ v.si <<= shift;
+ v.si ^= (s.si ^ v.si) & mask;
+ v.si |= sign;
+ return v.f;
+
+#endif
+ }
+
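+  // Mask off the sign bit so that both +0.0 and -0.0 convert to false.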
+ HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
+
+  HOSTDEVICE inline explicit operator int8_t() const {
+    return static_cast<int8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint8_t() const {
+    return static_cast<uint8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int16_t() const {
+    return static_cast<int16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint16_t() const {
+    return static_cast<uint16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int32_t() const {
+    return static_cast<int32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint32_t() const {
+    return static_cast<uint32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int64_t() const {
+    return static_cast<int64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint64_t() const {
+    return static_cast<uint64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator double() const {
+    return static_cast<double>(float(*this));
+  }
+
+private:
+ union Bits {
+ float f;
+ int32_t si;
+ uint32_t ui;
+ };
+
+ static const int shift = 13;
+ static const int shiftSign = 16;
+
+ static const int32_t infN = 0x7F800000;
+ static const int32_t maxN = 0x477FE000; // max flt16 as flt32
+ static const int32_t minN = 0x38800000; // min flt16 normal as flt32
+ static const int32_t sigN = 0x80000000; // sign bit
+
+ static constexpr int32_t infC = infN >> shift;
+ static constexpr int32_t nanN = (infC + 1)
+ << shift; // minimum flt16 nan as float32
+ static constexpr int32_t maxC = maxN >> shift;
+ static constexpr int32_t minC = minN >> shift;
+ static constexpr int32_t sigC = sigN >> shiftSign;
+
+ static const int32_t mulN = 0x52000000; // (1 << 23) / minN
+ static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift))
+ static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted
+ static const int32_t norC = 0x00400; // min flt32 normal downshifted
+
+ static constexpr int32_t maxD = infC - maxC - 1;
+ static constexpr int32_t minD = minC - subC - 1;
+};
+
+// Arithmetic operators on GPU
+// CUDA 9.0 provides built-in arithmetic operators for half while
+// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
+// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
+// CUDA 9.0 regarding the half data type.
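+//
+// A minimal usage sketch (device code, CUDA < 9.0; names are illustrative):
+//   half a = __float2half(1.5f);
+//   half b = __float2half(2.5f);
+//   half c = a + b;  // falls back to float math below __CUDA_ARCH__ 530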
+#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
+
+DEVICE inline half operator+(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hadd(a, b);
+#else
+ float res = float(float16(a)) + float(float16(b));
+ return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator-(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hsub(a, b);
+#else
+ float res = float(float16(a)) - float(float16(b));
+ return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator*(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hmul(a, b);
+#else
+ float res = float(float16(a)) * float(float16(b));
+ return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator/(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+ float num = __half2float(a);
+ float denom = __half2float(b);
+ return __float2half(num / denom);
+#else
+ float res = float(float16(a)) / float(float16(b));
+ return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator-(const half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hneg(a);
+#else
+ float res = -float(float16(a));
+ return half(float16(res));
+#endif
+}
+
+DEVICE inline half& operator+=(half& a, const half& b) {
+ a = a + b;
+ return a;
+}
+
+DEVICE inline half& operator-=(half& a, const half& b) {
+ a = a - b;
+ return a;
+}
+
+DEVICE inline half& operator*=(half& a, const half& b) {
+ a = a * b;
+ return a;
+}
+
+DEVICE inline half& operator/=(half& a, const half& b) {
+ a = a / b;
+ return a;
+}
+
+DEVICE inline bool operator==(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __heq(a, b);
+#else
+ return float(float16(a)) == float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator!=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hne(a, b);
+#else
+ return float(float16(a)) != float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator<(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hlt(a, b);
+#else
+ return float(float16(a)) < float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator<=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hle(a, b);
+#else
+ return float(float16(a)) <= float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator>(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hgt(a, b);
+#else
+ return float(float16(a)) > float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator>=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+ return __hge(a, b);
+#else
+ return float(float16(a)) >= float(float16(b));
+#endif
+}
+
+#endif // PADDLE_CUDA_FP16
+
+// Arithmetic operators on ARMv8.2-A CPU
+#if defined(PADDLE_WITH_NATIVE_FP16)
+HOST inline float16 operator+(const float16& a, const float16& b) {
+ float16 res;
+ asm volatile(
+ "ld1 {v0.h}[0], [%[a_ptr]]\n"
+ "ld1 {v1.h}[0], [%[b_ptr]]\n"
+ "fadd h0, h0, h1\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [b_ptr] "r"(&(b.x)),
+ [res_ptr] "r"(&(res.x))
+ : // clobbers
+ "memory", "v0", "v1");
+ return res;
+}
+
+HOST inline float16 operator-(const float16& a, const float16& b) {
+ float16 res;
+ asm volatile(
+ "ld1 {v0.h}[0], [%[a_ptr]]\n"
+ "ld1 {v1.h}[0], [%[b_ptr]]\n"
+ "fsub h0, h0, h1\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [b_ptr] "r"(&(b.x)),
+ [res_ptr] "r"(&(res.x))
+ : // clobbers
+ "memory", "v0", "v1");
+ return res;
+}
+
+HOST inline float16 operator*(const float16& a, const float16& b) {
+ float16 res;
+ asm volatile(
+ "ld1 {v0.h}[0], [%[a_ptr]]\n"
+ "ld1 {v1.h}[0], [%[b_ptr]]\n"
+ "fmul h0, h0, h1\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [b_ptr] "r"(&(b.x)),
+ [res_ptr] "r"(&(res.x))
+ : // clobbers
+ "memory", "v0", "v1");
+ return res;
+}
+
+HOST inline float16 operator/(const float16& a, const float16& b) {
+ float16 res;
+ asm volatile(
+ "ld1 {v0.h}[0], [%[a_ptr]]\n"
+ "ld1 {v1.h}[0], [%[b_ptr]]\n"
+ "fdiv h0, h0, h1\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [b_ptr] "r"(&(b.x)),
+ [res_ptr] "r"(&(res.x))
+ : // clobbers
+ "memory", "v0", "v1");
+ return res;
+}
+
+HOST inline float16 operator-(const float16& a) {
+ float16 res;
+ asm volatile(
+ "ld1 {v0.h}[0], [%[a_ptr]]\n"
+ "fneg h0, h0\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [res_ptr] "r"(&(res.x))
+ : // clobbers
+ "memory", "v0");
+ return res;
+}
+
+HOST inline float16& operator+=(float16& a, const float16& b) {
+ a = a + b;
+ return a;
+}
+
+HOST inline float16& operator-=(float16& a, const float16& b) {
+ a = a - b;
+ return a;
+}
+
+HOST inline float16& operator*=(float16& a, const float16& b) {
+ a = a * b;
+ return a;
+}
+
+HOST inline float16& operator/=(float16& a, const float16& b) {
+ a = a / b;
+ return a;
+}
+
+HOST inline bool operator==(const float16& a, const float16& b) {
+ uint16_t res;
+ asm volatile(
+ "ld1 {v0.h}[0], [%[a_ptr]]\n"
+ "ld1 {v1.h}[0], [%[b_ptr]]\n"
+ "fcmeq h0, h0, h1\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [b_ptr] "r"(&(b.x)),
+ [res_ptr] "r"(&res)
+ : // clobbers
+ "memory", "v0", "v1");
+ return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator!=(const float16& a, const float16& b) {
+ return !(a == b);
+}
+
+HOST inline bool operator<(const float16& a, const float16& b) {
+ uint16_t res;
+ asm volatile(
+ "ld1 {v1.h}[0], [%[a_ptr]]\n"
+ "ld1 {v0.h}[0], [%[b_ptr]]\n"
+ "fcmgt h0, h0, h1\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [b_ptr] "r"(&(b.x)),
+ [res_ptr] "r"(&res)
+ : // clobbers
+ "memory", "v0", "v1");
+ return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator<=(const float16& a, const float16& b) {
+ uint16_t res;
+ asm volatile(
+ "ld1 {v1.h}[0], [%[a_ptr]]\n"
+ "ld1 {v0.h}[0], [%[b_ptr]]\n"
+ "fcmge h0, h0, h1\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [b_ptr] "r"(&(b.x)),
+ [res_ptr] "r"(&res)
+ : // clobbers
+ "memory", "v0", "v1");
+ return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>(const float16& a, const float16& b) {
+ uint16_t res;
+ asm volatile(
+ "ld1 {v0.h}[0], [%[a_ptr]]\n"
+ "ld1 {v1.h}[0], [%[b_ptr]]\n"
+ "fcmgt h0, h0, h1\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [b_ptr] "r"(&(b.x)),
+ [res_ptr] "r"(&res)
+ : // clobbers
+ "memory", "v0", "v1");
+ return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>=(const float16& a, const float16& b) {
+ uint16_t res;
+ asm volatile(
+ "ld1 {v0.h}[0], [%[a_ptr]]\n"
+ "ld1 {v1.h}[0], [%[b_ptr]]\n"
+ "fcmge h0, h0, h1\n"
+ "st1 {v0.h}[0], [%[res_ptr]]\n"
+ : // outputs
+ : // inputs
+ [a_ptr] "r"(&(a.x)),
+ [b_ptr] "r"(&(b.x)),
+ [res_ptr] "r"(&res)
+ : // clobbers
+ "memory", "v0", "v1");
+ return (res & 0xffff) != 0;
+}
+
+// Arithmetic operators, software-emulated on other CPUs
+#else
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+ return float16(float(a) + float(b));
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+ return float16(float(a) - float(b));
+}
+
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+ return float16(float(a) * float(b));
+}
+
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+ return float16(float(a) / float(b));
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a) {
+ float16 res;
+ res.x = a.x ^ 0x8000;
+ return res;
+}
+
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+ a = float16(float(a) + float(b));
+ return a;
+}
+
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+ a = float16(float(a) - float(b));
+ return a;
+}
+
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+ a = float16(float(a) * float(b));
+ return a;
+}
+
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+ a = float16(float(a) / float(b));
+ return a;
+}
+
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+ return float(a) == float(b);
+}
+
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+ return float(a) != float(b);
+}
+
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+ return float(a) < float(b);
+}
+
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+ return float(a) <= float(b);
+}
+
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+ return float(a) > float(b);
+}
+
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+ return float(a) >= float(b);
+}
+#endif
+} // namespace paddle
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index d8b7f9e3fc74040189ade83049e4a1c3348e08de..dcd2a34583417993a4bf2976f7a3bc5a10d496ac 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -22,6 +22,7 @@ if(WITH_GPU)
link_paddle_test(test_Tensor)
CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
link_paddle_test(test_lazyAssign)
+ nv_test(test_float16_gpu SRCS test_float16.cu)
else()
compile_cu_as_cpp(test_Tensor.cu)
add_unittest(test_Tensor test_Tensor.cu)
@@ -33,3 +34,4 @@ add_simple_unittest(test_FPException)
add_simple_unittest(test_GpuProfiler)
add_simple_unittest(test_BaseMatrix)
add_simple_unittest(test_Matrix)
+add_simple_unittest(test_float16)
diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..74cc55aa3792f5e9f86b4f56f28dad97f35996a0
--- /dev/null
+++ b/paddle/math/tests/test_float16.cpp
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/math/float16.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(float16, conversion_cpu) {
+ // Explicit conversion from Eigen::half
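+  // Expected values are raw binary16 bit patterns: 0x3c00 == 1.0,
+  // 0x3800 == 0.5, 0x7bff == 65504 (largest finite half), 0x7c00 == +inf.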
+ EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00);
+ EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800);
+ EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555);
+ EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000);
+ EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000);
+ EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff);
+ EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00);
+
+ // Conversion from float
+ EXPECT_EQ(float16(1.0f).x, 0x3c00);
+ EXPECT_EQ(float16(0.5f).x, 0x3800);
+ EXPECT_EQ(float16(0.33333f).x, 0x3555);
+ EXPECT_EQ(float16(0.0f).x, 0x0000);
+ EXPECT_EQ(float16(-0.0f).x, 0x8000);
+ EXPECT_EQ(float16(65504.0f).x, 0x7bff);
+ EXPECT_EQ(float16(65536.0f).x, 0x7c00);
+
+ // Conversion from double
+ EXPECT_EQ(float16(1.0).x, 0x3c00);
+ EXPECT_EQ(float16(0.5).x, 0x3800);
+ EXPECT_EQ(float16(0.33333).x, 0x3555);
+ EXPECT_EQ(float16(0.0).x, 0x0000);
+ EXPECT_EQ(float16(-0.0).x, 0x8000);
+ EXPECT_EQ(float16(65504.0).x, 0x7bff);
+ EXPECT_EQ(float16(65536.0).x, 0x7c00);
+
+ // Conversion from int
+ EXPECT_EQ(float16(-1).x, 0xbc00);
+ EXPECT_EQ(float16(0).x, 0x0000);
+ EXPECT_EQ(float16(1).x, 0x3c00);
+ EXPECT_EQ(float16(2).x, 0x4000);
+ EXPECT_EQ(float16(3).x, 0x4200);
+
+ // Conversion from bool
+ EXPECT_EQ(float16(true).x, 0x3c00);
+ EXPECT_EQ(float16(false).x, 0x0000);
+
+ // Default constructor
+ float16 v_def;
+ EXPECT_EQ(v_def.x, 0x0000);
+
+ // Assignment operator
+ float16 v_assign;
+ v_assign = v_def;
+ EXPECT_EQ(v_assign.x, 0x0000);
+ v_assign = Eigen::half(1.0f);
+ EXPECT_EQ(v_assign.x, 0x3c00);
+ v_assign = 0.5f;
+ EXPECT_EQ(v_assign.x, 0x3800);
+ v_assign = 0.33333;
+ EXPECT_EQ(v_assign.x, 0x3555);
+ v_assign = -1;
+ EXPECT_EQ(v_assign.x, 0xbc00);
+ v_assign = true;
+ EXPECT_EQ(v_assign.x, 0x3c00);
+
+ // Conversion operator
+ EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
+ EXPECT_EQ(float(float16(0.5f)), 0.5f);
+ EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001);
+ EXPECT_EQ(int(float16(-1)), -1);
+ EXPECT_EQ(bool(float16(true)), true);
+}
+
+TEST(float16, arithmetic_cpu) {
+ EXPECT_EQ(float(float16(1) + float16(1)), 2);
+ EXPECT_EQ(float(float16(5) + float16(-5)), 0);
+ EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001);
+ EXPECT_EQ(float(float16(3) - float16(5)), -2);
+ EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001);
+ EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
+ EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
+ EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001);
+ EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f);
+ EXPECT_EQ(float(-float16(512.0f)), -512.0f);
+ EXPECT_EQ(float(-float16(-512.0f)), 512.0f);
+}
+
+TEST(float16, comparison_cpu) {
+ EXPECT_TRUE(float16(1.0f) == float16(1.0f));
+ EXPECT_FALSE(float16(-1.0f) == float16(-0.5f));
+ EXPECT_TRUE(float16(1.0f) != float16(0.5f));
+ EXPECT_FALSE(float16(-1.0f) != float16(-1.0f));
+ EXPECT_TRUE(float16(1.0f) < float16(2.0f));
+ EXPECT_FALSE(float16(-1.0f) < float16(-1.0f));
+ EXPECT_TRUE(float16(1.0f) <= float16(1.0f));
+ EXPECT_TRUE(float16(2.0f) > float16(1.0f));
+ EXPECT_FALSE(float16(-2.0f) > float16(-2.0f));
+ EXPECT_TRUE(float16(2.0f) >= float16(2.0f));
+
+ EXPECT_TRUE(float16(0.0f) == float16(-0.0f));
+ EXPECT_TRUE(float16(0.0f) <= float16(-0.0f));
+ EXPECT_TRUE(float16(0.0f) >= float16(-0.0f));
+ EXPECT_FALSE(float16(0.0f) < float16(-0.0f));
+ EXPECT_FALSE(float16(-0.0f) < float16(0.0f));
+ EXPECT_FALSE(float16(0.0f) > float16(-0.0f));
+ EXPECT_FALSE(float16(-0.0f) > float16(0.0f));
+}
+
+} // namespace paddle
diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4b520feaaf552302a969d8caee8aa28cc143304b
--- /dev/null
+++ b/paddle/math/tests/test_float16.cu
@@ -0,0 +1,213 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/math/float16.h"
+
+#include <gtest/gtest.h>
+
+#include "paddle/utils/Logging.h"
+
+#define ARITHMETIC_KERNEL(op_type, sign) \
+ __global__ void op_type(const half* in1, const half* in2, half* out) { \
+ out[0] = in1[0] sign in2[0]; \
+ }
+
+#define COMPOUND_KERNEL(op_type, sign) \
+ __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
+
+#define COMPARISON_KERNEL(op_type, sign) \
+ __global__ void op_type(const half* in1, const half* in2, bool* out) { \
+ out[0] = in1[0] sign in2[0]; \
+ }
+
+#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
+ void Test##op_type(float v_in1, float v_in2, float v_out) { \
+ LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+ half *in1, *in2, *out; \
+ half *d_in1, *d_in2, *d_out; \
+ int size = sizeof(half); \
+ cudaMalloc((void**)&d_in1, size); \
+ cudaMalloc((void**)&d_in2, size); \
+ cudaMalloc((void**)&d_out, size); \
+ in1 = (half*)malloc(size); \
+ in2 = (half*)malloc(size); \
+ out = (half*)malloc(size); \
+ in1[0] = half(float16(v_in1)); \
+ in2[0] = half(float16(v_in2)); \
+ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
+ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
+ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
+ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
+ EXPECT_EQ(float(float16(out[0])), v_out); \
+ free(in1); \
+ free(in2); \
+ free(out); \
+ cudaFree(d_in1); \
+ cudaFree(d_in2); \
+ cudaFree(d_out); \
+ }
+
+#define COMPOUND_KERNEL_LAUNCH(op_type) \
+ void Test##op_type(float v_in1, float v_in2, float v_out) { \
+ LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+ half *in1, *in2; \
+ half *d_in1, *d_in2; \
+ int size = sizeof(half); \
+ cudaMalloc((void**)&d_in1, size); \
+ cudaMalloc((void**)&d_in2, size); \
+ in1 = (half*)malloc(size); \
+ in2 = (half*)malloc(size); \
+ in1[0] = half(float16(v_in1)); \
+ in2[0] = half(float16(v_in2)); \
+ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
+ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
+ op_type<<<1, 1>>>(d_in1, d_in2); \
+ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
+ EXPECT_EQ(float(float16(in1[0])), v_out); \
+ free(in1); \
+ free(in2); \
+ cudaFree(d_in1); \
+ cudaFree(d_in2); \
+ }
+
+#define COMPARISON_KERNEL_LAUNCH(op_type) \
+ void Test##op_type(float v_in1, float v_in2, bool v_out) { \
+ LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+ half *in1, *in2; \
+ half *d_in1, *d_in2; \
+ bool *out, *d_out; \
+ int size = sizeof(half); \
+ cudaMalloc((void**)&d_in1, size); \
+ cudaMalloc((void**)&d_in2, size); \
+ cudaMalloc((void**)&d_out, 1); \
+ in1 = (half*)malloc(size); \
+ in2 = (half*)malloc(size); \
+ out = (bool*)malloc(1); \
+ in1[0] = half(float16(v_in1)); \
+ in2[0] = half(float16(v_in2)); \
+ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
+ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
+ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
+ cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost); \
+ EXPECT_EQ(out[0], v_out); \
+ free(in1); \
+ free(in2); \
+ free(out); \
+ cudaFree(d_in1); \
+ cudaFree(d_in2); \
+ cudaFree(d_out); \
+ }
+
+#ifdef PADDLE_CUDA_FP16
+namespace paddle {
+
+#if CUDA_VERSION < 9000
+ARITHMETIC_KERNEL(Add, +)
+ARITHMETIC_KERNEL(Sub, -)
+ARITHMETIC_KERNEL(Mul, *)
+ARITHMETIC_KERNEL(Div, /)
+
+ARITHMETIC_KERNEL_LAUNCH(Add)
+ARITHMETIC_KERNEL_LAUNCH(Sub)
+ARITHMETIC_KERNEL_LAUNCH(Mul)
+ARITHMETIC_KERNEL_LAUNCH(Div)
+
+// Negative sign kernel
+__global__ void Neg(half* in) { in[0] = -in[0]; }
+
+void TestNeg(float v_in, float v_out) {
+ LOG(INFO) << "Test Neg on GPU!";
+ half *in, *d_in;
+ int size = sizeof(half);
+ cudaMalloc((void**)&d_in, size);
+ in = (half*)malloc(size);
+ in[0] = half(float16(v_in));
+ cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
+ Neg<<<1, 1>>>(d_in);
+ cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
+ EXPECT_EQ(float(float16(in[0])), v_out);
+ free(in);
+ cudaFree(d_in);
+}
+
+COMPOUND_KERNEL(AddAssign, +=)
+COMPOUND_KERNEL(SubAssign, -=)
+COMPOUND_KERNEL(MulAssign, *=)
+COMPOUND_KERNEL(DivAssign, /=)
+
+COMPOUND_KERNEL_LAUNCH(AddAssign)
+COMPOUND_KERNEL_LAUNCH(SubAssign)
+COMPOUND_KERNEL_LAUNCH(MulAssign)
+COMPOUND_KERNEL_LAUNCH(DivAssign)
+
+COMPARISON_KERNEL(Equal, ==)
+COMPARISON_KERNEL(NotEqual, !=)
+COMPARISON_KERNEL(Less, <)
+COMPARISON_KERNEL(LessEqual, <=)
+COMPARISON_KERNEL(Greater, >)
+COMPARISON_KERNEL(GreaterEqual, >=)
+
+COMPARISON_KERNEL_LAUNCH(Equal)
+COMPARISON_KERNEL_LAUNCH(NotEqual)
+COMPARISON_KERNEL_LAUNCH(Less)
+COMPARISON_KERNEL_LAUNCH(LessEqual)
+COMPARISON_KERNEL_LAUNCH(Greater)
+COMPARISON_KERNEL_LAUNCH(GreaterEqual)
+
+TEST(float16, arithmetic_on_gpu) {
+ TestAdd(1, 2, 3);
+ TestSub(2, 1, 1);
+ TestMul(2, 3, 6);
+ TestDiv(6, 2, 3);
+ TestNeg(1, -1);
+}
+
+TEST(float16, compound_on_gpu) {
+ TestAddAssign(1, 2, 3);
+ TestSubAssign(2, 1, 1);
+ TestMulAssign(2, 3, 6);
+ TestDivAssign(6, 2, 3);
+}
+
+TEST(float16, comparision_on_gpu) {
+ TestEqual(1, 1, true);
+ TestEqual(1, 2, false);
+ TestNotEqual(2, 3, true);
+ TestNotEqual(2, 2, false);
+ TestLess(3, 4, true);
+ TestLess(3, 3, false);
+ TestLessEqual(3, 3, true);
+ TestLessEqual(3, 2, false);
+ TestGreater(4, 3, true);
+ TestGreater(4, 4, false);
+ TestGreaterEqual(4, 4, true);
+ TestGreaterEqual(4, 5, false);
+}
+#endif // CUDA_VERSION
+
+TEST(float16, conversion_on_gpu) {
+ // Explicit conversion to and from cuda half
+ EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00);
+ EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800);
+ EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555);
+ EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000);
+ EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000);
+ EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
+ EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
+
+ // Assignment operator
+ float16 v_assign;
+ v_assign = half(float16(1.0f));
+ EXPECT_EQ(v_assign.x, 0x3c00);
+}
+
+} // namespace paddle
+#endif // PADDLE_CUDA_FP16
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 7e5a1db44a5302e3b4e5d2768755824666e880ba..afb8d9d599b15a0b6d19b7ecca5e91b623695dea 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -244,7 +244,7 @@ TEST(Matrix, unary) {
LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK"
<< "support so we cannot test matrix inverse. To test "
<< "matrix inverse, please install LAPACKE "
- << "and MKL/Openblas/ATLAS, and re-build PaddlePaddle.";
+ << "and MKL/Openblas, and re-build PaddlePaddle.";
#endif
}
}
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 6b4e46f56a0c9c9836c5b353ec9c554454ab0491..6a815a1b57db1d833781ca224f34e4559af9b9a5 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -43,7 +43,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
void* p;
-#ifdef PADDLE_USE_MKLDNN
+#ifdef PADDLE_WITH_MKLDNN
// refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
// memory alignment
PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
@@ -83,7 +83,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
paddle::platform::GpuMemoryUsage(available, capacity);
// Reserve memory for page tables, etc.
- size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
+ size_t reserving = 0.05 * capacity + paddle::platform::GpuMinChunkSize();
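+  // i.e. hold back 5% of total device memory plus one minimum chunk instead
+  // of deriving the reservation from GpuMaxAllocSize().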
size_t usable = available > reserving ? available - reserving : 0;
// If remaining size no less than expected size, using general
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 95cfe2525e3e7c128d8652c5c6a0bb3d80a475b9..9cafdfda75d0511227ef648d50a8635320a81d32 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -64,19 +64,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
int gpu_num = platform::GetCUDADeviceCount();
as = new BuddyAllocator*[gpu_num];
for (int gpu = 0; gpu < gpu_num; gpu++) {
- platform::SetDeviceId(gpu);
- as[gpu] = new BuddyAllocator(new detail::GPUAllocator,
- platform::GpuMinChunkSize(),
- platform::GpuMaxChunkSize());
+ as[gpu] = nullptr;
}
+ }
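+  // The BuddyAllocator for a device is now created lazily, on the first
+  // allocation request for that device, rather than eagerly for every GPU.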
+ platform::SetDeviceId(gpu_id);
+ if (!as[gpu_id]) {
+ as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator,
+ platform::GpuMinChunkSize(),
+ platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
- << "You can set environment variable '"
- << platform::kEnvFractionGpuMemoryToUse
+ << "You can set GFlags environment variable '"
+ << "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
- platform::SetDeviceId(gpu_id);
return as[gpu_id];
}
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 38b89b9eb108d73c3374360a81c6ed28502bfdc5..5aaaf993323c2d4dbef688d0977ec6374fde6512 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -138,7 +138,7 @@ function(op_library TARGET)
if ("${TARGET}" STREQUAL "nccl_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
+ file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
endif()
# reduce_op contains several operators
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index 2785a8c6fb62527db4d203788be88ebead068a19..76da21c4726a1245241c1cf61860f9c8b62ea452 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
- ctx.device_context());
+ ctx.GetPlace());
}
};
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index d2dcab4e548b99c6beecfaa570ac31804fd07d82..539a93530206c93a37791a9ccb2fb104af17f940 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
// FIXME(typhoonzero): types of T is for inference data.
// label data is always int64
-REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
-                       paddle::operators::AccuracyOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(accuracy,
+                        paddle::operators::AccuracyOpCUDAKernel<float>,
+                        paddle::operators::AccuracyOpCUDAKernel<double>);
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
index d060e6edddb31ecc1a4d27836f80b8ac5fa7d36d..04104a695fac6a967ad94780e31ba3fdd2ca2eda 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -21,7 +21,7 @@ namespace operators {
using Tensor = framework::Tensor;
-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
 class AccuracyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index 154c618e8e7c4650b7f22684d3357de9c52a416c..63490f0ec9f4852a3ead574b9d52c807d8ba6d89 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -44,9 +44,9 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of Sigmoid operator");
AddOutput("Y", "Output of Sigmoid operator");
AddComment(R"DOC(
-Sigmoid Activation Operator.
+Sigmoid Activation Operator
-$y = 1 / (1 + e^{-x})$
+$$y = \frac{1}{1 + e^{-x}}$$
)DOC");
}
@@ -60,9 +60,9 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of LogSigmoid operator");
AddOutput("Y", "Output of LogSigmoid operator");
AddComment(R"DOC(
-Logsigmoid Activation Operator.
+Logsigmoid Activation Operator
-$y = \log(1 / (1 + e^{-x}))$
+$$y = \log \frac{1}{1 + e^{-x}}$$
)DOC");
}
@@ -506,6 +506,22 @@ It is recommended to use the defaults for this activation.
}
};
+class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ SwishOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("X", "Input of Swish operator");
+ AddOutput("Y", "Output of Swish operator");
+    AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);
+ AddComment(R"DOC(
+Swish Activation Operator.
+
+$$y = \frac{x}{1 + e^{- \beta x}}$$
+
+)DOC");
+ }
+};
+
} // namespace operators
} // namespace paddle
@@ -592,16 +608,20 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker,
REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
hard_sigmoid_grad, ops::ActivationOpGrad);
-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)         \
-  REGISTER_OP_CPU_KERNEL(                                                       \
-      act_type,                                                                 \
-      ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>,   \
-      ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<double>>); \
-  REGISTER_OP_CPU_KERNEL(                                                       \
-      act_type##_grad,                                                          \
-      ops::ActivationGradKernel<paddle::platform::CPUPlace,                     \
-                                ops::grad_functor<float>>,                      \
-      ops::ActivationGradKernel<paddle::platform::CPUPlace,                     \
-                                ops::grad_functor<double>>);
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)  \
+  REGISTER_OP_CPU_KERNEL(                                                \
+      act_type,                                                          \
+      ops::ActivationKernel<paddle::platform::CPUDeviceContext,          \
+                            ops::functor<float>>,                        \
+      ops::ActivationKernel<paddle::platform::CPUDeviceContext,          \
+                            ops::functor<double>>);                      \
+  REGISTER_OP_CPU_KERNEL(                                                \
+      act_type##_grad,                                                   \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,      \
+                                ops::grad_functor<float>>,               \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,      \
+                                ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu
index 97737857ab25dfa92163b64a750fd7a7d9ea0ac3..856d3fc35dafe6b22c25c55dfda2dc4973072615 100644
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -17,16 +17,17 @@
namespace ops = paddle::operators;
-#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor)         \
-  REGISTER_OP_GPU_KERNEL(                                                       \
-      act_type,                                                                 \
-      ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>,   \
-      ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<double>>); \
-  REGISTER_OP_GPU_KERNEL(                                                       \
-      act_type##_grad,                                                          \
-      ops::ActivationGradKernel<paddle::platform::GPUPlace,                     \
-                                ops::grad_functor<float>>,                      \
-      ops::ActivationGradKernel<paddle::platform::GPUPlace,                     \
-                                ops::grad_functor<double>>);
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \
+  REGISTER_OP_CUDA_KERNEL(                                               \
+      act_type,                                                          \
+      ops::ActivationKernel<paddle::platform::CUDADeviceContext,         \
+                            ops::functor<float>>,                        \
+      ops::ActivationKernel<paddle::platform::CUDADeviceContext,         \
+                            ops::functor<double>>);                      \
+  REGISTER_OP_CUDA_KERNEL(                                               \
+      act_type##_grad,                                                   \
+      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,     \
+                                ops::grad_functor<float>>,               \
+      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,     \
+                                ops::grad_functor<double>>);
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 8cd3bfbbd3f8f3210f94aef3a1586c8295730c1d..75eefca8b8c7ba8831a2f90c83718d00b83fba30 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -19,7 +19,7 @@
namespace paddle {
namespace operators {
-template <typename Place, typename T, typename Functor>
+template <typename DeviceContext, typename T, typename Functor>
 class ActivationKernel
     : public framework::OpKernel<T> {
public:
@@ -32,18 +32,19 @@ class ActivationKernel
auto x = framework::EigenVector::Flatten(*X);
auto y = framework::EigenVector::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
}
- functor(place, x, y);
+ functor(*place, x, y);
}
};
-template <typename Place, typename T, typename Functor>
+template <typename DeviceContext, typename T, typename Functor>
 class ActivationGradKernel
     : public framework::OpKernel<T> {
public:
@@ -59,13 +60,14 @@ class ActivationGradKernel
auto x = framework::EigenVector::Flatten(*X);
auto y = framework::EigenVector::Flatten(*Y);
auto dx = framework::EigenVector::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
}
- functor(place, x, y, dy, dx);
+ functor(*place, x, y, dy, dx);
}
};
@@ -700,6 +702,35 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor {
}
};
+template <typename T>
+struct SwishFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};
+  }
+
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x / (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+ }
+};
+
+template <typename T>
+struct SwishGradFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};
+  }
+
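+  // For y = x * sigmoid(beta * x):
+  //   dy/dx = beta * y + sigmoid(beta * x) * (1 - beta * y)
+  // temp1 below is sigmoid(beta * x).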
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    auto temp1 = static_cast<T>(1) /
+                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+    auto temp2 = temp1 * (static_cast<T>(1) - (beta * y));
+ dx.device(d) = dy * ((beta * y) + temp2);
+ }
+};
+
} // namespace operators
} // namespace paddle
@@ -730,4 +761,5 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor {
__macro(elu, ELUFunctor, ELUGradFunctor); \
__macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \
__macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \
+ __macro(swish, SwishFunctor, SwishGradFunctor); \
__macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor);
diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
index 16a7794d5b7bf1d56cd9f5874454c41cab43b41f..507811e7b59b9426c599570ead9b42f8d02380fd 100644
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -92,12 +92,12 @@ for gradient descent.
Adadelta updates are as follows:
-$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break
-paramUpdate = - $\sqrt{((avgSquaredUpdate + \epsilon) /
- (avgSquaredGrad_out + \epsilon))}$ * grad \break
-avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) *
- {(paramUpdate)}^2 \break
-paramOut = param + paramUpdate$$
+$$
+avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\
+param\_update = - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\
+avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\
+param\_out = param + param\_update
+$$
)DOC");
}
@@ -109,5 +109,5 @@ paramOut = param + paramUpdate$$
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
REGISTER_OP_CPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
-    ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu
index 9fb61852071f11670b8bc51321bb0881de196777..eee2d0a2f55f877bc5c87c72bca07bfd9485e517 100644
--- a/paddle/operators/adadelta_op.cu
+++ b/paddle/operators/adadelta_op.cu
@@ -16,6 +16,6 @@
#include "paddle/operators/adadelta_op.h"
namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
-    ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>