diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9385943da92bc8c44ca75b267a768ba8ea22bd8b..90c25e435083d78ad4c123999a588aaf9092f719 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,18 +7,14 @@
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
- sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
+ sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
- id: end-of-file-fixer
-# TODO(yuyang): trailing whitespace has some bugs on markdown
-# files now, please not add it to pre-commit hook now
-# - id: trailing-whitespace
-#
-# TODO(yuyang): debug-statements not fit for Paddle, because
-# not all of our python code is runnable. Some are used for
-# documenation
-# - id: debug-statements
+- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
+ sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+ hooks:
+ - id: clang-formater
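
For context, a minimal sketch of how these hooks would typically be exercised locally, assuming the standard `pre-commit` tool from PyPI; the commands below are illustrative and are not part of this patch:

```bash
# Install the pre-commit framework and register the hooks from
# .pre-commit-config.yaml as a git pre-commit hook.
pip install pre-commit
pre-commit install

# Run all configured hooks (yapf, the basic checks, clang-format)
# against every file in the repository.
pre-commit run --all-files
```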
diff --git a/README.md b/README.md
index bd47ed44bc808196b0e6598f28d72620422f3e1a..8a8e15841586ae6a01bb93e94f6074189f556f5a 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,11 @@
# PaddlePaddle
-[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/baidu/Paddle)
+[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html)
-[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
-[![Release](https://img.shields.io/github/release/baidu/Paddle.svg?colorB=fedcba)](https://github.com/baidu/Paddle/releases)
+[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
+[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -17,7 +17,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep
learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
+Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
## Features
@@ -92,7 +92,7 @@ Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://padd
## Ask Questions
-You are welcome to submit questions and bug reports as [Github Issues](https://github.com/baidu/paddle/issues).
+You are welcome to submit questions and bug reports as [GitHub Issues](https://github.com/PaddlePaddle/Paddle/issues).
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md
index b8f26f431eb7a04147fe791a8c805427c827fe09..e44fa0d38e9982e5d0ed159743994ce6acc51246 100644
--- a/doc/build/build_from_source.md
+++ b/doc/build/build_from_source.md
@@ -6,10 +6,10 @@ Installing from Sources
* [3. Build on Ubuntu](#ubuntu)
## Download and Setup
-You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle).
+You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
```bash
-git clone https://github.com/baidu/Paddle paddle
+git clone https://github.com/PaddlePaddle/Paddle paddle
cd paddle
```
diff --git a/doc_cn/build_and_install/cmake/cblas_settings.csv b/doc_cn/build_and_install/cmake/cblas_settings.csv
index d804c0a662cb652dbefb0d09fb18538308c20aec..a6356baf16a0d3d2499e39d2055d8ee878dcaef2 100644
--- a/doc_cn/build_and_install/cmake/cblas_settings.csv
+++ b/doc_cn/build_and_install/cmake/cblas_settings.csv
@@ -1,4 +1,5 @@
-MKL_ROOT,mkl的路径,在${MKL_ROOT}/include下需要包含mkl.h,在${MKL_ROOT}/lib目录下需要包含 mkl_core,mkl_sequential和mkl_intel_lp64三个库
-ATLAS_ROOT,ATLAS库的路径,在${ATLAS_ROOT}/include下需要包含cblas.h,而在${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库
-OPENBLAS_ROOT,在${OPENBLAS_ROOT}/include下需要包含cblas.h,而在${OPENBLAS_ROOT}/lib下需要包含openblas库
-REFERENCE_CBLAS_ROOT,在${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,在${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库
\ No newline at end of file
+编译选项,描述,注意
+MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。
+ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。
+OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。
+REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.csv b/doc_cn/build_and_install/cmake/compile_options.csv
index 0b8015aaee4d7b9068cb4a8de5d9967569e37f0c..12b45eebb2822d77447fa1bc754360605971dcab 100644
--- a/doc_cn/build_and_install/cmake/compile_options.csv
+++ b/doc_cn/build_and_install/cmake/compile_options.csv
@@ -1,15 +1,14 @@
-选项,说明,默认值
-WITH_GPU,是否编译GPU支持。,是否寻找到cuda工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否使用运行时动态加载cuda动态库,而非静态加载cuda动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制,是
-WITH_PYTHON,是否内嵌python解释器。可以方便嵌入式工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA支持,否
-WITH_GLOG,是否使用GLOG,如果不使用则会使用一个简化版的日志实现。可以方便嵌入式工作。,取决于是否寻找到GLOG
-WITH_GFLAGS,是否使用GFLAGS,如果不使用则会使用一个简化版的命令行参数解析。可以方便嵌入式工作。,取决于是否寻找到GFLAGS
-WITH_TIMER,是否开启计时功能开启计时功能会导致运行略慢,打印的日志变多。但是方便调试和benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到gtest
-WITH_DOC,是否编译英文文档,否
-WITH_DOC_CN,是否编译中文文档,否
-WITH_SWIG_PY,是否编译python的swig接口,python的swig接口可以方便进行预测和定制化训练,取决于是否找到swig
+选项,说明,默认值
+WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
+WITH_DOUBLE,是否使用双精度浮点数。,否
+WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是
+WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
+WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
+WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
+WITH_RDMA,是否开启RDMA,否
+WITH_GLOG,是否开启GLOG。如果不开启,则会使用一个简化版的日志,同时方便今后的嵌入式移植工作。,取决于是否寻找到GLOG
+WITH_GFLAGS,是否使用GFLAGS。如果不开启,则会使用一个简化版的命令行参数解析器,同时方便今后的嵌入式移植工作。,取决于是否寻找到GFLAGS
+WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测试Benchmark,否
+WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
+WITH_DOC,是否编译中英文文档,否
+WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.rst b/doc_cn/build_and_install/cmake/compile_options.rst
index bb5b18a073803662774cb6b7bcbdbafe3ad51112..f345ead2bf851bdad7be2fb8185d16fd2a318a66 100644
--- a/doc_cn/build_and_install/cmake/compile_options.rst
+++ b/doc_cn/build_and_install/cmake/compile_options.rst
@@ -1,62 +1,43 @@
-设置PaddlePaddle的编译选项
-==========================
-
-PaddlePaddle的编译选项可以在调用cmake的时候设置。cmake是一个跨平台的编译脚本,调用
-cmake可以将cmake项目文件,生成各个平台的makefile。详细的cmake使用方法可以参考
-`cmake的官方文档 `_ 。
-
-PaddlePaddle的编译选项是可以控制PaddlePaddle生成CPU/GPU版本二进制,链接何种blas等等。所有的
-编译选项列表如下
-
-PaddlePaddle的编译选项
-----------------------
-
-bool型的编译选项
-++++++++++++++++
-设置下列编译选项时,可以在cmake的命令行设置。使用 -D命令即可。例如
-:code:`cmake -D WITH_GPU=OFF`
-
-.. csv-table:: PaddlePaddle的bool型编译选项
- :widths: 1, 7, 2
- :file: compile_options.csv
-
-blas相关的编译选项
-++++++++++++++++++
-
-PaddlePaddle可以使用 `MKL `_ ,
-`Atlas `_ ,
-`OpenBlas `_ 和
-`refference Blas `_ ,任意一种cblas实现。
-通过编译时指定路径来实现引用各种blas。
-
-cmake编译时会首先在系统路径(/usr/lib\:/usr/local/lib)中寻找这些blas的实现。同时
-也会读取相关路径变量来进行搜索。路径变量为\:
-
-
-.. csv-table:: PaddlePaddle的cblas编译选项
- :widths: 1, 9
- :header: "编译选项", "描述"
- :file: cblas_settings.csv
-
-这些变量均可以使用 -D命令指定。例如 :code:`cmake -D MKL_ROOT=/opt/mkl/`。这些变
-量也可以通过调用cmake命令前通过环境变量指定。例如
-
-.. code-block:: bash
-
- export MKL_ROOT=/opt/mkl
- cmake
-
-需要注意的是,这些变量只在第一次cmake的时候有效。如果在第一次cmake之后想要重新设
-置这些变量,推荐清理( :code:`rm -rf` )掉编译目录后,再指定。
-
-cuda/cudnn相关的编译选项
-++++++++++++++++++++++++
-
-PaddlePaddle可以使用 cudnn v2之后的任何一个cudnn版本来编译运行。但需要注意的是编译和
-运行使用的cudnn尽量是同一个版本。推荐使用最新版本的cudnn v5.1。
-
-在cmake配置时可以使用 :code:`CUDNN_ROOT` 来配置CUDNN的安装路径。使用的命令也是
--D,例如 :code:`cmake -D CUDNN_ROOT=/opt/cudnnv5` 。
-
-需要注意的是,这些变量只在第一次cmake的时候有效。如果在第一次cmake之后想要重新设
-置这些变量,推荐清理( :code:`rm -rf` )掉编译目录后,再指定。
+PaddlePaddle的编译选项
+======================
+
+PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 `_ 。
+
+Bool型的编译选项
+----------------
+用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如
+
+.. code-block:: bash
+
+ cmake .. -DWITH_GPU=OFF
+
+.. csv-table:: Bool型的编译选项
+ :widths: 1, 7, 2
+ :file: compile_options.csv
+
+BLAS/CUDA/Cudnn的编译选项
+--------------------------
+BLAS
++++++
+
+PaddlePaddle支持以下任意一种BLAS库:`MKL `_ ,`ATLAS `_ ,`OpenBLAS `_ 和 `REFERENCE BLAS `_ 。
+
+.. csv-table:: BLAS路径相关的编译选项
+ :widths: 1, 2, 7
+ :file: cblas_settings.csv
+
+CUDA/Cudnn
++++++++++++
+
+PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但请尽量保持编译和运行使用的cudnn是同一个版本。我们推荐使用最新版本的cudnn v5.1。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。通过使用 ``-D`` 命令可以设置,例如
+
+.. code-block:: bash
+
+ cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
+
+注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。
\ No newline at end of file
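
To illustrate how the options documented above combine, a hypothetical out-of-source configuration might look like the following; the install paths are placeholders chosen for illustration, not values prescribed by this patch:

.. code-block:: bash

    # Out-of-source build: enable GPU support and point cmake at an
    # MKL installation and a cuDNN installation (placeholder paths).
    mkdir -p build && cd build
    cmake .. -DWITH_GPU=ON -DMKL_ROOT=/opt/mkl -DCUDNN_ROOT=/opt/cudnnv5

    # Build with all available cores.
    make -j$(nproc)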
diff --git a/doc_cn/howto/how_to_write_docs/index.rst b/doc_cn/howto/how_to_write_docs/index.rst
index 869ef747f9f88c7dbb5efdf6e03111a3f76c4014..a1f983b3405fa40f436885e40fca2ebbb4695491 100644
--- a/doc_cn/howto/how_to_write_docs/index.rst
+++ b/doc_cn/howto/how_to_write_docs/index.rst
@@ -2,32 +2,19 @@
如何贡献/修改PaddlePaddle的文档
###############################
-PaddlePaddle的文档使用 `cmake`_ 驱动 `sphinx`_ 生成。公有两个文档,:code:`doc` 和 :code:`doc_cn` 。这两者会在 `cmake`_ 中进行编译,生成后的文档会存储在服务器的 :code:`doc` 和 :code:`doc_cn` 两个目录下。
+PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
-下面分几个部分介绍一下PaddlePaddle文档的贡献方法。
-
-如何书写PaddlePaddle的文档
-==========================
-
-TBD
如何构建PaddlePaddle的文档
==========================
-构建PaddlePaddle文档,需要使用构建Paddle的全部环境。准备这个环境相对来说比较复杂,所以本文档提供两种方式构建PaddlePaddle的文档,即
-
-* 使用Docker构建PaddlePaddle的文档
-* 直接构建PaddlePaddle的文档。
-
-并且,我们推荐使用Docker来构建PaddlePaddle的文档。
+PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。构建PaddlePaddle文档需要准备的环境相对较复杂,所以我们推荐基于Docker来构建PaddlePaddle的文档。
使用Docker构建PaddlePaddle的文档
--------------------------------
-使用Docker构建PaddlePaddle的文档,首先要求在系统里安装好Docker工具包。安装Docker请参考 `Docker的官网 `_ 。
-
-安装好Docker之后可以使用源码目录下的脚本构建文档,即
+使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即
.. code-block:: bash
@@ -35,10 +22,10 @@ TBD
cd paddle/scripts/tools/build_docs
bash build_docs.sh
-执行完这个脚本后,该目录下会生成两个目录,分别是\:
+编译完成后,该目录下会生成如下两个子目录\:
-* doc 目录,英文文档地址
-* doc_cn 目录,中文文档地址
+* doc 英文文档目录
+* doc_cn 中文文档目录
打开浏览器访问对应目录下的index.html即可访问本地文档。
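
As an aside, once ``build_docs.sh`` has generated the two directories, the HTML can also be previewed through any static file server; a generic example, not part of this patch:

.. code-block:: bash

    # Serve the generated Chinese documentation at http://localhost:8000
    cd doc_cn
    python -m SimpleHTTPServer 8000     # Python 2
    # python3 -m http.server 8000       # Python 3 equivalent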
@@ -52,6 +39,10 @@ TBD
TBD
+如何书写PaddlePaddle的文档
+==========================
+
+TBD
如何更新www.paddlepaddle.org文档
================================
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 6f51d551200696ebafade2a46243b78086975265..b539374cd4aa5a9510cdb728c1b22edf65a9f880 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -112,7 +111,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx,
}
void Arguments::setSlotSubSequenceStartPositions(
- size_t idx, IVector *vec) throw(RangeError) {
+ size_t idx, IVector* vec) throw(RangeError) {
auto& a = m->getArg(idx);
auto& v = m->cast(vec->getSharedPtr());
a.subSequenceStartPositions = std::make_shared(v);
diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp
index 25d94f5a6a1255f3e2faff9816cfd003b20c0418..bc40d871d180a6bfe21200c866181dc161f5f078 100644
--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/trainer/Trainer.h"
@@ -44,8 +43,7 @@ TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
return retv;
}
-TrainerConfig* TrainerConfig::createFromProtoString(
- const std::string& str) {
+TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
auto retv = new TrainerConfig();
paddle::TrainerConfig trainerConfigProto;
auto conf = std::make_shared(trainerConfigProto);
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index bef499c67858b8e2d5432155a8defca56af6019c..9a4846d80980e23e97f89b6134e15af71207ae6b 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -27,7 +26,8 @@ GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
GradientMachine::~GradientMachine() { delete m; }
GradientMachine* GradientMachine::createFromPaddleModelPtr(
- const void* confPtr, GradientMatchineCreateMode mode,
+ const void* confPtr,
+ GradientMatchineCreateMode mode,
const std::vector& types) {
auto& conf = *(const paddle::ModelConfig*)(confPtr);
std::vector realTypes;
@@ -44,7 +44,8 @@ GradientMachine* GradientMachine::createFromPaddleModelPtr(
}
GradientMachine* GradientMachine::createByConfigProtoStr(
- const std::string& protoStr, GradientMatchineCreateMode mode,
+ const std::string& protoStr,
+ GradientMatchineCreateMode mode,
const std::vector& types) {
paddle::ModelConfig conf;
conf.ParseFromString(protoStr);
@@ -56,13 +57,15 @@ GradientMachine* GradientMachine::createByConfigProtoStr(
}
GradientMachine* GradientMachine::createByModelConfig(
- ModelConfig* conf, GradientMatchineCreateMode mode,
+ ModelConfig* conf,
+ GradientMatchineCreateMode mode,
const std::vector& types) {
auto confPtr = &conf->m->conf->getModelConfig();
return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
}
-void GradientMachine::forward(const Arguments& inArgs, Arguments* outArgs,
+void GradientMachine::forward(const Arguments& inArgs,
+ Arguments* outArgs,
PassType passType) {
auto& in =
m->cast>(inArgs.getInternalArgumentsPtr());
@@ -99,7 +102,8 @@ void GradientMachine::backward(const UpdateCallback& callback) {
}
void GradientMachine::forwardBackward(const Arguments& inArgs,
- Arguments* outArgs, PassType passType,
+ Arguments* outArgs,
+ PassType passType,
const UpdateCallback& callback) {
auto& in =
m->cast>(inArgs.getInternalArgumentsPtr());
@@ -129,7 +133,7 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
void GradientMachine::randParameters() { m->machine->randParameters(); }
Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
- throw(UnsupportError) {
+ throw(UnsupportError) {
auto nn = std::dynamic_pointer_cast(m->machine);
if (nn) {
auto mat = nn->getLayerOutput(layerName);
@@ -140,8 +144,11 @@ Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
}
SequenceGenerator* GradientMachine::asSequenceGenerator(
- const std::vector& dict, size_t begin_id, size_t end_id,
- size_t max_length, size_t beam_size) {
+ const std::vector& dict,
+ size_t begin_id,
+ size_t end_id,
+ size_t max_length,
+ size_t beam_size) {
SequenceGenerator* r =
SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
r->setDict(dict);
diff --git a/paddle/api/Internal.h b/paddle/api/Internal.h
index b990f650be9fa401898a8c6d10c21d9c90eb728a..66a13bc603ed5098997f168d3f527160ac3822ef 100644
--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "PaddleAPI.h"
@@ -23,7 +22,8 @@ limitations under the License. */
template
void staticCastVector(std::vector* dest, const std::vector& src) {
dest->resize(src.size());
- std::transform(src.begin(), src.end(), dest->begin(), [](T1 t){
- return static_cast(t);
- });
+ std::transform(src.begin(),
+ src.end(),
+ dest->begin(),
+ [](T1 t) { return static_cast(t); });
}
diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp
index e5493a381a6f9e3d135c14649a8e1e438494d363..f257ee65aa4a12dfcd1914ddbf0e16461a9b128c 100644
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
@@ -44,17 +43,21 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
return m;
}
-Matrix* Matrix::createDense(const std::vector& data, size_t height,
- size_t width, bool useGpu) {
+Matrix* Matrix::createDense(const std::vector& data,
+ size_t height,
+ size_t width,
+ bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::create(height, width, useGpu);
m->m->mat->copyFrom(data.data(), data.size());
return m;
}
-Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
- bool copy, bool useGpu)
- throw (UnsupportError) {
+Matrix* Matrix::createDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// Gpu mode only supports copy=True
if (!copy) {
@@ -66,7 +69,9 @@ Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
}
}
-Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+Matrix* Matrix::createCpuDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
bool copy) {
auto m = new Matrix();
if (copy) {
@@ -85,12 +90,20 @@ Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
return m;
}
-Matrix* Matrix::createSparse(size_t height, size_t width, size_t nnz,
- bool isNonVal, bool isTrans, bool useGpu) {
+Matrix* Matrix::createSparse(size_t height,
+ size_t width,
+ size_t nnz,
+ bool isNonVal,
+ bool isTrans,
+ bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::createSparseMatrix(
- height, width, nnz, isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
- isTrans, useGpu);
+ height,
+ width,
+ nnz,
+ isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+ isTrans,
+ useGpu);
return m;
}
@@ -221,7 +234,8 @@ FloatArray Matrix::getData() const {
}
void Matrix::sparseCopyFrom(
- const std::vector& rows, const std::vector& cols,
+ const std::vector& rows,
+ const std::vector& cols,
const std::vector& vals) throw(UnsupportError) {
auto cpuSparseMat =
std::dynamic_pointer_cast(m->mat);
@@ -240,7 +254,8 @@ void Matrix::sparseCopyFrom(
void* Matrix::getSharedPtr() const { return &m->mat; }
-void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
+void Matrix::toNumpyMatInplace(float** view_data,
+ int* dim1,
int* dim2) throw(UnsupportError) {
auto cpuMat = std::dynamic_pointer_cast(m->mat);
if (cpuMat) {
@@ -251,7 +266,8 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
throw UnsupportError();
}
}
-void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
+void Matrix::copyToNumpyMat(float** view_m_data,
+ int* dim1,
int* dim2) throw(UnsupportError) {
static_assert(sizeof(paddle::real) == sizeof(float),
"Currently PaddleAPI only support for single "
@@ -269,8 +285,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
} else if (auto gpuMat = dynamic_cast(m->mat.get())) {
auto src = gpuMat->getData();
auto dest = *view_m_data;
- hl_memcpy_device2host(dest, src,
- sizeof(paddle::real) * (*dim1) * (*dim2));
+ hl_memcpy_device2host(
+ dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
} else {
LOG(WARNING) << "Unexpected Situation";
throw UnsupportError();
@@ -278,7 +294,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
}
}
-void Matrix::copyFromNumpyMat(float* data, int dim1,
+void Matrix::copyFromNumpyMat(float* data,
+ int dim1,
int dim2) throw(UnsupportError, RangeError) {
if (isSparse()) {
throw UnsupportError();
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 5688ece44d2d58a2184a9f23d4af26c51c319579..c07facdb1292b34ac31247160a4347ea359e718b 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -61,8 +60,8 @@ class RangeError {};
/// Not support Error, such as access GPU memory directly, etc.
class UnsupportError : public std::runtime_error {
public:
- UnsupportError() : std::runtime_error(" ") {};
- UnsupportError(const std::string& message) : std::runtime_error(message) {};
+ UnsupportError() : std::runtime_error(" "){};
+ UnsupportError(const std::string& message) : std::runtime_error(message){};
};
/// This type will map to python's list of float.
@@ -112,7 +111,8 @@ public:
/**
* Create A Matrix with height,width, which is filled by zero.
*/
- static Matrix* createZero(size_t height, size_t width,
+ static Matrix* createZero(size_t height,
+ size_t width,
bool useGpu = isUsingGpu());
/**
@@ -124,8 +124,11 @@ public:
*
* @note the default sparse type is SPARSE_CSR.
*/
- static Matrix* createSparse(size_t height, size_t width, size_t nnz,
- bool isNonVal = true, bool trans = false,
+ static Matrix* createSparse(size_t height,
+ size_t width,
+ size_t nnz,
+ bool isNonVal = true,
+ bool trans = false,
bool useGpu = isUsingGpu());
/**
@@ -134,13 +137,17 @@ public:
* @param data list of float should be passed in python.
* @note the value will be copy into a new matrix.
*/
- static Matrix* createDense(const std::vector& data, size_t height,
- size_t width, bool useGpu = isUsingGpu());
-
- static Matrix* createDenseFromNumpy(float* data, int dim1, int dim2,
- bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static Matrix* createDense(const std::vector& data,
+ size_t height,
+ size_t width,
+ bool useGpu = isUsingGpu());
+
+ static Matrix* createDenseFromNumpy(
+ float* data,
+ int dim1,
+ int dim2,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Dense Matrix from numpy matrix, dtype=float32
@@ -151,7 +158,9 @@ public:
* @param copy true if copy into a new matrix, false will create
* matrix inplace.
*/
- static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+ static Matrix* createCpuDenseFromNumpy(float* data,
+ int dim1,
+ int dim2,
bool copy = false);
/// Create Gpu Dense Matrix from numpy matrix, dtype=float32
@@ -171,11 +180,13 @@ public:
* numpy_mat = m.toNumpyMat()
* @endcode
*/
- void toNumpyMatInplace(float** view_data, int* dim1,
+ void toNumpyMatInplace(float** view_data,
+ int* dim1,
int* dim2) throw(UnsupportError);
/// Copy To numpy mat.
- void copyToNumpyMat(float** view_m_data, int* dim1,
+ void copyToNumpyMat(float** view_m_data,
+ int* dim1,
int* dim2) throw(UnsupportError);
/// Copy From Numpy Mat
@@ -248,15 +259,18 @@ public:
static Vector* create(const std::vector& data,
bool useGpu = isUsingGpu());
- static Vector* createVectorFromNumpy(float* data, int dim, bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static Vector* createVectorFromNumpy(
+ float* data,
+ int dim,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Vector from numpy array, which dtype=float32
*
* If copy is false, it will create vector inplace.
*/
- static Vector* createCpuVectorFromNumpy(float* data, int dim,
+ static Vector* createCpuVectorFromNumpy(float* data,
+ int dim,
bool copy = false);
/// Create Gpu Vector from numpy array, which dtype=float32
@@ -312,16 +326,19 @@ public:
static IVector* create(const std::vector& data,
bool useGpu = isUsingGpu());
- static IVector* createVectorFromNumpy(int* data, int dim, bool copy = true,
- bool useGpu = isUsingGpu())
- throw (UnsupportError);
+ static IVector* createVectorFromNumpy(
+ int* data,
+ int dim,
+ bool copy = true,
+ bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu IVector from numpy array, which dtype=int32
*
* If copy is false, it will create vector inplace
*/
- static IVector* createCpuVectorFromNumpy(int* data, int dim,
+ static IVector* createCpuVectorFromNumpy(int* data,
+ int dim,
bool copy = false);
/**
* Create Gpu IVector from numpy array, which dtype=int32
@@ -605,7 +622,8 @@ class ParameterTraverseCallback {
public:
~ParameterTraverseCallback();
- void apply(const std::vector& vecs, const ParameterConfig& config,
+ void apply(const std::vector& vecs,
+ const ParameterConfig& config,
size_t sparseId);
private:
@@ -638,7 +656,8 @@ public:
void finishBatch();
- void update(const std::vector& vecs, const ParameterConfig& conf,
+ void update(const std::vector& vecs,
+ const ParameterConfig& conf,
size_t sparseId = NO_SPARSE_ID);
std::vector getParameterTypes() const;
@@ -678,7 +697,8 @@ public:
* model config by TrainerConfig
*/
static GradientMachine* createByModelConfig(
- ModelConfig* conf, GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
+ ModelConfig* conf,
+ GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
const std::vector& parameterTypes = defaultParamTypes);
/**
@@ -701,7 +721,8 @@ public:
/**
* Combine forward/backward
*/
- void forwardBackward(const Arguments& inArgs, Arguments* outArgs,
+ void forwardBackward(const Arguments& inArgs,
+ Arguments* outArgs,
PassType passType,
const UpdateCallback& callback = UpdateCallback());
@@ -722,14 +743,17 @@ public:
*/
SequenceGenerator* asSequenceGenerator(
const std::vector& dict = std::vector(),
- size_t begin_id = 0UL, size_t end_id = 0UL, size_t max_length = 100UL,
+ size_t begin_id = 0UL,
+ size_t end_id = 0UL,
+ size_t max_length = 100UL,
size_t beam_size = -1UL);
private:
GradientMachinePrivate* m;
static GradientMachine* createFromPaddleModelPtr(
- const void* confPtr, GradientMatchineCreateMode mode,
+ const void* confPtr,
+ GradientMatchineCreateMode mode,
const std::vector& types);
// Not to use c++ 11 init-list, so we use static var as function default arg.
@@ -751,8 +775,8 @@ public:
/// Create A Trainer By TrainerConfig. using paddle command line.
static Trainer* createByCommandLine() throw(IOError);
- static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm)
- throw(IOError);
+ static Trainer* create(TrainerConfig* optConfig,
+ GradientMachine* gm) throw(IOError);
/// Start training
void startTrain();
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index 8b56adc97c2d6178a9e0b272a9af89732a3573f6..c5876bb1c71438578831ffffd85840c706b6224c 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/parameter/Parameter.h"
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index b13761ab0900d57008c17094c5199ef31a040f54..21d031e4bcb897eb693e5cff56bc77a637dc6bd2 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/parameter/ParameterOptimizer.h"
@@ -32,17 +31,21 @@ struct ParameterTraverseCallbackPrivate {
const paddle::ParameterOptimizer::TraverseCallback& callback)
: callback(callback) {}
- void apply(const std::vector& vecs, const ParameterConfig& conf,
+ void apply(const std::vector& vecs,
+ const ParameterConfig& conf,
size_t sparseId) {
std::vector real_vecs;
real_vecs.resize(vecs.size());
- std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
- if (v) {
- return *(paddle::VectorPtr*)(v->getSharedPtr());
- } else {
- return paddle::VectorPtr();
- }
- });
+ std::transform(vecs.begin(),
+ vecs.end(),
+ real_vecs.begin(),
+ [](Vector* v) {
+ if (v) {
+ return *(paddle::VectorPtr*)(v->getSharedPtr());
+ } else {
+ return paddle::VectorPtr();
+ }
+ });
paddle::ParameterConfig& real_conf =
*(paddle::ParameterConfig*)(const_cast(conf)
@@ -86,10 +89,12 @@ void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
void ParameterOptimizer::update(const std::vector& vecs,
- const ParameterConfig& conf, size_t sparseId) {
- ParameterTraverseCallbackPrivate invoker([&](
- const paddle::VectorPtr _vecs[], const paddle::ParameterConfig& config,
- size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
+ const ParameterConfig& conf,
+ size_t sparseId) {
+ ParameterTraverseCallbackPrivate invoker(
+ [&](const paddle::VectorPtr _vecs[],
+ const paddle::ParameterConfig& config,
+ size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
invoker.apply(vecs, conf, sparseId);
}
@@ -116,8 +121,9 @@ void ParameterTraverseCallback::apply(const std::vector& vecs,
ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
const ParameterConfig& config) const {
- auto& param_config = *(paddle::ParameterConfig*)const_cast(
- config).getRawPtr();
+ auto& param_config =
+ *(paddle::ParameterConfig*)const_cast(config)
+ .getRawPtr();
auto callback = m->optimizer->needSpecialTraversal(param_config);
if (callback) {
auto retCallback = new ParameterTraverseCallback();
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 9d353ccc8e281e72a207ba19f45517fd256d6df2..d51be78d45902967107f4bf0af995958faed931a 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h"
#include "paddle/parameter/Argument.h"
@@ -42,8 +41,10 @@ struct Path {
// position
static void findNBest(paddle::GradientMachine* gradMachine,
std::vector& inArgs,
- std::vector& finalPaths, size_t bos_id,
- size_t eos_id, size_t max_length) {
+ std::vector& finalPaths,
+ size_t bos_id,
+ size_t eos_id,
+ size_t max_length) {
std::vector paths;
Path emptyPath;
paths.push_back(emptyPath);
@@ -166,7 +167,8 @@ public:
if (id < getSize()) {
Path& p = (*path_)[id];
std::ostringstream sout;
- std::transform(p.ids.begin(), p.ids.end(),
+ std::transform(p.ids.begin(),
+ p.ids.end(),
std::ostream_iterator(sout, split ? " " : ""),
[&](int id) { return (*dict_)[id]; });
return sout.str();
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index b61f36f740d47fe785b30361f26059bf0b64829d..7a6aa69fb652313748b1fa787847ffd74fda7a22 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -64,12 +64,11 @@ Trainer* Trainer::createByCommandLine() throw(IOError) {
Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
: m(new TrainerPrivate()) {
- m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr);
+ m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
}
-Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm)
- throw(IOError)
-{
+Trainer* Trainer::create(TrainerConfig* config,
+ GradientMachine* gm) throw(IOError) {
auto retv = new Trainer(config, gm);
if (retv->m->getConfig().IsInitialized()) {
return retv;
@@ -134,15 +133,17 @@ void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
Matrix* Trainer::getLayerOutput(const std::string& layerName) {
auto nn = std::dynamic_pointer_cast(
- this->m->getGradientMachine());
+ this->m->getGradientMachine());
CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
auto m = nn->getLayerOutput(layerName);
return Matrix::createByPaddleMatrixPtr(&m);
}
-void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); }
+void Trainer::forwardOneBatch(size_t batchSize) {
+ m->forwardOneBatch(batchSize);
+}
-bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
+bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
CHECK(dataProvider_) << "data_provider is not specified";
paddle::DataBatch dataBatch;
int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
@@ -156,7 +157,6 @@ bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
void TrainerPrivate::forwardOneDataBatch(
const std::vector& inArgs) {
-
std::vector& outArgs = forwardOutput_;
if (config_->getOptConfig().use_sparse_remote_updater()) {
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index a8932351a685474a756c3f5b0e5e8c42bbf58237..1bba1df2e1c0a2d3cd2d8307ed3a0d784bb949b4 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -37,13 +37,15 @@ FloatArray::FloatArray(const float* b, const size_t l)
IntArray::IntArray(const int* b, const size_t l, bool f)
: buf(b), length(l), needFree(f) {}
-IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l,
+IntWithFloatArray::IntWithFloatArray(const float* v,
+ const int* i,
+ size_t l,
bool f)
: valBuf(v), idxBuf(i), length(l), needFree(f) {}
-bool isUsingGpu() {return FLAGS_use_gpu;}
+bool isUsingGpu() { return FLAGS_use_gpu; }
-void setUseGpu(bool useGpu) {FLAGS_use_gpu = useGpu;}
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
bool isGpuVersion() {
#ifdef PADDLE_ONLY_CPU
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index d44cdefc35bd09e04412b52fb9981947caf89588..cc1c098223826a06fea291a95730d7fc1fd1beb3 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Vector.h"
@@ -39,8 +38,10 @@ IVector* IVector::create(const std::vector& data, bool useGpu) {
return v;
}
-IVector* IVector::createVectorFromNumpy(int* data, int dim, bool copy,
- bool useGpu) throw (UnsupportError){
+IVector* IVector::createVectorFromNumpy(int* data,
+ int dim,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=true is supported
if (!copy) {
@@ -137,8 +138,8 @@ void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
} else if (auto gpuVec = dynamic_cast(m->vec.get())) {
- hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
- sizeof(int) * (*dim1));
+ hl_memcpy_device2host(
+ *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}
@@ -201,8 +202,10 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) {
}
}
-Vector* Vector::createVectorFromNumpy(float* data, int dim, bool copy,
- bool useGpu) throw (UnsupportError){
+Vector* Vector::createVectorFromNumpy(float* data,
+ int dim,
+ bool copy,
+ bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=True is supported
if (!copy) {
@@ -251,8 +254,8 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
} else if (auto gpuVec = dynamic_cast(m->vec.get())) {
- hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
- sizeof(float) * (*dim1));
+ hl_memcpy_device2host(
+ *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index c8aabc7844cd48d7ebdd0077684f9efa50f941a2..03e15b2223a50625c6999f6b081ae984e76b182b 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_ACTIVATION_FUNCTIONS_H_
#define HL_ACTIVATION_FUNCTIONS_H_
@@ -21,11 +20,8 @@ limitations under the License. */
/**
* Active functions: sigmoid, relu, tanh and linear.
*/
-#define HPPL_ACTIVE_FUNCTION {hppl::sigmoid, \
- hppl::relu, \
- hppl::tanh, \
- hppl::linear \
- }
+#define HPPL_ACTIVE_FUNCTION \
+ { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
namespace hppl {
@@ -42,18 +38,18 @@ public:
#ifdef __NVCC__
namespace gpu {
-static __device__ Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static __device__ Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
static __device__ Active::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#else
namespace cpu {
-static Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#ifdef __AVX__
namespace avx {
-static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#endif
diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/cuda/include/hl_aggregate.h
index db75809f5de195d41577ed6569e8508f48241b69..a6d9ff8483eee28b2c8a380f0aca097c7662a02e 100644
--- a/paddle/cuda/include/hl_aggregate.h
+++ b/paddle/cuda/include/hl_aggregate.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AGGREGATE_H_
#define HL_AGGREGATE_H_
diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/cuda/include/hl_avx_functions.h
index cf062dd969bf79554e00369367e3b85c2ae7fc0d..ed339e312a7639cf9b78f130a43d67a7446576bb 100644
--- a/paddle/cuda/include/hl_avx_functions.h
+++ b/paddle/cuda/include/hl_avx_functions.h
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AVX_FUNCTIONS_H_
#define HL_AVX_FUNCTIONS_H_
#include
namespace hppl {
- __m256 relu(const __m256 a);
- __m256 sigmoid(const __m256 a);
- __m256 tanh(const __m256 a);
- __m256 linear(const __m256 a);
-
- __m256 relu(const __m256 a, const __m256 b);
- __m256 sigmoid(const __m256 a, const __m256 b);
- __m256 tanh(const __m256 a, const __m256 b);
- __m256 linear(const __m256 a, const __m256 b);
+__m256 relu(const __m256 a);
+__m256 sigmoid(const __m256 a);
+__m256 tanh(const __m256 a);
+__m256 linear(const __m256 a);
+
+__m256 relu(const __m256 a, const __m256 b);
+__m256 sigmoid(const __m256 a, const __m256 b);
+__m256 tanh(const __m256 a, const __m256 b);
+__m256 linear(const __m256 a, const __m256 b);
} // namespace hppl
#endif // HL_AVX_FUNCTIONS_H_
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 9f80898a1f927a0e8bbf86108567a04ccecc38f5..a076952467a5ce10dc1f58007dda2170aa694fbb 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
-
#ifndef HL_BASE_H_
#define HL_BASE_H_
@@ -33,36 +31,36 @@ limitations under the License. */
* HPPL_STREAM_DEFAULT is HPPL default stream.
*/
typedef enum {
- HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
- HPPL_STREAM_1 = 1,
- HPPL_STREAM_2 = 2,
- HPPL_STREAM_3 = 3,
- HPPL_STREAM_4 = 4,
- HPPL_THREAD_STREAM_1 = 5,
- HPPL_THREAD_STREAM_2 = 6,
- HPPL_THREAD_STREAM_3 = 7,
- HPPL_THREAD_STREAM_4 = 8,
- HPPL_STREAM_END
+ HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
+ HPPL_STREAM_1 = 1,
+ HPPL_STREAM_2 = 2,
+ HPPL_STREAM_3 = 3,
+ HPPL_STREAM_4 = 4,
+ HPPL_THREAD_STREAM_1 = 5,
+ HPPL_THREAD_STREAM_2 = 6,
+ HPPL_THREAD_STREAM_3 = 7,
+ HPPL_THREAD_STREAM_4 = 8,
+ HPPL_STREAM_END
} hl_stream_t;
/**
* @brief HPPL activation mode.
*/
typedef enum {
- HL_ACTIVATION_SIGMOID = 0,
- HL_ACTIVATION_RELU = 1,
- HL_ACTIVATION_TANH = 2,
- HL_ACTIVATION_LINEAR = 3,
- HL_ACTIVATION_END
+ HL_ACTIVATION_SIGMOID = 0,
+ HL_ACTIVATION_RELU = 1,
+ HL_ACTIVATION_TANH = 2,
+ HL_ACTIVATION_LINEAR = 3,
+ HL_ACTIVATION_END
} hl_activation_mode_t;
/**
* @brief Transpose type.
*/
typedef enum {
- HPPL_OP_N = 0, /* transpose */
- HPPL_OP_T = 1, /* non transpose */
- HPPL_OP_END
+ HPPL_OP_N = 0, /* transpose */
+ HPPL_OP_T = 1, /* non transpose */
+ HPPL_OP_END
} hl_trans_op_t;
/**
@@ -148,23 +146,21 @@ typedef struct {
* @brief Sparse matrix value type.
*/
typedef enum {
- HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
- HL_FLOAT_VALUE = 1,
- HL_VALUE_END
+ HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
+ HL_FLOAT_VALUE = 1,
+ HL_VALUE_END
} hl_matrix_value_t;
-
/**
* @brief HPPL matrix format.
*/
typedef enum {
- HL_SPARSE_CSR = 0,
- HL_SPARSE_CSC = 1,
- HL_SPARSE_END
+ HL_SPARSE_CSR = 0,
+ HL_SPARSE_CSC = 1,
+ HL_SPARSE_END
} hl_matrix_format_t;
-
-typedef struct _hl_matrix_s * hl_matrix_s;
+typedef struct _hl_matrix_s *hl_matrix_s;
/**
* @brief HPPL sparse matrix.
@@ -177,12 +173,12 @@ typedef struct _hl_matrix_s * hl_matrix_s;
* @param nnz nonzero values of sparse matrix.
*/
typedef struct {
- hl_matrix_s matrix;
- hl_matrix_format_t format;
- hl_matrix_value_t type;
- int rows;
- int cols;
- size_t nnz;
+ hl_matrix_s matrix;
+ hl_matrix_format_t format;
+ hl_matrix_value_t type;
+ int rows;
+ int cols;
+ size_t nnz;
} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
#ifndef PADDLE_TYPE_DOUBLE
@@ -195,7 +191,7 @@ typedef struct {
*
* HL_FLOAT_MIN: 1.17549435e-38F
*/
-#define HL_FLOAT_MAX 3.40282347e+38F
+#define HL_FLOAT_MAX 3.40282347e+38F
/**
* if real == double
*
@@ -203,20 +199,18 @@ typedef struct {
*
* HL_FLOAT_MIN: 2.2250738585072014e-308
*/
-#define HL_FLOAT_MIN 1.17549435e-38F
+#define HL_FLOAT_MIN 1.17549435e-38F
#else
-#define HL_FLOAT_MAX 1.7976931348623157e+308
-#define HL_FLOAT_MIN 2.2250738585072014e-308
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
#endif
-
/**
* The maximum input value for exp, used to avoid overflow problem.
*
* Currently only used for tanh function.
*/
-#define EXP_MAX_INPUT 40.0
-
+#define EXP_MAX_INPUT 40.0
/**
* @brief DIVUP(x, y) is similar to ceil(x / y).
@@ -224,7 +218,7 @@ typedef struct {
* the size of blockDim.
*/
#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
#endif
#ifdef __NVCC__
@@ -233,7 +227,7 @@ typedef struct {
#include "hl_cuda.h"
#include "cuda_runtime.h"
-extern __thread bool g_sync_flag;
+extern __thread bool g_sync_flag;
extern __thread cudaStream_t default_stream;
#define STREAM_DEFAULT default_stream
@@ -241,16 +235,15 @@ extern __thread cudaStream_t default_stream;
* @brief Check cuda kernel execution.
* @param msg error string
*/
-#define CHECK_SYNC(msg) \
- if (true == g_sync_flag) { \
- hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
- cudaError_t err \
- = (cudaError_t)hl_get_device_last_error(); \
- CHECK_EQ(cudaSuccess, err) << "[" << msg << "] " \
- << "CUDA error: " \
- << hl_get_device_error_string((size_t)err); \
+#define CHECK_SYNC(msg) \
+ if (true == g_sync_flag) { \
+ hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
+ cudaError_t err = (cudaError_t)hl_get_device_last_error(); \
+ CHECK_EQ(cudaSuccess, err) \
+ << "[" << msg << "] " \
+ << "CUDA error: " << hl_get_device_error_string((size_t)err); \
}
-#endif /* __NVCC__ */
+#endif /* __NVCC__ */
-#endif /* HL_BASE_H_ */
+#endif /* HL_BASE_H_ */
diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/cuda/include/hl_batch_transpose.h
index 414c7996acee4ccbe2d7dbd093e25a23119fea3c..f3630e9762508fd39935e62e0007de04f9140fff 100644
--- a/paddle/cuda/include/hl_batch_transpose.h
+++ b/paddle/cuda/include/hl_batch_transpose.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_BATCH_TRANSPOSE_H_
#define HL_BATCH_TRANSPOSE_H_
@@ -31,10 +30,7 @@ limitations under the License. */
* order. Each batch has height * width data, which are
* arranged in height-first (or row-first) manner.
*/
-extern void batchTranspose(const real* input,
- real* output,
- int width,
- int height,
- int batchSize);
+extern void batchTranspose(
+ const real* input, real* output, int width, int height, int batchSize);
#endif // HL_BATCH_TRANSPOSE_H_
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 70b5be6fda2509853029a68d31129df28d580942..cffaac634f0f64be5ddab961d549ae43775bb7b0 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CNN_H_
#define HL_CNN_H_
@@ -37,15 +36,21 @@ limitations under the License. */
* @param[in] alpha
* @param[in] beta
*/
-extern void hl_shrink_col2feature(
- const real * dataCol, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataIm,
- real alpha = 1.0f, real beta = 0.0f);
+extern void hl_shrink_col2feature(const real* dataCol,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataIm,
+ real alpha = 1.0f,
+ real beta = 0.0f);
/**
* @brief Expand feature to column.
@@ -65,14 +70,19 @@ extern void hl_shrink_col2feature(
* @param[out] dataCol expand data.
*
*/
-extern void hl_expand_feature2col(
- const real* dataIm, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataCol);
+extern void hl_expand_feature2col(const real* dataIm,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataCol);
/**
* @brief Maximum pool forward.
@@ -94,15 +104,21 @@ extern void hl_expand_feature2col(
* @param[in] tgtStride stride between output data samples.
*
*/
-extern void hl_maxpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride);
+extern void hl_maxpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride);
/**
* @brief Maximum pool backward.
@@ -125,20 +141,28 @@ extern void hl_maxpool_forward(
* @param[in] paddingH padding height.
* @param[in] paddingW padding width.
* @param[out] targetGrad output grad.
- * @param[in] outStride stride between output data samples.
+ * @param[in] outStride stride between output data samples.
*
*/
-extern void hl_maxpool_backward(
- const int frameCnt, const real* inputData,
- const real* outData, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real scaleA, real scaleB,
- real* targetGrad, const int outStride);
+extern void hl_maxpool_backward(const int frameCnt,
+ const real* inputData,
+ const real* outData,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real scaleA,
+ real scaleB,
+ real* targetGrad,
+ const int outStride);
/**
* @brief Averge pool forward.
@@ -160,15 +184,21 @@ extern void hl_maxpool_backward(
* @param[in] tgtStride stride between output data samples.
*
*/
-extern void hl_avgpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride);
+extern void hl_avgpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride);
/**
* @brief Maximum pool backward.
@@ -189,19 +219,26 @@ extern void hl_avgpool_forward(
* @param[in] scaleA scale.
* @param[in] scaleB scale.
* @param[out] backGrad output grad.
- * @param[in] outStride stride between output data samples.
+ * @param[in] outStride stride between output data samples.
*
*/
-extern void hl_avgpool_backward(
- const int frameCnt, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- int paddingH, int paddingW,
- real scaleA, real scaleB,
- real* backGrad, const int outStride);
+extern void hl_avgpool_backward(const int frameCnt,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ int paddingH,
+ int paddingW,
+ real scaleA,
+ real scaleB,
+ real* backGrad,
+ const int outStride);
/**
* @brief Cross-map-respose normalize forward.
@@ -218,10 +255,16 @@ extern void hl_avgpool_backward(
* @param[in] beta scale.
*
*/
-extern void hl_CMRNorm_forward(
- size_t frameCnt, const real* in, real* scale, real* out,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta);
+extern void hl_CMRNorm_forward(size_t frameCnt,
+ const real* in,
+ real* scale,
+ real* out,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta);
/**
* @brief Cross-map-respose normalize backward.
@@ -240,11 +283,18 @@ extern void hl_CMRNorm_forward(
* @param[in] beta scale.
*
*/
-extern void hl_CMRNorm_backward(
- size_t frameCnt, const real* inV, const real* scale,
- const real* outV, const real* outDiff, real *inDiff,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta);
+extern void hl_CMRNorm_backward(size_t frameCnt,
+ const real* inV,
+ const real* scale,
+ const real* outV,
+ const real* outDiff,
+ real* inDiff,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta);
/**
* @brief Bilinear interpolation forward.
@@ -278,24 +328,24 @@ extern void hl_bilinear_forward(const real* inData,
const real ratioH,
const real ratioW);
- /**
- * @brief Bilinear interpolation backward.
- *
- * @param[out] inGrad input gradient.
- * @param[in] inImgH input image height.
- * @param[in] inImgW input image width.
- * @param[in] inputH input batchSize.
- * @param[in] inputW input image data dim.
- * @param[in] outGrad output gradient.
- * @param[in] outImgH output image height.
- * @param[in] outImgW output image width.
- * @param[in] outputH output batchSize.
- * @param[in] outputW output image data dim.
- * @param[in] numChannels number of channels.
- * @param[in] ratioH inImgH / outImgH.
- * @param[in] ratioW inImgW / outImgW.
- *
- */
+/**
+* @brief Bilinear interpolation backward.
+*
+* @param[out] inGrad input gradient.
+* @param[in] inImgH input image height.
+* @param[in] inImgW input image width.
+* @param[in] inputH input batchSize.
+* @param[in] inputW input image data dim.
+* @param[in] outGrad output gradient.
+* @param[in] outImgH output image height.
+* @param[in] outImgW output image width.
+* @param[in] outputH output batchSize.
+* @param[in] outputW output image data dim.
+* @param[in] numChannels number of channels.
+* @param[in] ratioH inImgH / outImgH.
+* @param[in] ratioW inImgW / outImgW.
+*
+*/
extern void hl_bilinear_backward(real* inGrad,
const size_t inImgH,
const size_t inImgW,
@@ -321,9 +371,13 @@ extern void hl_bilinear_backward(real* inGrad,
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
-extern void hl_maxout_forward(
- const real* inData, real* outData, int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_forward(const real* inData,
+ real* outData,
+ int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t groups);
/**
* @brief MaxOut backward.
@@ -336,8 +390,12 @@ extern void hl_maxout_forward(
* @param[in] featLen feature length = image height * image width.
* @param[in] groups number of groups.
*/
-extern void hl_maxout_backward(
- real* inGrad, const real* outGrad, const int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_backward(real* inGrad,
+ const real* outGrad,
+ const int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t groups);
#endif /* HL_CNN_H_ */
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h
index 3196db67f61fd2e6b75df4abb3652df4456a0366..357286e3188a6f3184bc56e75232bf2e1ec54e44 100644
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_H_
#define HL_CUDA_H_
@@ -22,8 +21,7 @@ limitations under the License. */
/**
* @brief HPPL event.
*/
-typedef struct _hl_event_st * hl_event_t;
-
+typedef struct _hl_event_st *hl_event_t;
/**
* @brief return cuda runtime api version.
@@ -42,7 +40,7 @@ extern void hl_start();
* if device is NULL, will start all GPU.
* @param[in] number number of devices.
*/
-extern void hl_specify_devices_start(int* device, int number);
+extern void hl_specify_devices_start(int *device, int number);
/**
* @brief Queries if a device may directly access a peer device's memory.
@@ -126,7 +124,7 @@ extern int hl_get_device();
*
* @return dest_d pointer to device memory.
*/
-extern void* hl_malloc_device(size_t size);
+extern void *hl_malloc_device(size_t size);
/**
* @brief Free device memory.
@@ -143,7 +141,7 @@ extern void hl_free_mem_device(void *dest_d);
*
* @return dest_h pointer to host memory.
*/
-extern void* hl_malloc_host(size_t size);
+extern void *hl_malloc_host(size_t size);
/**
* @brief Free host page-lock memory.
@@ -228,9 +226,9 @@ extern void hl_srand(unsigned int seed);
* @param[in] stream stream id.
*/
extern void hl_memcpy_async(void *dst,
- void *src,
- size_t size,
- hl_stream_t stream);
+ void *src,
+ size_t size,
+ hl_stream_t stream);
/**
* @brief Waits for stream tasks to complete.
@@ -261,8 +259,7 @@ extern void hl_destroy_event(hl_event_t event);
*
* @return time Time between start and end in ms.
*/
-extern float hl_event_elapsed_time(hl_event_t start,
- hl_event_t end);
+extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end);
/**
* @brief Records an event.
@@ -300,7 +297,7 @@ extern void hl_set_device_flags_block();
/**
* @brief Returns the last error string from a cuda runtime call.
*/
-extern const char* hl_get_device_error_string();
+extern const char *hl_get_device_error_string();
/**
* @brief Returns the last error string from a cuda runtime call.
@@ -309,7 +306,7 @@ extern const char* hl_get_device_error_string();
*
* @see hl_get_device_last_error()
*/
-extern const char* hl_get_device_error_string(size_t err);
+extern const char *hl_get_device_error_string(size_t err);
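
Taken together, the allocation, copy and stream entry points in this header compose as in the rough sketch below. HPPL_STREAM_DEFAULT is assumed to be a stream id defined in hl_base.h, which this patch does not touch, and the buffer size is illustrative.

const size_t bytes = 1024 * sizeof(real);
real *host_buf = (real *)hl_malloc_host(bytes);    // page-locked host memory
real *dev_buf  = (real *)hl_malloc_device(bytes);  // device memory

// ... fill host_buf on the CPU ...
hl_memcpy_async(dev_buf, host_buf, bytes, HPPL_STREAM_DEFAULT);  // async H2D copy
hl_stream_synchronize(HPPL_STREAM_DEFAULT);                      // wait for the copy to finish

hl_free_mem_device(dev_buf);
hl_free_mem_host(host_buf);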
/**
* @brief Returns the last error number.
diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/cuda/include/hl_cuda_cublas.h
index d757317eb4a97559feef22d4fd8edf7c10ca6745..db8c03c2c01c67788622d37b5330e22c31e03f34 100644
--- a/paddle/cuda/include/hl_cuda_cublas.h
+++ b/paddle/cuda/include/hl_cuda_cublas.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUBLAS_H_
#define HL_CUDA_CUBLAS_H_
@@ -29,12 +28,8 @@ limitations under the License. */
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc);
+extern void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc);
/*
* @brief Matrix transpose, while lda = dimN, ldc = dimM.
@@ -45,10 +40,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN);
+extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN);
/*
* @brief Matrix inverse
@@ -60,11 +52,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] ldc the first dimension of C_d
*
*/
-extern void hl_matrix_inverse(real *A_d,
- real *C_d,
- int dimN,
- int lda,
- int ldc);
+extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -84,12 +72,19 @@ extern void hl_matrix_inverse(real *A_d,
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -106,11 +101,16 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -132,11 +132,17 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta,
- int lda, int incb, int incc);
+extern void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta,
+ int lda,
+ int incb,
+ int incc);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -154,9 +160,13 @@ extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta);
+extern void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta);
#endif /* HL_CUDA_CUBLAS_H_ */
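
As a sketch of the GEMM formula documented above, C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d, using the overload without explicit leading dimensions. A_d, B_d and C_d are placeholder device pointers, and HPPL_OP_N is assumed to be the no-transpose member of hl_trans_op_t (HPPL_OP_T is the transpose flag referenced elsewhere in these headers).

// C (M x N) = 1.0 * A (M x K) * B (K x N) + 0.0 * C, nothing transposed.
const int M = 64, N = 32, K = 128;
hl_matrix_mul(A_d, HPPL_OP_N,
              B_d, HPPL_OP_N,
              C_d,
              M, N, K,
              /*alpha=*/1.0, /*beta=*/0.0);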
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index f256cb54dfe69e8df7cc7fcc0ed0a58f3574acd3..3a2f916210277145efa8f6d7663a2698ea546b0b 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUDNN_H_
#define HL_CUDA_CUDNN_H_
@@ -22,7 +21,7 @@ limitations under the License. */
* hppl pooling mode
*/
typedef enum {
- HL_POOLING_MAX = 0,
+ HL_POOLING_MAX = 0,
// average includes padded values
HL_POOLING_AVERAGE = 1,
// average does not include padded values
@@ -324,17 +323,16 @@ extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdFilterAlgo backward filter algorithm.
*/
-extern void hl_convolution_backward_filter(
- hl_tensor_descriptor input,
- real* input_data,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_grad_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdFilterAlgo);
+extern void hl_convolution_backward_filter(hl_tensor_descriptor input,
+ real* input_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_grad_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdFilterAlgo);
/**
* @brief convolution backward data(calculate input image grad data).
@@ -350,17 +348,16 @@ extern void hl_convolution_backward_filter(
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdDataAlgo backward data algorithm.
*/
-extern void hl_convolution_backward_data(
- hl_tensor_descriptor input,
- real* input_data_grad,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdDataAlgo);
+extern void hl_convolution_backward_data(hl_tensor_descriptor input,
+ real* input_data_grad,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdDataAlgo);
/**
* @brief convolution backward bias(calculate bias grad data).
@@ -383,8 +380,8 @@ extern void hl_convolution_backward_bias(hl_tensor_descriptor bias,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_forward(real *input,
- real *output,
+extern void hl_softmax_forward(real* input,
+ real* output,
int height,
int width);
@@ -396,8 +393,8 @@ extern void hl_softmax_forward(real *input,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_backward(real *output_value,
- real *output_grad,
+extern void hl_softmax_backward(real* output_value,
+ real* output_grad,
int height,
int width);
@@ -426,18 +423,18 @@ extern void hl_softmax_backward(real *output_value,
*
*/
extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar);
+ real* savedMean,
+ real* savedVar);
/**
* @brief cudnn batch norm forward.
@@ -463,14 +460,14 @@ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedVar,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedVar,
double epsilon);
/**
@@ -483,7 +480,8 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
* @param[in] inGradDesc input tensor descriptor desc.
* @param[in] inGrad input data.
* @param[in] dBnParamDesc tensor descriptor desc.
- * bnScale, bnBias, running mean/var, save_mean/var.
+ * bnScale, bnBias, running mean/var,
+ * save_mean/var.
* @param[in] scale batch normalization scale parameter (in original
* paper scale is referred to as gamma).
* @param[in] scaleGrad batch normalization scale parameter (in original
@@ -497,17 +495,17 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar);
+ real* savedMean,
+ real* savedInvVar);
#endif // HL_CUDA_CUDNN_H_
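
A hedged sketch of how the training-time batch-norm entry points above pair up. Descriptors, parameter buffers and gradients are placeholders, and factor/epsilon are illustrative values; at inference time hl_batch_norm_forward_inference would use the accumulated estimatedMean/estimatedVar instead.

// Forward (training): normalizes input, updates runningMean/runningInvVar with
// weight `factor`, and caches savedMean/savedVar for the backward pass.
hl_batch_norm_forward_training(inputDesc, input, outputDesc, output,
                               bnParamDesc, scale, bias,
                               /*factor=*/0.1,
                               runningMean, runningInvVar,
                               /*epsilon=*/1e-5,
                               savedMean, savedVar);

// Backward: consumes the cached statistics and produces the input, scale and
// bias gradients in a single call.
hl_batch_norm_backward(inputDesc, input, outGradDesc, outGrad,
                       inGradDesc, inGrad, dBnParamDesc,
                       scale, scaleGrad, biasGrad,
                       /*epsilon=*/1e-5,
                       savedMean, savedInvVar);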
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index f36c724e2da3dce11696fcda7daf98f5cda36dd6..1eb9f9ca888d3a93f04621e10346b5f9ff34cdca 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_DSO_LOADER_H_
#define HL_DSO_LOADER_H_
diff --git a/paddle/cuda/include/hl_functions.h b/paddle/cuda/include/hl_functions.h
index 65f366461ced0f9ee31ff9075f6dfaeb6c9b72a2..91ce9a0678463597df88c548aeac322ee19d95de 100644
--- a/paddle/cuda/include/hl_functions.h
+++ b/paddle/cuda/include/hl_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_FUNCTIONS_H_
#define HL_FUNCTIONS_H_
@@ -21,30 +20,30 @@ limitations under the License. */
/**
* sigmoid threshold minimum
*/
-#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MIN -40.0
/**
* sigmoid threshold maximum
*/
-#define SIGMOID_THRESHOLD_MAX 13.0
+#define SIGMOID_THRESHOLD_MAX 13.0
#ifndef __NVCC__
namespace hppl {
- /*
- * forward activation
- */
- real relu(const real a);
- real sigmoid(const real a);
- real tanh(const real a);
- real linear(const real a);
-
- /*
- * backward activation
- */
- real relu(const real a, const real b);
- real sigmoid(const real a, const real b);
- real tanh(const real a, const real b);
- real linear(const real a, const real b);
+/*
+ * forward activation
+ */
+real relu(const real a);
+real sigmoid(const real a);
+real tanh(const real a);
+real linear(const real a);
+
+/*
+ * backward activation
+ */
+real relu(const real a, const real b);
+real sigmoid(const real a, const real b);
+real tanh(const real a, const real b);
+real linear(const real a, const real b);
} // namespace hppl
#ifdef __AVX__
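
The two thresholds above exist so the sigmoid argument can be clamped before the exponential; below is a minimal sketch of that clamped form. It illustrates the idea only and is not the library's actual definition from the .cc/.cu sources.

#include <math.h>

real sigmoid_clamped(real a) {
  // Clip to [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX] so exp() is never
  // evaluated at an extreme argument.
  real x = a < SIGMOID_THRESHOLD_MIN ? SIGMOID_THRESHOLD_MIN
         : a > SIGMOID_THRESHOLD_MAX ? SIGMOID_THRESHOLD_MAX : a;
  return 1.0 / (1.0 + exp(-x));
}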
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index 05039663b6e9f5e4a72f15ab822d723635f9b282..3be0df3b93b69811fb9c36dae223cbd927b02559 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_GPU_H_
#define HL_GPU_H_
diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/cuda/include/hl_lstm.h
index 1f95e318a1fe06050bbd31c2e276974f4a8bdc1e..7e527a79025969320f1aca75d313fd9d0194efd1 100644
--- a/paddle/cuda/include/hl_lstm.h
+++ b/paddle/cuda/include/hl_lstm.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_LSTM_H_
#define HL_LSTM_H_
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index 6195e30b9974d3ad092b4cf604e6b74fa481835c..96648661e345d8fa5d50cb2aae3a56ee53921f90 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_MATRIX_H_
#define HL_MATRIX_H_
@@ -30,13 +29,8 @@ limitations under the License. */
* @param[in] beta scalar used for addition.
*
*/
-extern void hl_matrix_add(real* A_d,
- real* B_d,
- real* C_d,
- int dimM,
- int dimN,
- real alpha,
- real beta);
+extern void hl_matrix_add(
+ real* A_d, real* B_d, real* C_d, int dimM, int dimN, real alpha, real beta);
/**
* @brief Matrix Softmax.
*
@@ -46,7 +40,7 @@ extern void hl_matrix_add(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
+extern void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN);
/**
* @brief Matrix softmax derivative.
@@ -58,11 +52,8 @@ extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_softmax_derivative(real* grad_d,
- real* output_d,
- real* sftmaxSum_d,
- int dimM,
- int dimN);
+extern void hl_matrix_softmax_derivative(
+ real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN);
/**
* @brief Sequence softmax.
@@ -73,8 +64,8 @@ extern void hl_matrix_softmax_derivative(real* grad_d,
* @param[in] numSequence sequence number.
*
*/
-extern void hl_sequence_softmax_forward(real *A_d,
- real *C_d,
+extern void hl_sequence_softmax_forward(real* A_d,
+ real* C_d,
const int* index,
int numSequence);
@@ -88,11 +79,8 @@ extern void hl_sequence_softmax_forward(real *A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_classification_error(real* A_d,
- int* B_d,
- real* C_d,
- int dimM,
- int dimN);
+extern void hl_matrix_classification_error(
+ real* A_d, int* B_d, real* C_d, int dimM, int dimN);
/**
* @brief Matrix cross entropy.
@@ -104,11 +92,8 @@ extern void hl_matrix_classification_error(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_cross_entropy(real* A_d,
- real* C_d,
- int* label_d,
- int dimM,
- int dimN);
+extern void hl_matrix_cross_entropy(
+ real* A_d, real* C_d, int* label_d, int dimM, int dimN);
/**
* @brief Matrix cross entropy back propagation.
@@ -120,11 +105,8 @@ extern void hl_matrix_cross_entropy(real* A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_cross_entropy_bp(real* grad_d,
- real* output_d,
- int* label_d,
- int dimM,
- int dimN);
+extern void hl_matrix_cross_entropy_bp(
+ real* grad_d, real* output_d, int* label_d, int dimM, int dimN);
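
For context, a rough sketch of how the softmax and the two cross-entropy calls above chain together for a classifier over dimN classes. All buffers (logits_d, prob_d, loss_d, grad_d, label_d) are placeholder device pointers, and whether the entropy routine expects probabilities or raw activations is not spelled out by this header, so treat the sequence as illustrative.

const int dimM = 128, dimN = 10;   // batch size, number of classes
hl_matrix_softmax(logits_d, prob_d, dimM, dimN);                  // row-wise softmax
hl_matrix_cross_entropy(prob_d, loss_d, label_d, dimM, dimN);     // per-sample loss
hl_matrix_cross_entropy_bp(grad_d, prob_d, label_d, dimM, dimN);  // gradient w.r.t. the output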
/**
* @brief Matrix multi-binary label cross entropy
@@ -135,11 +117,8 @@ extern void hl_matrix_cross_entropy_bp(real* grad_d,
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*/
-extern void hl_matrix_multi_binary_cross_entropy(real* output,
- real* entropy,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN);
+extern void hl_matrix_multi_binary_cross_entropy(
+ real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN);
/**
* @brief Matrix multi-binary label cross entropy backprop
@@ -150,11 +129,8 @@ extern void hl_matrix_multi_binary_cross_entropy(real* output,
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*/
-extern void hl_matrix_multi_binary_cross_entropy_bp(real* output,
- real* grad,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN);
+extern void hl_matrix_multi_binary_cross_entropy_bp(
+ real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN);
/**
* @brief Matrix zero memory.
@@ -176,12 +152,8 @@ extern void hl_matrix_zero_mem(real* data, int num);
* @param[in] partial_sum
*/
-extern void hl_param_relu_forward(real* output,
- real* input,
- real* w,
- int width,
- int height,
- int partial_sum);
+extern void hl_param_relu_forward(
+ real* output, real* input, real* w, int width, int height, int partial_sum);
/**
* @brief parameter relu backward w
*
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 46d86b2982f065802eec83ca7554f787d1d02f3a..bb5124df44b492bd8fdeb2a0c75ebcf74d2c8157 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SEQUENCE_H_
#define HL_SEQUENCE_H_
@@ -32,7 +31,7 @@ limitations under the License. */
extern void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
- int *index,
+ int* index,
int numSequences,
int dim);
@@ -46,11 +45,8 @@ extern void hl_max_sequence_forward(real* input,
* @param[in] dim input dimension.
*
*/
-extern void hl_max_sequence_backward(real* outputGrad,
- int *index,
- real* inputGrad,
- int numSequences,
- int dim);
+extern void hl_max_sequence_backward(
+ real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
/**
* @brief Context projection forward.
@@ -63,7 +59,8 @@ extern void hl_max_sequence_backward(real* outputGrad,
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
- * @param[in] beginPad number of extra timesteps added at the beginning.
+ * @param[in] beginPad number of extra timesteps added at the
+ * beginning.
* @param[in] isPadding trainable padding.
*
*/
@@ -109,7 +106,8 @@ extern void hl_context_projection_backward_data(real* outputGrad,
* @param[in] totalPad number of extra timesteps.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
- * @param[in] beginPad number of extra timesteps added at the beginning.
+ * @param[in] beginPad number of extra timesteps added at the
+ * beginning.
*
*/
extern void hl_context_projection_backward_weight(real* outputGrad,
@@ -141,9 +139,9 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
* @param[in] seq2batch copy direction.
*
*/
-extern void hl_sequence2batch_copy(real *batch,
- real *sequence,
- const int *batchIndex,
+extern void hl_sequence2batch_copy(real* batch,
+ real* sequence,
+ const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
@@ -167,9 +165,9 @@ extern void hl_sequence2batch_copy(real *batch,
* @param[in] seq2batch copy direction.
*
*/
-extern void hl_sequence2batch_add(real *batch,
- real *sequence,
- int *batchIndex,
+extern void hl_sequence2batch_add(real* batch,
+ real* sequence,
+ int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch);
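
A short sketch of the reorder pair above as the parameter lists read: batchIndex maps rows between the two layouts, the boolean picks the copy direction, and the *_add variant accumulates instead of overwriting. Buffer names are placeholders.

// Gather sequence-major activations into batch-major order for the RNN step...
hl_sequence2batch_copy(batch_d, sequence_d, batchIndex_d,
                       seqWidth, batchCount, /*seq2batch=*/true);
// ...and scatter-add gradients back into the sequence layout afterwards.
hl_sequence2batch_add(batch_d, sequence_d, batchIndex_d,
                      seqWidth, batchCount, /*seq2batch=*/false);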
diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/cuda/include/hl_sparse.h
index 9acdebdebf37761e1485e3441963586ead9f3c85..c4e0be23e2031cbcb124b532216a23d8a344668d 100644
--- a/paddle/cuda/include/hl_sparse.h
+++ b/paddle/cuda/include/hl_sparse.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SPARSE_H_
#define HL_SPARSE_H_
@@ -31,7 +30,7 @@ limitations under the License. */
*/
extern void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -60,10 +59,10 @@ extern void hl_free_sparse_matrix(hl_sparse_matrix_s A_d);
*
*/
extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- void * dest_d,
+ void *dest_d,
size_t size,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -94,11 +93,11 @@ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
*
*/
extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- real* value_d,
- int* rows_d,
- int* cols_d,
+ real *value_d,
+ int *rows_d,
+ int *cols_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz);
@@ -259,10 +258,14 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
*/
extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
@@ -311,11 +314,16 @@ extern void hl_matrix_dense_mul_csc(real *A_d,
* @note transb does not support HPPL_OP_T.
*
*/
-extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+extern void hl_sparse_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
hl_sparse_matrix_s C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -336,12 +344,16 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
* @note transa does not support HPPL_OP_T.
*
*/
-extern void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+extern void hl_matrix_dense_mul_csr(real *A_d,
+ hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta);
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta);
/**
* @brief Memcpy csc_matrix to host.
@@ -412,7 +424,6 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
hl_sparse_matrix_s csr_matrix,
hl_stream_t stream);
-
/**
* @brief A_d[j] += B_d[i,j] for i in range(height)
*
@@ -423,19 +434,13 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
* @param[in] scale scale of B_d
*
*/
-extern void hl_sparse_matrix_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale);
+extern void hl_sparse_matrix_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
/**
* @brief implementation of csr sparse matrix in hl_sparse_matrix_column_sum
*/
-extern void hl_matrix_csr_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale);
+extern void hl_matrix_csr_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
/**
* @brief A_d[i,j] += B_d[j]
@@ -446,13 +451,13 @@ extern void hl_matrix_csr_column_sum(real* A_d,
*
*/
extern void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale);
/**
* @brief implementation of csr sparse matrix in hl_sparse_matrix_add_bias
*/
extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale);
/**
@@ -470,7 +475,7 @@ extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
*
*/
extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
@@ -479,7 +484,7 @@ extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
* @brief implementation of csr sparse matrix in hl_sparse_matrix_add_dense
*/
extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
@@ -493,7 +498,7 @@ extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
* @return return rows pointer, which is gpu address
*
*/
-extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
/**
* @brief get cols pointer of GpuSparseMatrix
@@ -503,7 +508,7 @@ extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
* @return return cols pointer, which is gpu address
*
*/
-extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
/**
* @brief get value pointer of GpuSparseMatrix
@@ -513,7 +518,6 @@ extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
* @return return value pointer, which is gpu address
*
*/
-extern real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
-
+extern real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
#endif /* HL_SPARSE_H_ */
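
As a rough end-to-end sketch of this header: wrap existing device CSR buffers into a handle, then use it on the right-hand side of a dense-by-sparse product. HL_SPARSE_CSR is referenced by this header; HL_FLOAT_VALUE and HPPL_OP_N are assumed to be the value-type and no-transpose flags from hl_base.h, the dimensions and buffers are placeholders, and cleanup of the constructed handle is omitted.

hl_sparse_matrix_s B_sparse;
hl_construct_sparse_matrix(&B_sparse, value_d, rows_d, cols_d,
                           HL_SPARSE_CSR, HL_FLOAT_VALUE,
                           dimK, dimN, nnz);              // B is dimK x dimN with nnz nonzeros
// C (dimM x dimN) = 1.0 * A (dimM x dimK) * B + 0.0 * C
hl_matrix_dense_mul_csr(A_d, HPPL_OP_N, B_sparse, HPPL_OP_N, C_d,
                        dimM, dimN, dimK, 1.0, 0.0);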
diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/cuda/include/hl_table_apply.h
index 3c9428e9253d5ed563e4e9f62d8842667496b83c..b4ac83a66af13c2a843872faba2ebd972008a738 100644
--- a/paddle/cuda/include/hl_table_apply.h
+++ b/paddle/cuda/include/hl_table_apply.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TABLE_APPLY_H_
#define HL_TABLE_APPLY_H_
@@ -31,8 +30,10 @@ limitations under the License. */
* @param[in] dim width of table.
*
*/
-extern void hl_matrix_select_rows(real* output, int ldo,
- real* table, int ldt,
+extern void hl_matrix_select_rows(real* output,
+ int ldo,
+ real* table,
+ int ldt,
int* ids,
int numSamples,
int tableSize,
@@ -53,8 +54,10 @@ extern void hl_matrix_select_rows(real* output, int ldo,
* @param[in] dim width of table.
*
*/
-extern void hl_matrix_add_to_rows(real* table, int ldt,
- real* input, int ldi,
+extern void hl_matrix_add_to_rows(real* table,
+ int ldt,
+ real* input,
+ int ldi,
int* ids,
int numSamples,
int tableSize,
@@ -72,8 +75,7 @@ extern void hl_matrix_add_to_rows(real* table, int ldt,
*
*/
template <class T>
-extern void hl_vector_select_from(T* dst, int sized,
- const T* src, int sizes,
- const int* ids, int sizei);
+extern void hl_vector_select_from(
+ T* dst, int sized, const T* src, int sizes, const int* ids, int sizei);
-#endif /* HL_TABLE_APPLY_H_ */
+#endif /* HL_TABLE_APPLY_H_ */
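
The two row-wise calls above behave like an embedding lookup and its gradient scatter; below is a hedged sketch with the leading dimensions equal to the row width dim (the fully dense case). All buffers are placeholder device pointers.

// Forward: output[i][:] = table[ids[i]][:] for numSamples rows of width dim.
hl_matrix_select_rows(output_d, dim, table_d, dim,
                      ids_d, numSamples, tableSize, dim);
// Backward: table[ids[i]][:] += input[i][:], accumulating repeated ids.
hl_matrix_add_to_rows(table_d, dim, input_d, dim,
                      ids_d, numSamples, tableSize, dim);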
diff --git a/paddle/cuda/include/hl_time.h b/paddle/cuda/include/hl_time.h
index 4414b0b2d2ed4ab6a48294ffaed3a43a639e5950..b0a88c66a12fcfec6ea96b877423f907dac8dfa1 100644
--- a/paddle/cuda/include/hl_time.h
+++ b/paddle/cuda/include/hl_time.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TIME_H_
#define HL_TIME_H_
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h
index a38d4cf862278a060f72b970d723895dc3735d0a..e8cfebbf6a3bd27c10a71d7817238bc304681fa4 100644
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_TOP_K_H_
#define HL_TOP_K_H_
@@ -31,9 +30,11 @@ limitations under the License. */
* @param[in] numSamples height of input value.
*
*/
-extern void hl_matrix_top_k(real* topVal, int ldv,
- int * topIds,
- real* src, int lds,
+extern void hl_matrix_top_k(real* topVal,
+ int ldv,
+ int* topIds,
+ real* src,
+ int lds,
int dim,
int beamSize,
int numSamples);
@@ -50,8 +51,9 @@ extern void hl_matrix_top_k(real* topVal, int ldv,
*
* @note Only supports HL_SPARSE_CSR format.
*/
-extern void hl_sparse_matrix_top_k(real* topVal, int ldv,
- int * topIds,
+extern void hl_sparse_matrix_top_k(real* topVal,
+ int ldv,
+ int* topIds,
hl_sparse_matrix_s src,
int beamSize,
int numSamples);
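
A small usage sketch for the dense variant above, with the leading dimensions taken to equal the row widths: topVal/topIds are numSamples x beamSize, src is numSamples x dim, and all buffers are placeholder device pointers.

const int numSamples = 64, dim = 1000, beamSize = 5;
hl_matrix_top_k(topVal_d, /*ldv=*/beamSize, topIds_d,
                src_d, /*lds=*/dim,
                dim, beamSize, numSamples);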
diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/cuda/include/stub/hl_aggregate_stub.h
index 4c0c68f3c98fe95f01060b82c3a1b9822d2a3715..bb53fc581e09905aa7a9b2d8dfe44b04c677c40a 100644
--- a/paddle/cuda/include/stub/hl_aggregate_stub.h
+++ b/paddle/cuda/include/stub/hl_aggregate_stub.h
@@ -12,29 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AGGREGATE_STUB_H_
#define HL_AGGREGATE_STUB_H_
#include "hl_aggregate.h"
-inline void hl_matrix_row_sum(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_row_max(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_row_min(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_sum(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_max(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_column_min(real *A_d, real *C_d,
- int dimM, int dimN) {}
+inline void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {}
inline void hl_vector_sum(real *A_d, real *C_h, int dimM) {}
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index c6f32ad337705ff938b7b370a4785dc7f4393041..2f73b9671edd3609996aebff2913f5262805f869 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -12,84 +12,134 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CNN_STUB_H_
#define HL_CNN_STUB_H_
#include "hl_cnn.h"
-inline void hl_shrink_col2feature(
- const real * dataCol, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataIm,
- real alpha, real beta) {}
-
-inline void hl_expand_feature2col(
- const real* dataIm, size_t channels,
- size_t height, size_t width,
- size_t blockH, size_t blockW,
- size_t strideH, size_t strideW,
- size_t paddingH, size_t paddingW,
- size_t outputH, size_t outputW,
- real* dataCol) {}
-
-inline void hl_maxpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride) {}
-
-inline void hl_maxpool_backward(
- const int frameCnt, const real* inputData,
- const real* outData, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real scaleA, real scaleB,
- real* targetGrad, const int outStride) {}
-
-inline void hl_avgpool_forward(
- const int frameCnt, const real* inputData,
- const int channels,
- const int height, const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- const int paddingH, const int paddingW,
- real* tgtData, const int tgtStride) {}
-
-inline void hl_avgpool_backward(
- const int frameCnt, const real* outGrad,
- const int channels, const int height,
- const int width,
- const int pooledH, const int pooledW,
- const int sizeX, const int sizeY,
- const int strideH, const int strideW,
- int paddingH, int paddingW,
- real scaleA, real scaleB,
- real* backGrad, const int outStride) {}
-
-inline void hl_CMRNorm_forward(
- size_t frameCnt, const real* in, real* scale, real* out,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta) {}
-
-inline void hl_CMRNorm_backward(
- size_t frameCnt, const real* inV, const real* scale,
- const real* outV, const real* outDiff, real *inDiff,
- size_t channels, size_t height, size_t width, size_t sizeX,
- real alpha, real beta) {}
+inline void hl_shrink_col2feature(const real* dataCol,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataIm,
+ real alpha,
+ real beta) {}
+
+inline void hl_expand_feature2col(const real* dataIm,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t blockH,
+ size_t blockW,
+ size_t strideH,
+ size_t strideW,
+ size_t paddingH,
+ size_t paddingW,
+ size_t outputH,
+ size_t outputW,
+ real* dataCol) {}
+
+inline void hl_maxpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride) {}
+
+inline void hl_maxpool_backward(const int frameCnt,
+ const real* inputData,
+ const real* outData,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real scaleA,
+ real scaleB,
+ real* targetGrad,
+ const int outStride) {}
+
+inline void hl_avgpool_forward(const int frameCnt,
+ const real* inputData,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ const int paddingH,
+ const int paddingW,
+ real* tgtData,
+ const int tgtStride) {}
+
+inline void hl_avgpool_backward(const int frameCnt,
+ const real* outGrad,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooledH,
+ const int pooledW,
+ const int sizeX,
+ const int sizeY,
+ const int strideH,
+ const int strideW,
+ int paddingH,
+ int paddingW,
+ real scaleA,
+ real scaleB,
+ real* backGrad,
+ const int outStride) {}
+
+inline void hl_CMRNorm_forward(size_t frameCnt,
+ const real* in,
+ real* scale,
+ real* out,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta) {}
+
+inline void hl_CMRNorm_backward(size_t frameCnt,
+ const real* inV,
+ const real* scale,
+ const real* outV,
+ const real* outDiff,
+ real* inDiff,
+ size_t channels,
+ size_t height,
+ size_t width,
+ size_t sizeX,
+ real alpha,
+ real beta) {}
inline void hl_bilinear_forward(const real* inData,
const size_t inImgH,
@@ -106,25 +156,33 @@ inline void hl_bilinear_forward(const real* inData,
const real ratioW) {}
inline void hl_bilinear_backward(real* inGrad,
- const size_t inImgH,
- const size_t inImgW,
- const size_t inputH,
- const size_t inputW,
- const real* outGrad,
- const size_t outImgH,
- const size_t outImgW,
- const size_t outputH,
- const size_t outputW,
- const size_t numChannels,
- const real ratioH,
- const real ratioW) {}
-
-inline void hl_maxout_forward(
- const real* inData, real* outData, int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t group) {}
-
-inline void hl_maxout_backward(
- real* inGrad, const real* outGrad, const int* idData,
- size_t batchSize, size_t size, size_t featLen, size_t group) {}
+ const size_t inImgH,
+ const size_t inImgW,
+ const size_t inputH,
+ const size_t inputW,
+ const real* outGrad,
+ const size_t outImgH,
+ const size_t outImgW,
+ const size_t outputH,
+ const size_t outputW,
+ const size_t numChannels,
+ const real ratioH,
+ const real ratioW) {}
+
+inline void hl_maxout_forward(const real* inData,
+ real* outData,
+ int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t group) {}
+
+inline void hl_maxout_backward(real* inGrad,
+ const real* outGrad,
+ const int* idData,
+ size_t batchSize,
+ size_t size,
+ size_t featLen,
+ size_t group) {}
#endif // HL_CNN_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
index 903dcbe8355d6f593d96bc1f9e686d54035a9366..85f7c390c47397127487b16fdc933f0afe2fb880 100644
--- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
@@ -12,41 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUBLAS_STUB_H_
#define HL_CUDA_CUBLAS_STUB_H_
#include "hl_cuda_cublas.h"
-inline void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc) {}
-
-inline void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_inverse(real *A_d,
- real *C_d,
- int dimN,
- int lda,
- int ldc) {}
-
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
- real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc) {}
+inline void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {}
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+inline void hl_matrix_inverse(
+ real *A_d, real *C_d, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
+ real *C_d,
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
#endif // HL_CUDA_CUBLAS_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
index b96804afd86ba5e8c7b7eed7eb768295b4e23096..3beb0e5b5170261a6c453936b8b0347f3e97dbff 100644
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUDNN_STUB_H_
#define HL_CUDA_CUDNN_STUB_H_
#include "hl_cuda_cudnn.h"
-inline int hl_get_cudnn_lib_version() {
- return 0;
-}
+inline int hl_get_cudnn_lib_version() { return 0; }
inline void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {}
@@ -68,41 +65,41 @@ inline void hl_pooling_backward(hl_tensor_descriptor input,
hl_pooling_descriptor pooling) {}
inline void hl_create_filter_descriptor(hl_filter_descriptor* filter,
- int input_feature_maps,
- int output_feature_maps,
- int height,
- int width) {}
+ int input_feature_maps,
+ int output_feature_maps,
+ int height,
+ int width) {}
inline void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {}
inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
- hl_tensor_descriptor image,
- hl_filter_descriptor filter,
- int padding_height,
- int padding_width,
- int stride_height,
- int stride_width) {}
+ hl_tensor_descriptor image,
+ hl_filter_descriptor filter,
+ int padding_height,
+ int padding_width,
+ int stride_height,
+ int stride_width) {}
inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
- hl_tensor_descriptor image,
- hl_filter_descriptor filter,
- int padding_height,
- int padding_width,
- int stride_height,
- int stride_width) {}
+ hl_tensor_descriptor image,
+ hl_filter_descriptor filter,
+ int padding_height,
+ int padding_width,
+ int stride_height,
+ int stride_width) {}
inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
inline void hl_conv_workspace(hl_tensor_descriptor input,
- hl_tensor_descriptor output,
- hl_filter_descriptor filter,
- hl_convolution_descriptor conv,
- int* convFwdAlgo,
- size_t* fwdLimitBytes,
- int* convBwdDataAlgo,
- size_t* bwdDataLimitBytes,
- int* convBwdFilterAlgo,
- size_t* bwdFilterLimitBytes) {}
+ hl_tensor_descriptor output,
+ hl_filter_descriptor filter,
+ hl_convolution_descriptor conv,
+ int* convFwdAlgo,
+ size_t* fwdLimitBytes,
+ int* convBwdDataAlgo,
+ size_t* bwdDataLimitBytes,
+ int* convBwdFilterAlgo,
+ size_t* bwdFilterLimitBytes) {}
inline void hl_convolution_forward(hl_tensor_descriptor input,
real* input_data,
@@ -116,86 +113,84 @@ inline void hl_convolution_forward(hl_tensor_descriptor input,
int convFwdAlgo) {}
inline void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
- real* bias_data,
- hl_tensor_descriptor output,
- real* output_data) {}
-
-inline void hl_convolution_backward_filter(
- hl_tensor_descriptor input,
- real* input_data,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_grad_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdFilterAlgo) {}
-
-inline void hl_convolution_backward_data(
- hl_tensor_descriptor input,
- real* input_data_grad,
- hl_tensor_descriptor output,
- real* output_grad_data,
- hl_filter_descriptor filter,
- real* filter_data,
- hl_convolution_descriptor conv,
- void* gpuWorkSpace,
- size_t sizeInBytes,
- int convBwdDataAlgo) {}
+ real* bias_data,
+ hl_tensor_descriptor output,
+ real* output_data) {}
+
+inline void hl_convolution_backward_filter(hl_tensor_descriptor input,
+ real* input_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_grad_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdFilterAlgo) {}
+
+inline void hl_convolution_backward_data(hl_tensor_descriptor input,
+ real* input_data_grad,
+ hl_tensor_descriptor output,
+ real* output_grad_data,
+ hl_filter_descriptor filter,
+ real* filter_data,
+ hl_convolution_descriptor conv,
+ void* gpuWorkSpace,
+ size_t sizeInBytes,
+ int convBwdDataAlgo) {}
inline void hl_convolution_backward_bias(hl_tensor_descriptor bias,
- real* bias_grad_data,
- hl_tensor_descriptor output,
- real* output_grad_data) {}
+ real* bias_grad_data,
+ hl_tensor_descriptor output,
+ real* output_grad_data) {}
-inline void hl_softmax_forward(real *input,
- real *output,
- int height,
- int width) {}
-
-inline void hl_softmax_backward(real *output_value,
- real *output_grad,
+inline void hl_softmax_forward(real* input,
+ real* output,
int height,
int width) {}
+inline void hl_softmax_backward(real* output_value,
+ real* output_grad,
+ int height,
+ int width) {}
+
inline void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar) {}
+ real* savedMean,
+ real* savedVar) {}
inline void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedVar,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedVar,
double epsilon) {}
inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar) {}
+ real* savedMean,
+ real* savedInvVar) {}
#endif // HL_CUDA_CUDNN_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h
index 675ac03b0e188e9b26038dd4e40264099618e17a..1f91068cdf8b3d472c4b403d1ec7d5293c28c07e 100644
--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_STUB_H_
#define HL_CUDA_STUB_H_
@@ -24,29 +23,25 @@ inline void hl_specify_devices_start(int *device, int number) {}
inline void hl_init(int device) {}
-inline int hl_get_cuda_lib_version(int device) {
- return 0;
-}
+inline int hl_get_cuda_lib_version(int device) { return 0; }
inline void hl_fini() {}
inline void hl_set_sync_flag(bool flag) {}
-inline bool hl_get_sync_flag() {
- return false;
-}
+inline bool hl_get_sync_flag() { return false; }
-inline int hl_get_device_count() { return 0; }
+inline int hl_get_device_count() { return 0; }
inline void hl_set_device(int device) {}
-inline int hl_get_device() { return 0; }
+inline int hl_get_device() { return 0; }
-inline void* hl_malloc_device(size_t size) { return NULL; }
+inline void *hl_malloc_device(size_t size) { return NULL; }
inline void hl_free_mem_device(void *dest_d) {}
-inline void* hl_malloc_host(size_t size) { return NULL; }
+inline void *hl_malloc_host(size_t size) { return NULL; }
inline void hl_free_mem_host(void *dest_h) {}
@@ -64,7 +59,9 @@ inline void hl_rand(real *dest_d, size_t num) {}
inline void hl_srand(unsigned int seed) {}
-inline void hl_memcpy_async(void *dst, void *src, size_t size,
+inline void hl_memcpy_async(void *dst,
+ void *src,
+ size_t size,
hl_stream_t stream) {}
inline void hl_stream_synchronize(hl_stream_t stream) {}
@@ -83,11 +80,11 @@ inline void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {}
inline void hl_event_synchronize(hl_event_t event) {}
-inline int hl_get_device_last_error() { return 0; }
+inline int hl_get_device_last_error() { return 0; }
-inline const char* hl_get_device_error_string() { return NULL; }
+inline const char *hl_get_device_error_string() { return NULL; }
-inline const char* hl_get_device_error_string(size_t err) { return NULL; }
+inline const char *hl_get_device_error_string(size_t err) { return NULL; }
inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/cuda/include/stub/hl_lstm_stub.h
index 2700bef02a5e1e40ee7603ccab7fec754196f8cd..7ccda032d26f2fbbe99136e8481416daea557a78 100644
--- a/paddle/cuda/include/stub/hl_lstm_stub.h
+++ b/paddle/cuda/include/stub/hl_lstm_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_LSTM_STUB_H_
#define HL_LSTM_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 76cac2e57769301fee2e5979e2685976daf35441..1bd78d23fbaf46e6265ba0db25ea399a204bd96f 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_MATRIX_STUB_H_
#define HL_MATRIX_STUB_H_
@@ -26,48 +25,30 @@ inline void hl_matrix_add(real* A_d,
real alpha,
real beta) {}
-inline void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {}
+inline void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {}
-inline void hl_sequence_softmax_forward(real *A_d,
- real *C_d,
+inline void hl_sequence_softmax_forward(real* A_d,
+ real* C_d,
const int* index,
int numSequence) {}
-inline void hl_matrix_softmax_derivative(real* grad_d,
- real* output_d,
- real* sftmaxSum_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_classification_error(real* A_d,
- int* B_d,
- real* C_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_cross_entropy(real* A_d,
- real* C_d,
- int* label_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_cross_entropy_bp(real* grad_d,
- real* output_d,
- int* label_d,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy(real* output,
- real* entropy,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy_bp(real* output,
- real* grad,
- hl_sparse_matrix_s mat,
- int dimM,
- int dimN) {}
+inline void hl_matrix_softmax_derivative(
+ real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
+
+inline void hl_matrix_classification_error(
+ real* A_d, int* B_d, real* C_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy(
+ real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy_bp(
+ real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy(
+ real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy_bp(
+ real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN) {}
inline void hl_matrix_zero_mem(real* data, int num) {}
@@ -101,7 +82,6 @@ inline void hl_cossim(real* output,
int input2_height,
real scale) {}
-
inline void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index aabd956c37f7dce48a379b995ab88a53aa65c760..381f0a6f26c5669465f029e972c6ca8b0e6e1776 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SEQUENCE_STUB_H_
#define HL_SEQUENCE_STUB_H_
@@ -21,15 +20,12 @@ limitations under the License. */
inline void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
- int *index,
+ int* index,
int numSequences,
int dim) {}
-inline void hl_max_sequence_backward(real* outputGrad,
- int *index,
- real* inputGrad,
- int numSequences,
- int dim) {}
+inline void hl_max_sequence_backward(
+ real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
inline void hl_context_projection_forward(real* input,
const int* sequence,
@@ -60,16 +56,16 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
int contextStart,
int beginPad) {}
-inline void hl_sequence2batch_copy(real *batch,
- real *sequence,
- const int *batchIndex,
+inline void hl_sequence2batch_copy(real* batch,
+ real* sequence,
+ const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
-inline void hl_sequence2batch_add(real *batch,
- real *sequence,
- int *batchIndex,
+inline void hl_sequence2batch_add(real* batch,
+ real* sequence,
+ int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {}
diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/cuda/include/stub/hl_sparse_stub.h
index 346a1900dda5825e9a4311a2c51e8a50e6e7df0b..d47bdd2c47d097c4c68b7b7e88ef888bc18270c2 100644
--- a/paddle/cuda/include/stub/hl_sparse_stub.h
+++ b/paddle/cuda/include/stub/hl_sparse_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_SPARSE_STUB_H_
#define HL_SPARSE_STUB_H_
@@ -20,7 +19,7 @@ limitations under the License. */
inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
@@ -28,20 +27,20 @@ inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
inline void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {}
inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- void * dest_d,
+ void *dest_d,
size_t size,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
- real* value_d,
- int* rows_d,
- int* cols_d,
+ real *value_d,
+ int *rows_d,
+ int *cols_d,
hl_matrix_format_t format,
- hl_matrix_value_t value_type,
+ hl_matrix_value_t value_type,
int dimM,
int dimN,
int nnz) {}
@@ -87,10 +86,14 @@ inline void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
inline void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
inline void hl_matrix_dense_mul_csc(real *A_d,
hl_trans_op_t transa,
@@ -103,18 +106,27 @@ inline void hl_matrix_dense_mul_csc(real *A_d,
real alpha,
real beta) {}
-inline void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+inline void hl_sparse_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
hl_sparse_matrix_s C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
-inline void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+inline void hl_matrix_dense_mul_csr(real *A_d,
+ hl_trans_op_t transa,
hl_sparse_matrix_s B_d,
hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {}
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {}
inline void hl_memcpy_from_csc_matrix(real *csc_val,
size_t val_size,
@@ -134,49 +146,39 @@ inline void hl_memcpy_from_csr_matrix(real *csr_val,
hl_sparse_matrix_s csr_matrix,
hl_stream_t stream) {}
-inline void hl_sparse_matrix_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale) {}
+inline void hl_sparse_matrix_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
-inline void hl_matrix_csr_column_sum(real* A_d,
- hl_sparse_matrix_s B_d,
- int dimM,
- int dimN,
- real scale) {}
+inline void hl_matrix_csr_column_sum(
+ real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
inline void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale) {}
inline void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
real scale) {}
inline void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {}
inline void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
- real* B_d,
+ real *B_d,
int dimM,
int dimN,
real alpha,
real beta) {}
-inline int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
- return NULL;
-}
+inline int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { return NULL; }
-inline int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
- return NULL;
-}
+inline int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { return NULL; }
-inline real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
+inline real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
return NULL;
}
diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h
index 2922d4dc2937662d66fb2433f4883448ba21fa3f..2412ed5abc13b2a83521a75524f581e106788b60 100644
--- a/paddle/cuda/src/avx_mathfun.h
+++ b/paddle/cuda/src/avx_mathfun.h
@@ -32,32 +32,35 @@
#include <immintrin.h>
/* yes I know, the top of this file is quite ugly */
-# define ALIGN32_BEG
-# define ALIGN32_END __attribute__((aligned(32)))
+#define ALIGN32_BEG
+#define ALIGN32_END __attribute__((aligned(32)))
/* __m128 is ugly to write */
-typedef __m256 v8sf; // vector of 8 float (avx)
-typedef __m256i v8si; // vector of 8 int (avx)
-typedef __m128i v4si; // vector of 8 int (avx)
+typedef __m256 v8sf; // vector of 8 float (avx)
+typedef __m256i v8si; // vector of 8 int (avx)
+typedef __m128i v4si; // vector of 8 int (avx)
-#define _PI32AVX_CONST(Name, Val) \
- static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }
+#define _PI32AVX_CONST(Name, Val) \
+ static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \
+ Val, Val, Val, Val}
_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);
-
/* declare some AVX constants -- why can't I figure a better way to do that? */
-#define _PS256_CONST(Name, Val) \
- static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PI32_CONST256(Name, Val) \
- static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PS256_CONST_TYPE(Name, Type, Val) \
- static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-
-_PS256_CONST(1 , 1.0f);
+#define _PS256_CONST(Name, Val) \
+ static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PI32_CONST256(Name, Val) \
+ static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PS256_CONST_TYPE(Name, Type, Val) \
+ static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
+ Val, Val, Val, Val, Val, Val, Val, Val}
+
+_PS256_CONST(1, 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
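The following standalone sketch is not part of the patch and uses an invented MY_PS256_CONST name; it only illustrates the pattern these macros expand to: a 32-byte-aligned array of eight identical floats that is later reloaded as a single __m256 register through a pointer cast, as the code below does with *(v8sf *)_ps256_0p5 and friends.

#include <immintrin.h>
#include <cstdio>

// Same shape as _PS256_CONST: eight copies of the value, 32-byte aligned.
#define MY_PS256_CONST(Name, Val) \
  static const float _my_ps256_##Name[8] __attribute__((aligned(32))) = { \
      Val, Val, Val, Val, Val, Val, Val, Val}

MY_PS256_CONST(half, 0.5f);

int main() {
  __m256 half = *(__m256 *)_my_ps256_half;  // reload the array as one AVX vector
  float out[8];
  _mm256_storeu_ps(out, half);
  std::printf("%f\n", out[0]);  // 0.500000
  return 0;
}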
@@ -76,14 +79,14 @@ _PI32_CONST256(0x7f, 0x7f);
_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
-_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
-_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
-_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
-_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
-_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
-_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
-_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
+_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
+_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
+_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
+_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
+_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);
@@ -94,50 +97,51 @@ typedef union imm_xmm_union {
v4si xmm[2];
} imm_xmm_union;
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
- imm_xmm_union u __attribute__((aligned(32))); \
- u.imm = imm_; \
- xmm0_ = u.xmm[0]; \
- xmm1_ = u.xmm[1]; \
-}
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
+ { \
imm_xmm_union u __attribute__((aligned(32))); \
- u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
+ u.imm = imm_; \
+ xmm0_ = u.xmm[0]; \
+ xmm1_ = u.xmm[1]; \
}
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
+ { \
+ imm_xmm_union u __attribute__((aligned(32))); \
+ u.xmm[0] = xmm0_; \
+ u.xmm[1] = xmm1_; \
+ imm_ = u.imm; \
+ }
-#define AVX2_BITOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, int a) \
-{ \
- /* use SSE2 instruction to perform the bitop AVX2 */ \
- v4si x1, x2; \
- v8si ret; \
- COPY_IMM_TO_XMM(x, x1, x2); \
- x1 = _mm_##fn(x1,a); \
- x2 = _mm_##fn(x2,a); \
- COPY_XMM_TO_IMM(x1, x2, ret); \
- return(ret); \
-}
+#define AVX2_BITOP_USING_SSE2(fn) \
+ static inline v8si avx2_mm256_##fn(v8si x, int a) { \
+ /* use SSE2 instruction to perform the bitop AVX2 */ \
+ v4si x1, x2; \
+ v8si ret; \
+ COPY_IMM_TO_XMM(x, x1, x2); \
+ x1 = _mm_##fn(x1, a); \
+ x2 = _mm_##fn(x2, a); \
+ COPY_XMM_TO_IMM(x1, x2, ret); \
+ return (ret); \
+ }
//#warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)
-#define AVX2_INTOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, v8si y) \
-{ \
- /* use SSE2 instructions to perform the AVX2 integer operation */ \
- v4si x1, x2; \
- v4si y1, y2; \
- v8si ret; \
- COPY_IMM_TO_XMM(x, x1, x2); \
- COPY_IMM_TO_XMM(y, y1, y2); \
- x1 = _mm_##fn(x1,y1); \
- x2 = _mm_##fn(x2,y2); \
- COPY_XMM_TO_IMM(x1, x2, ret); \
- return(ret); \
-}
+#define AVX2_INTOP_USING_SSE2(fn) \
+ static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \
+ /* use SSE2 instructions to perform the AVX2 integer operation */ \
+ v4si x1, x2; \
+ v4si y1, y2; \
+ v8si ret; \
+ COPY_IMM_TO_XMM(x, x1, x2); \
+ COPY_IMM_TO_XMM(y, y1, y2); \
+ x1 = _mm_##fn(x1, y1); \
+ x2 = _mm_##fn(x2, y2); \
+ COPY_XMM_TO_IMM(x1, x2, ret); \
+ return (ret); \
+ }
//#warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
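A hedged sketch, not part of the patch, of the emulation idea behind AVX2_BITOP_USING_SSE2 and AVX2_INTOP_USING_SSE2 (the union and function names here are invented): the 256-bit integer vector is viewed as two 128-bit halves, each half is processed with the SSE2 intrinsic, and the halves are reassembled.

#include <immintrin.h>
#include <cstdio>

union Imm256 {
  __m256i imm;     // whole 256-bit vector
  __m128i xmm[2];  // the same bits as two 128-bit halves
};

// Type punning through the union mirrors the original macros (a GNU extension).
static inline __m256i add_epi32_via_sse2(__m256i x, __m256i y) {
  Imm256 ux, uy, ur;
  ux.imm = x;
  uy.imm = y;
  ur.xmm[0] = _mm_add_epi32(ux.xmm[0], uy.xmm[0]);  // low half with SSE2
  ur.xmm[1] = _mm_add_epi32(ux.xmm[1], uy.xmm[1]);  // high half with SSE2
  return ur.imm;
}

int main() {
  __m256i a = _mm256_set1_epi32(3), b = _mm256_set1_epi32(4);
  int out[8];
  _mm256_storeu_si256((__m256i *)out, add_epi32_via_sse2(a, b));
  std::printf("%d\n", out[0]);  // 7
  return 0;
}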
@@ -157,84 +161,83 @@ AVX2_INTOP_USING_SSE2(add_epi32)
#define avx2_mm256_add_epi32 _mm256_add_epi32
#endif /* __AVX2__ */
-
-/* natural logarithm computed for 8 simultaneous float
+/* natural logarithm computed for 8 simultaneous float
return NaN for x <= 0
*/
v8sf log256_ps(v8sf x) {
v8si imm0;
- v8sf one = *(v8sf*)_ps256_1;
+ v8sf one = *(v8sf *)_ps256_1;
- //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
+ // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
- x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */
+ x = _mm256_max_ps(
+ x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
// can be done with AVX2
imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
/* keep only the fractional part */
- x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
- x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);
+ x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
+ x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
// this is again another AVX2 instruction
- imm0 = avx2_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+ imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
v8sf e = _mm256_cvtepi32_ps(imm0);
e = _mm256_add_ps(e, one);
- /* part2:
+ /* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
- //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
- v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+ // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
+ v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
v8sf tmp = _mm256_and_ps(x, mask);
x = _mm256_sub_ps(x, one);
e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
x = _mm256_add_ps(x, tmp);
- v8sf z = _mm256_mul_ps(x,x);
+ v8sf z = _mm256_mul_ps(x, x);
- v8sf y = *(v8sf*)_ps256_cephes_log_p0;
+ v8sf y = *(v8sf *)_ps256_cephes_log_p0;
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
y = _mm256_mul_ps(y, x);
y = _mm256_mul_ps(y, z);
-
- tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
- y = _mm256_add_ps(y, tmp);
+ tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+ y = _mm256_add_ps(y, tmp);
- tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+ tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
y = _mm256_sub_ps(y, tmp);
- tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
+ tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
x = _mm256_add_ps(x, y);
x = _mm256_add_ps(x, tmp);
- x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
+ x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
return x;
}
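For reference only (not taken from the patch), this is the scalar range reduction that log256_ps vectorises: x is split into a mantissa in [0.5, 1) and an exponent, and log(x) = e*ln(2) + log(m); the AVX code then evaluates log(m) with the cephes_log_p* polynomial, which std::log stands in for here.

#include <cmath>
#include <cstdio>

static float log_by_range_reduction(float x) {
  int e;
  float m = std::frexp(x, &e);                  // x = m * 2^e with 0.5 <= m < 1
  return e * 0.693147180559945f + std::log(m);  // log(x) = e*ln(2) + log(m)
}

int main() {
  std::printf("%f %f\n", log_by_range_reduction(10.0f), std::log(10.0f));
  return 0;
}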
-_PS256_CONST(exp_hi, 88.3762626647949f);
-_PS256_CONST(exp_lo, -88.3762626647949f);
+_PS256_CONST(exp_hi, 88.3762626647949f);
+_PS256_CONST(exp_lo, -88.3762626647949f);
_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
@@ -250,45 +253,45 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
v8sf exp256_ps(v8sf x) {
v8sf tmp = _mm256_setzero_ps(), fx;
v8si imm0;
- v8sf one = *(v8sf*)_ps256_1;
+ v8sf one = *(v8sf *)_ps256_1;
- x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
- x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);
+ x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
+ x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
- fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
- fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);
+ fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
+ fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
/* how to perform a floorf with SSE: just below */
- //imm0 = _mm256_cvttps_epi32(fx);
- //tmp = _mm256_cvtepi32_ps(imm0);
-
+ // imm0 = _mm256_cvttps_epi32(fx);
+ // tmp = _mm256_cvtepi32_ps(imm0);
+
tmp = _mm256_floor_ps(fx);
/* if greater, substract 1 */
- //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
- v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
+ // v8sf mask = _mm256_cmpgt_ps(tmp, fx);
+ v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
- tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
- v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
+ tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
+ v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
- z = _mm256_mul_ps(x,x);
-
- v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
+ z = _mm256_mul_ps(x, x);
+
+ v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
y = _mm256_mul_ps(y, x);
- y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
+ y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);
@@ -296,7 +299,7 @@ v8sf exp256_ps(v8sf x) {
/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
// another two AVX2 instructions
- imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+ imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
imm0 = avx2_mm256_slli_epi32(imm0, 23);
v8sf pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
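Again for reference rather than from the patch: the scalar form of the range reduction exp256_ps performs. exp(x) is rewritten as 2^n * exp(g) with n = floor(x*log2(e) + 0.5) and g = x - n*ln(2), where ln(2) is split into the two cephes_exp_C1/C2 constants for precision; std::exp stands in for the degree-5 polynomial.

#include <cmath>
#include <cstdio>

static float exp_by_range_reduction(float x) {
  float n = std::floor(x * 1.44269504088896341f + 0.5f);  // n = round(x / ln(2))
  float g = x - n * 0.693359375f + n * 2.12194440e-4f;    // g = x - n*ln(2), split constant
  return std::ldexp(std::exp(g), static_cast<int>(n));    // 2^n * exp(g)
}

int main() {
  std::printf("%f %f\n", exp_by_range_reduction(2.5f), std::exp(2.5f));
  return 0;
}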
@@ -307,13 +310,12 @@ _PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
-_PS256_CONST(sincof_p1, 8.3321608736E-3);
+_PS256_CONST(sincof_p1, 8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
-_PS256_CONST(coscof_p0, 2.443315711809948E-005);
+_PS256_CONST(coscof_p0, 2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
-_PS256_CONST(coscof_p2, 4.166664568298827E-002);
-_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
-
+_PS256_CONST(coscof_p2, 4.166664568298827E-002);
+_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 8 sines at onces using AVX intrisics
@@ -327,7 +329,7 @@ _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
surprising but correct result.
*/
-v8sf sin256_ps(v8sf x) { // any x
+v8sf sin256_ps(v8sf x) { // any x
v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
v8si imm0, imm2;
@@ -338,78 +340,78 @@ v8sf sin256_ps(v8sf x) { // any x
sign_bit = x;
/* take the absolute value */
- x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
+ x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
/* extract the sign bit (upper one) */
- sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
-
+ sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+
/* scale by 4/Pi */
- y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
+ y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
- /*
- Here we start a series of integer operations, which are in the
- realm of AVX2.
- If we don't have AVX, let's perform them using SSE2 directives
- */
+/*
+ Here we start a series of integer operations, which are in the
+ realm of AVX2.
+ If we don't have AVX, let's perform them using SSE2 directives
+*/
#ifdef __AVX2__
/* store the integer part of y in mm0 */
imm2 = _mm256_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
// another two AVX2 instruction
- imm2 = avx2_mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
- imm2 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
+ imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+ imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
y = _mm256_cvtepi32_ps(imm2);
/* get the swap sign flag */
- imm0 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
+ imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
imm0 = avx2_mm256_slli_epi32(imm0, 29);
- /* get the polynom selection mask
+ /* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
and another one for Pi/4 < x <= Pi/2
#include "hl_functions.h"
namespace hppl {
- extern __m256 exp(__m256 a);
+extern __m256 exp(__m256 a);
- __m256 relu(const __m256 a) {
- __m256 tmp = _mm256_set1_ps(0.0f);
- return _mm256_max_ps(a, tmp);
- }
+__m256 relu(const __m256 a) {
+ __m256 tmp = _mm256_set1_ps(0.0f);
+ return _mm256_max_ps(a, tmp);
+}
- __m256 sigmoid(const __m256 a) {
- __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
- __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
- __m256 tmp = _mm256_max_ps(a, min);
- tmp = _mm256_min_ps(tmp, max);
- tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
- tmp = exp(tmp);
- tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
- tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
- return tmp;
- }
+__m256 sigmoid(const __m256 a) {
+ __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+ __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+ __m256 tmp = _mm256_max_ps(a, min);
+ tmp = _mm256_min_ps(tmp, max);
+ tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
+ tmp = exp(tmp);
+ tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
+ tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
+ return tmp;
+}
- __m256 tanh(const __m256 a) {
- __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
- __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
- tmp = _mm256_min_ps(tmp, max);
- tmp = exp(tmp);
- return _mm256_sub_ps(
- _mm256_div_ps(_mm256_set1_ps(2.0f),
- _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), _mm256_set1_ps(1.0f));
- }
+__m256 tanh(const __m256 a) {
+ __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
+ __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+ tmp = _mm256_min_ps(tmp, max);
+ tmp = exp(tmp);
+ return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
+ _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
+ _mm256_set1_ps(1.0f));
+}
- __m256 linear(const __m256 a) {
- return a;
- }
+__m256 linear(const __m256 a) { return a; }
- __m256 relu(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(a,
+__m256 relu(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(
+ a,
_mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
- _mm256_set1_ps(1.0f)));
- }
+ _mm256_set1_ps(1.0f)));
+}
- __m256 sigmoid(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(_mm256_mul_ps(a, b),
- _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
- }
+__m256 sigmoid(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(_mm256_mul_ps(a, b),
+ _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
+}
- __m256 tanh(const __m256 a, const __m256 b) {
- return _mm256_mul_ps(a,
- _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
- }
+__m256 tanh(const __m256 a, const __m256 b) {
+ return _mm256_mul_ps(
+ a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
+}
- __m256 linear(const __m256 a, const __m256 b) {
- return a;
- }
+__m256 linear(const __m256 a, const __m256 b) { return a; }
} // namespace hppl
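A usage-style sketch, not part of the patch, of how an element-wise AVX activation such as the relu above is typically applied over a buffer, eight floats per step; tail handling and the hppl plumbing are omitted, and the buffer length is assumed to be a multiple of 8.

#include <immintrin.h>
#include <cstdio>

static void relu_inplace(float *data, int n) {
  __m256 zero = _mm256_setzero_ps();
  for (int i = 0; i < n; i += 8) {
    __m256 v = _mm256_loadu_ps(data + i);                // load 8 floats
    _mm256_storeu_ps(data + i, _mm256_max_ps(v, zero));  // max(x, 0), as in relu above
  }
}

int main() {
  float buf[8] = {-2.f, -1.f, -0.5f, 0.f, 0.5f, 1.f, 2.f, 3.f};
  relu_inplace(buf, 8);
  std::printf("%f %f\n", buf[0], buf[7]);  // 0.000000 3.000000
  return 0;
}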
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index b8352c2d537fba5ec9cd3237fe8f3fa9c25cbffe..af00f352e536bf342e15315d1f6804225b87eb0b 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -12,46 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include <math.h>
#include "hl_functions.h"
namespace hppl {
- real relu(const real a) {
- return a > 0.0f ? a : 0.0f;
- }
-
- real sigmoid(const real a) {
- const real min = SIGMOID_THRESHOLD_MIN;
- const real max = SIGMOID_THRESHOLD_MAX;
- real tmp = (a < min) ? min : ((a > max) ? max : a);
- return 1.0 / (1.0 + exp(-tmp));
- }
-
- real tanh(const real a) {
- real tmp = -2.0 * a;
- tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
- return (2.0 / (1.0 + exp(tmp))) - 1.0;
- }
-
- real linear(const real a) {
- return a;
- }
-
- real relu(const real a, const real b) {
- return a * (b > 0.0f ? 1.0f : 0.0f);
- }
-
- real sigmoid(const real a, const real b) {
- return a * b * (1 - b);
- }
-
- real tanh(const real a, const real b) {
- return a * (1.0f - b * b);
- }
-
- real linear(const real a, const real b) {
- return a;
- }
+real relu(const real a) { return a > 0.0f ? a : 0.0f; }
+
+real sigmoid(const real a) {
+ const real min = SIGMOID_THRESHOLD_MIN;
+ const real max = SIGMOID_THRESHOLD_MAX;
+ real tmp = (a < min) ? min : ((a > max) ? max : a);
+ return 1.0 / (1.0 + exp(-tmp));
+}
+
+real tanh(const real a) {
+ real tmp = -2.0 * a;
+ tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+ return (2.0 / (1.0 + exp(tmp))) - 1.0;
+}
+
+real linear(const real a) { return a; }
+
+real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); }
+
+real sigmoid(const real a, const real b) { return a * b * (1 - b); }
+
+real tanh(const real a, const real b) { return a * (1.0f - b * b); }
+
+real linear(const real a, const real b) { return a; }
} // namespace hppl
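A small host-side check, not from the patch, of what the two-argument overloads above compute: given an upstream gradient a and the forward output b, sigmoid(a, b) = a*b*(1-b) is the backward value, since d/dx sigmoid(x) = b*(1-b) when b = sigmoid(x); a finite difference confirms it.

#include <cmath>
#include <cstdio>

int main() {
  double x = 0.3;
  double b = 1.0 / (1.0 + std::exp(-x));  // forward sigmoid output
  double a = 1.0;                         // upstream gradient
  double backward = a * b * (1.0 - b);    // matches sigmoid(a, b) above
  double eps = 1e-6;                      // finite-difference reference
  double fd = (1.0 / (1.0 + std::exp(-(x + eps))) -
               1.0 / (1.0 + std::exp(-(x - eps)))) / (2.0 * eps);
  std::printf("%.8f %.8f\n", backward, fd);  // the two values agree closely
  return 0;
}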
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index abf6afadc218f615dc6b3cf734d09f072214be40..e8ba232d44b3f66254d4749d4abbcfbe46d1fd0e 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include "hl_cuda.h"
@@ -24,7 +23,7 @@ limitations under the License. */
namespace dynload {
std::once_flag cublas_dso_flag;
-void* cublas_dso_handle = nullptr;
+void *cublas_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -34,31 +33,30 @@ void* cublas_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- cublasStatus_t operator()(Args... args) { \
- typedef cublasStatus_t (*cublasFunc)(Args...); \
- std::call_once(cublas_dso_flag, GetCublasDsoHandle, \
- &cublas_dso_handle); \
- void* p_##__name = dlsym(cublas_dso_handle, #__name); \
- return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
- } \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ cublasStatus_t operator()(Args... args) { \
+ typedef cublasStatus_t (*cublasFunc)(Args...); \
+ std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \
+ void *p_##__name = dlsym(cublas_dso_handle, #__name); \
+ return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
+ } \
} __name; // struct DynLoad__##__name
#else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- cublasStatus_t operator()(Args... args) { \
- return __name(args...); \
- } \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ cublasStatus_t operator()(Args... args) { \
+ return __name(args...); \
+ } \
} __name; // struct DynLoad__##__name
#endif
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
- DYNAMIC_LOAD_CUBLAS_WRAP(__name)
+#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
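A hedged, self-contained sketch of the lazy-loading pattern the wrapper macros above generate; this is not Paddle code, the library, symbol, and helper names are placeholders (Linux's libm is used as the DSO), and error handling is omitted. The idea: open the shared object once under std::call_once, resolve the symbol with dlsym, cast it to the right function type, and forward the arguments.

#include <dlfcn.h>
#include <mutex>
#include <cstdio>

static std::once_flag g_dso_flag;
static void *g_dso_handle = nullptr;

template <typename... Args>
double call_cos(Args... args) {
  std::call_once(g_dso_flag, [] {
    g_dso_handle = dlopen("libm.so.6", RTLD_LAZY);  // open the DSO exactly once
  });
  typedef double (*func_t)(Args...);
  void *p_cos = dlsym(g_dso_handle, "cos");         // resolve the symbol by name
  return reinterpret_cast<func_t>(p_cos)(args...);  // cast and forward the call
}

int main() {
  std::printf("%f\n", call_cos(0.0));  // 1.000000
  return 0;
}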
// include all needed cublas functions in HPPL
+// clang-format off
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSgemv) \
__macro(cublasDgemv) \
@@ -88,41 +86,41 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
} /* namespace dynload */
-
+// clang-format on
#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM dynload::cublasSgeam
-#define CUBLAS_GEMV dynload::cublasSgemv
-#define CUBLAS_GEMM dynload::cublasSgemm
-#define CUBLAS_GETRF dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI dynload::cublasSgetriBatched
+#define CUBLAS_GEAM dynload::cublasSgeam
+#define CUBLAS_GEMV dynload::cublasSgemv
+#define CUBLAS_GEMM dynload::cublasSgemm
+#define CUBLAS_GETRF dynload::cublasSgetrfBatched
+#define CUBLAS_GETRI dynload::cublasSgetriBatched
#else
-#define CUBLAS_GEAM dynload::cublasDgeam
-#define CUBLAS_GEMV dynload::cublasDgemv
-#define CUBLAS_GEMM dynload::cublasDgemm
-#define CUBLAS_GETRF dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI dynload::cublasDgetriBatched
+#define CUBLAS_GEAM dynload::cublasDgeam
+#define CUBLAS_GEMV dynload::cublasDgemv
+#define CUBLAS_GEMM dynload::cublasDgemm
+#define CUBLAS_GETRF dynload::cublasDgetrfBatched
+#define CUBLAS_GETRI dynload::cublasDgetriBatched
#endif
-const char* hl_cublas_get_error_string(cublasStatus_t status) {
+const char *hl_cublas_get_error_string(cublasStatus_t status) {
switch (status) {
- case CUBLAS_STATUS_NOT_INITIALIZED:
- return "[cublas status]: not initialized";
- case CUBLAS_STATUS_ALLOC_FAILED:
- return "[cublas status]: allocate failed";
- case CUBLAS_STATUS_INVALID_VALUE:
- return "[cublas status]: invalid value";
- case CUBLAS_STATUS_ARCH_MISMATCH:
- return "[cublas status]: arch mismatch";
- case CUBLAS_STATUS_MAPPING_ERROR:
- return "[cublas status]: mapping error";
- case CUBLAS_STATUS_EXECUTION_FAILED:
- return "[cublas status]: execution failed";
- case CUBLAS_STATUS_INTERNAL_ERROR:
- return "[cublas status]: internal error";
- case CUBLAS_STATUS_SUCCESS:
- return "[cublas status]: success";
- default:
- return "[cublas status]: unknown error";
+ case CUBLAS_STATUS_NOT_INITIALIZED:
+ return "[cublas status]: not initialized";
+ case CUBLAS_STATUS_ALLOC_FAILED:
+ return "[cublas status]: allocate failed";
+ case CUBLAS_STATUS_INVALID_VALUE:
+ return "[cublas status]: invalid value";
+ case CUBLAS_STATUS_ARCH_MISMATCH:
+ return "[cublas status]: arch mismatch";
+ case CUBLAS_STATUS_MAPPING_ERROR:
+ return "[cublas status]: mapping error";
+ case CUBLAS_STATUS_EXECUTION_FAILED:
+ return "[cublas status]: execution failed";
+ case CUBLAS_STATUS_INTERNAL_ERROR:
+ return "[cublas status]: internal error";
+ case CUBLAS_STATUS_SUCCESS:
+ return "[cublas status]: success";
+ default:
+ return "[cublas status]: unknown error";
}
}
@@ -131,27 +129,21 @@ const char* hl_cublas_get_error_string(cublasStatus_t status) {
* support << operator for more details error info.
*/
cublasStatus_t g_cublasStat;
-#define CHECK_CUBLAS(cublas_func) \
- g_cublasStat = cublas_func; \
- CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
- << "Cublas Error: " \
- << hl_cublas_get_error_string(g_cublasStat) \
- << " "
+#define CHECK_CUBLAS(cublas_func) \
+ g_cublasStat = cublas_func; \
+ CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
+ << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " "
void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) {
CHECK_CUBLAS(dynload::cublasCreate(cublas_handle))
- << "[cublas init] Cublas create handle faild!";
+ << "[cublas init] Cublas create handle faild!";
CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
- << "[cublas init] Cublas set stream faild!";
+ << "[cublas init] Cublas set stream faild!";
}
-void hl_matrix_transpose(real *A_d,
- real *C_d,
- int dimM,
- int dimN,
- int lda,
- int ldc) {
+void hl_matrix_transpose(
+ real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {
real alpha = 1.0;
real beta = 0.0;
@@ -159,11 +151,18 @@ void hl_matrix_transpose(real *A_d,
CHECK_NOTNULL(C_d);
CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle,
- CUBLAS_OP_T, CUBLAS_OP_N,
- dimM, dimN,
- &alpha, A_d, lda,
- &beta, nullptr, dimM,
- C_d, ldc));
+ CUBLAS_OP_T,
+ CUBLAS_OP_N,
+ dimM,
+ dimN,
+ &alpha,
+ A_d,
+ lda,
+ &beta,
+ nullptr,
+ dimM,
+ C_d,
+ ldc));
CHECK_SYNC("hl_matrix_transpose failed");
}
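For orientation (not from the patch): the GEAM call above requests C = alpha*op(A) + beta*B with op = transpose, alpha = 1 and beta = 0, i.e. a plain out-of-place transpose with caller-supplied leading dimensions; a row-major CPU equivalent looks like this.

#include <cstdio>

// C (dimN x dimM, leading dimension ldc) = transpose of A (dimM x dimN, leading dimension lda).
static void transpose(const float *A, float *C, int dimM, int dimN, int lda, int ldc) {
  for (int i = 0; i < dimM; ++i)
    for (int j = 0; j < dimN; ++j)
      C[j * ldc + i] = A[i * lda + j];  // C(j, i) = A(i, j)
}

int main() {
  const float A[2 * 3] = {1, 2, 3, 4, 5, 6};  // 2x3, row-major
  float C[3 * 2] = {0};
  transpose(A, C, 2, 3, 3, 2);
  std::printf("%g %g\n", C[0], C[1]);  // 1 4: first row of the 3x2 result
  return 0;
}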
@@ -188,13 +187,13 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
small-sized matrices. There may be a better way to reconstruct
the API for better performance.
*/
- CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
- dimN, inout_d, lda, pivot_d, info_d, 1));
+ CHECK_CUBLAS(
+ CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1));
int info_h;
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
- LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
+ LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
}
/* Step 2: Compute the inverse of the matrix given its LU decomposition */
@@ -203,12 +202,18 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
hl_memcpy(out_d, out_h, sizeof(real *));
CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
- dimN, (const real **)inout_d, lda, pivot_d,
- out_d, ldc, info_d, 1));
+ dimN,
+ (const real **)inout_d,
+ lda,
+ pivot_d,
+ out_d,
+ ldc,
+ info_d,
+ 1));
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
- LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
+ LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
}
hl_free_mem_device(inout_d);
@@ -218,12 +223,19 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
CHECK_SYNC("hl_matrix_inverse failed");
}
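A toy host-side illustration, not part of the patch, of the two-step flow hl_matrix_inverse drives on the GPU: getrfBatched produces an LU factorization, and getriBatched reconstructs the inverse from the factors. Here the same two steps are done by hand for a 2x2 matrix that needs no pivoting.

#include <cstdio>

int main() {
  // Step 1: LU factorization of A = [[4, 3], [6, 3]] (Doolittle, no pivoting needed).
  double a11 = 4, a12 = 3, a21 = 6, a22 = 3;
  double l21 = a21 / a11;       // L = [[1, 0], [l21, 1]]
  double u11 = a11, u12 = a12;  // U = [[u11, u12], [0, u22]]
  double u22 = a22 - l21 * a12;
  // Step 2: solve A * X = I column by column with forward/backward substitution.
  double inv[2][2];
  for (int col = 0; col < 2; ++col) {
    double y0 = (col == 0);             // L * y = e_col
    double y1 = (col == 1) - l21 * y0;
    double x1 = y1 / u22;               // U * x = y
    double x0 = (y0 - u12 * x1) / u11;
    inv[0][col] = x0;
    inv[1][col] = x1;
  }
  // Expected inverse: [[-0.5, 0.5], [1, -0.666...]].
  std::printf("%g %g / %g %g\n", inv[0][0], inv[0][1], inv[1][0], inv[1][1]);
  return 0;
}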
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta,
- int lda, int ldb, int ldc) {
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta,
+ int lda,
+ int ldb,
+ int ldc) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
@@ -231,8 +243,8 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) {
int m = (transa == HPPL_OP_N) ? dimM : dimK;
int n = (transa == HPPL_OP_N) ? dimK : dimM;
- hl_matrix_mul_vector(A_d, transa, B_d, C_d, m, n,
- alpha, beta, lda, ldb, ldc);
+ hl_matrix_mul_vector(
+ A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc);
return;
}
@@ -240,8 +252,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
int m = (transb == HPPL_OP_N) ? dimK : dimN;
int n = (transb == HPPL_OP_N) ? dimN : dimK;
hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N;
- hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n,
- alpha, beta, ldb, 1, 1);
+ hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1);
return;
}
@@ -250,26 +261,47 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) {
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_N,
CUBLAS_OP_T,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) {
stat = CUBLAS_GEMM(t_resource.handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
- dimN, dimM, dimK,
- &alpha, B_d, ldb,
- A_d, lda,
- &beta, C_d, ldc);
+ dimN,
+ dimM,
+ dimK,
+ &alpha,
+ B_d,
+ ldb,
+ A_d,
+ lda,
+ &beta,
+ C_d,
+ ldc);
} else {
LOG(FATAL) << "parameter transa error!";
}
@@ -277,24 +309,46 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
CHECK_SYNC("hl_matrix_mul failed");
}
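Not from the patch, but it explains why the GEMM calls above pass B before A and use (dimN, dimM, dimK): cuBLAS assumes column-major storage while these matrices are row-major, and a row-major M x N buffer is exactly a column-major N x M buffer, so C = A*B in row-major terms is issued as C^T = B^T * A^T in column-major terms. A plain column-major reference GEMM makes the trick concrete.

#include <cstdio>

// Column-major reference GEMM: C (m x n) = A (m x k) * B (k x n).
static void gemm_colmajor(const float *A, const float *B, float *C,
                          int m, int n, int k) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0;
      for (int p = 0; p < k; ++p) acc += A[p * m + i] * B[j * k + p];
      C[j * m + i] = acc;
    }
}

int main() {
  const float A[] = {1, 2, 3, 4, 5, 6};     // row-major 2x3
  const float B[] = {7, 8, 9, 10, 11, 12};  // row-major 3x2
  float C[4];
  // Reinterpret the buffers as column-major and compute C^T = B^T * A^T:
  // B goes first, with m = dimN = 2, n = dimM = 2, k = dimK = 3.
  gemm_colmajor(B, A, C, 2, 2, 3);
  // Read back row-major, C is the row-major product [[58, 64], [139, 154]].
  std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
  return 0;
}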
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
- real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+ hl_trans_op_t transa,
+ real *B_d,
+ hl_trans_op_t transb,
real *C_d,
- int dimM, int dimN, int dimK,
- real alpha, real beta) {
+ int dimM,
+ int dimN,
+ int dimK,
+ real alpha,
+ real beta) {
int lda = (HPPL_OP_N == transa) ? dimK : dimM;
int ldb = (HPPL_OP_N == transb) ? dimN : dimK;
int ldc = dimN;
- hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN,
- dimK, alpha, beta, lda, ldb, ldc);
+ hl_matrix_mul(A_d,
+ transa,
+ B_d,
+ transb,
+ C_d,
+ dimM,
+ dimN,
+ dimK,
+ alpha,
+ beta,
+ lda,
+ ldb,
+ ldc);
}
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta,
- int lda, int incb, int incc) {
+void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta,
+ int lda,
+ int incb,
+ int incc) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
@@ -303,21 +357,29 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
if (HPPL_OP_N == trans) {
stat = CUBLAS_GEMV(t_resource.handle,
CUBLAS_OP_T,
- dimN, dimM,
+ dimN,
+ dimM,
&alpha,
- A_d, lda,
- B_d, incb,
+ A_d,
+ lda,
+ B_d,
+ incb,
&beta,
- C_d, incc);
+ C_d,
+ incc);
} else if (HPPL_OP_T == trans) {
stat = CUBLAS_GEMV(t_resource.handle,
CUBLAS_OP_N,
- dimN, dimM,
+ dimN,
+ dimM,
&alpha,
- A_d, lda,
- B_d, incb,
+ A_d,
+ lda,
+ B_d,
+ incb,
&beta,
- C_d, incc);
+ C_d,
+ incc);
} else {
LOG(FATAL) << "parameter transa error!";
}
@@ -326,10 +388,14 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
CHECK_SYNC("hl_matrix_mul_vector");
}
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
- real *B_d, real *C_d,
- int dimM, int dimN,
- real alpha, real beta) {
- hl_matrix_mul_vector(A_d, trans, B_d, C_d, dimM, dimN,
- alpha, beta, dimN, 1, 1);
+void hl_matrix_mul_vector(real *A_d,
+ hl_trans_op_t trans,
+ real *B_d,
+ real *C_d,
+ int dimM,
+ int dimN,
+ real alpha,
+ real beta) {
+ hl_matrix_mul_vector(
+ A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1);
}
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 1829fe23ac594e63253df23b350b16cb28eaebc1..9d4ff08a78d641896e946e9bf04590d4ba93350f 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include "hl_cuda_cudnn.h"
@@ -22,9 +21,10 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
-P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
- "Specify cuDNN max workspace limit, in units MB, "
- "4096MB=4GB by default.");
+P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+ 4096,
+ "Specify cuDNN max workspace limit, in units MB, "
+ "4096MB=4GB by default.");
namespace dynload {
@@ -41,16 +41,15 @@ void* cudnn_dso_handle = nullptr;
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- using cudnn_func = decltype(__name(args...))(*)(Args...); \
- std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \
- &cudnn_dso_handle); \
- void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
- return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
- } \
+#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ using cudnn_func = decltype(__name(args...)) (*)(Args...); \
+ std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \
+ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
+ return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
+ } \
} __name; /* struct DynLoad__##__name */
#else
@@ -69,6 +68,7 @@ void* cudnn_dso_handle = nullptr;
* include all needed cudnn functions in HPPL
* different cudnn version has different interfaces
**/
+// clang-format off
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor) \
__macro(cudnnSetTensor4dDescriptorEx) \
@@ -141,56 +141,53 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#undef CUDNN_DNN_ROUTINE_EACH
-
+// clang-format on
} /* namespace dynload */
/**
* Check build-in cudnn function using glog and it **does not**
* support << operator for more details error info.
*/
-#define CHECK_CUDNN(cudnnFunc) \
- do { \
- cudnnStatus_t cudnnStat = cudnnFunc; \
- CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \
- << "Cudnn Error: " \
- << dynload::cudnnGetErrorString(cudnnStat); \
+#define CHECK_CUDNN(cudnnFunc) \
+ do { \
+ cudnnStatus_t cudnnStat = cudnnFunc; \
+ CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \
+ << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \
} while (0)
bool g_is_libcudnn_init = false;
int g_cudnn_lib_version = 0;
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
+void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
}
-void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) {
- size_t cudnn_dso_ver = dynload::cudnnGetVersion();
- size_t cudnn_dso_major = cudnn_dso_ver / 1000;
- size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
-
- // Compare cudnn header version with that of cudnn.so.
- CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
- (cudnn_cuh_major == cudnn_dso_major))
- << "[cudnn init] libcudnn v" << cudnn_dso_major <<
- " with header v" << cudnn_cuh_major << " unmatched!\n"
- << "PaddlePaddle Requirement: "
- << "(header v[2-3] with libcudnn v[2-3]) Or "
- << "(header v4 with libcudnn v4) Or "
- << "(header v5 with libcudnn v5).";
-
- CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
- << "cudnn v5 requires cuda version >= 7.5";
-
- CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
- CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
-
- g_is_libcudnn_init = true;
- g_cudnn_lib_version = cudnn_dso_ver;
+void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
+ size_t cudnn_dso_ver = dynload::cudnnGetVersion();
+ size_t cudnn_dso_major = cudnn_dso_ver / 1000;
+ size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
+
+ // Compare cudnn header version with that of cudnn.so.
+ CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
+ (cudnn_cuh_major == cudnn_dso_major))
+ << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v"
+ << cudnn_cuh_major << " unmatched!\n"
+ << "PaddlePaddle Requirement: "
+ << "(header v[2-3] with libcudnn v[2-3]) Or "
+ << "(header v4 with libcudnn v4) Or "
+ << "(header v5 with libcudnn v5).";
+
+ CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
+ << "cudnn v5 requires cuda version >= 7.5";
+
+ CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
+ CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
+
+ g_is_libcudnn_init = true;
+ g_cudnn_lib_version = cudnn_dso_ver;
}
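Illustration only (not from the patch): the version check above compares major numbers obtained by dividing by 1000, since cuDNN of this era encodes its version as major*1000 + minor*100 + patchlevel.

#include <cstdio>

int main() {
  int header_version = 5005;                   // e.g. a v5.0.5 cudnn.h
  int library_version = 5103;                  // e.g. a v5.1.3 libcudnn.so
  int header_major = header_version / 1000;    // 5
  int library_major = library_version / 1000;  // 5
  std::printf("%s\n", header_major == library_major ? "compatible" : "mismatch");
  return 0;
}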
-int hl_get_cudnn_lib_version() {
- return g_cudnn_lib_version;
-}
+int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; }
void hl_conv_workspace(hl_tensor_descriptor input,
hl_tensor_descriptor output,
@@ -204,99 +201,91 @@ void hl_conv_workspace(hl_tensor_descriptor input,
size_t* bwdFilterLimitBytes) {
#if CUDNN_VERSION >= 4000
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
-
- // Specify workspace limit directly
- size_t memoryLimitBytes =
- (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
-
- // cudnn convolution forward configuration
- cudnnTensorDescriptor_t fwd_src_desc =
- GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t fwd_dest_desc =
- GET_TENSOR_DESCRIPTOR(output);
- cudnnFilterDescriptor_t fwd_filter_desc =
- GET_FILTER_DESCRIPTOR(filter);
- cudnnConvolutionDescriptor_t fwd_conv_desc =
- GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
- t_resource.cudnn_handle,
- fwd_src_desc,
- fwd_filter_desc,
- fwd_conv_desc,
- fwd_dest_desc,
- CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
- t_resource.cudnn_handle,
- fwd_src_desc,
- fwd_filter_desc,
- fwd_conv_desc,
- fwd_dest_desc,
- static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
- fwdLimitBytes));
-
- // cudnn convolution backward data configuration
- cudnnFilterDescriptor_t bwd_data_filter_desc =
- GET_FILTER_DESCRIPTOR(filter);
- cudnnTensorDescriptor_t bwd_data_diff_desc =
- GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bwd_data_grad_desc =
- GET_TENSOR_DESCRIPTOR(input);
- cudnnConvolutionDescriptor_t bwd_data_conv_desc =
- GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
- t_resource.cudnn_handle,
- bwd_data_filter_desc,
- bwd_data_diff_desc,
- bwd_data_conv_desc,
- bwd_data_grad_desc,
- CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
- t_resource.cudnn_handle,
- bwd_data_filter_desc,
- bwd_data_diff_desc,
- bwd_data_conv_desc,
- bwd_data_grad_desc,
- static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
- bwdDataLimitBytes));
-
- // cudnn convolution backward filter configuration
- cudnnTensorDescriptor_t bwd_filter_src_desc =
- GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t bwd_filter_diff_desc =
- GET_TENSOR_DESCRIPTOR(output);
- cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
- GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnFilterDescriptor_t bwd_filter_grad_desc =
- GET_FILTER_DESCRIPTOR(filter);
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
- t_resource.cudnn_handle,
- bwd_filter_src_desc,
- bwd_filter_diff_desc,
- bwd_filter_conv_desc,
- bwd_filter_grad_desc,
- CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
- memoryLimitBytes,
- reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-
- CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
- t_resource.cudnn_handle, bwd_filter_src_desc,
- bwd_filter_diff_desc, bwd_filter_conv_desc,
- bwd_filter_grad_desc,
- static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
- bwdFilterLimitBytes));
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+
+ // Specify workspace limit directly
+ size_t memoryLimitBytes =
+ (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+
+ // cudnn convolution forward configuration
+ cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+ t_resource.cudnn_handle,
+ fwd_src_desc,
+ fwd_filter_desc,
+ fwd_conv_desc,
+ fwd_dest_desc,
+ CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast<cudnnConvolutionFwdAlgo_t *>(convFwdAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
+ t_resource.cudnn_handle,
+ fwd_src_desc,
+ fwd_filter_desc,
+ fwd_conv_desc,
+ fwd_dest_desc,
+ static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
+ fwdLimitBytes));
+
+ // cudnn convolution backward data configuration
+ cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+ GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+ t_resource.cudnn_handle,
+ bwd_data_filter_desc,
+ bwd_data_diff_desc,
+ bwd_data_conv_desc,
+ bwd_data_grad_desc,
+ CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast<cudnnConvolutionBwdDataAlgo_t *>(convBwdDataAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+ t_resource.cudnn_handle,
+ bwd_data_filter_desc,
+ bwd_data_diff_desc,
+ bwd_data_conv_desc,
+ bwd_data_grad_desc,
+ static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
+ bwdDataLimitBytes));
+
+ // cudnn convolution backward filter configuration
+ cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+ GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+ t_resource.cudnn_handle,
+ bwd_filter_src_desc,
+ bwd_filter_diff_desc,
+ bwd_filter_conv_desc,
+ bwd_filter_grad_desc,
+ CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+ memoryLimitBytes,
+ reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t *>(convBwdFilterAlgo)));
+
+ CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+ t_resource.cudnn_handle,
+ bwd_filter_src_desc,
+ bwd_filter_diff_desc,
+ bwd_filter_conv_desc,
+ bwd_filter_grad_desc,
+ static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
+ bwdFilterLimitBytes));
#endif
}
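A minimal sketch, not from the patch, of the workspace-limit arithmetic used above: the cudnn_conv_workspace_limit_in_mb flag is converted from megabytes to bytes with a 64-bit shift (the default 4096 MB would overflow a 32-bit int) before being handed to the *_SPECIFY_WORKSPACE_LIMIT algorithm preferences.

#include <cstdint>
#include <cstdio>

int main() {
  int flag_mb = 4096;  // default flag value: 4096 MB = 4 GB
  std::uint64_t limit_bytes = (1ULL << 20) * static_cast<std::uint64_t>(flag_mb);
  std::printf("%llu\n", static_cast<unsigned long long>(limit_bytes));  // 4294967296
  return 0;
}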
@@ -306,55 +295,54 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
int feature_maps,
int height,
int width) {
- CHECK_NOTNULL(image_desc);
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc =
- (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
- CHECK_NOTNULL(hl_desc);
+ cudnn_tensor_descriptor hl_desc =
+ (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+ CHECK_NOTNULL(hl_desc);
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- hl_desc->desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- batch_size,
- feature_maps,
- height,
- width));
-
- hl_desc->format = CUDNN_TENSOR_NCHW;
- hl_desc->data_type = data_type;
- hl_desc->batch_size = batch_size;
- hl_desc->feature_maps = feature_maps;
- hl_desc->height = height;
- hl_desc->width = width;
-
- *image_desc = (hl_tensor_descriptor)hl_desc;
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ batch_size,
+ feature_maps,
+ height,
+ width));
+
+ hl_desc->format = CUDNN_TENSOR_NCHW;
+ hl_desc->data_type = data_type;
+ hl_desc->batch_size = batch_size;
+ hl_desc->feature_maps = feature_maps;
+ hl_desc->height = height;
+ hl_desc->width = width;
+
+ *image_desc = (hl_tensor_descriptor)hl_desc;
}
void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
- CHECK_NOTNULL(image_desc);
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc =
- (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
- CHECK_NOTNULL(hl_desc);
+ cudnn_tensor_descriptor hl_desc =
+ (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+ CHECK_NOTNULL(hl_desc);
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+ CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
- hl_desc->data_type = data_type;
+ hl_desc->data_type = data_type;
- *image_desc = (hl_tensor_descriptor)hl_desc;
+ *image_desc = (hl_tensor_descriptor)hl_desc;
}
void hl_tensor_reshape(hl_tensor_descriptor image_desc,
@@ -362,19 +350,19 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int feature_maps,
int height,
int width) {
- const int stride_w = 1;
- const int stride_h = width * stride_w;
- const int stride_c = height * stride_h;
- const int stride_n = feature_maps * stride_c;
- return hl_tensor_reshape(image_desc,
- batch_size,
- feature_maps,
- height,
- width,
- stride_n,
- stride_c,
- stride_h,
- stride_w);
+ const int stride_w = 1;
+ const int stride_h = width * stride_w;
+ const int stride_c = height * stride_h;
+ const int stride_n = feature_maps * stride_c;
+ return hl_tensor_reshape(image_desc,
+ batch_size,
+ feature_maps,
+ height,
+ width,
+ stride_n,
+ stride_c,
+ stride_h,
+ stride_w);
}
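Not part of the patch: the strides computed above describe the packed NCHW layout, where element (n, c, h, w) lives at offset n*nStride + c*cStride + h*hStride + w*wStride. A small worked example:

#include <cstdio>

int main() {
  int feature_maps = 3, height = 4, width = 5;
  int stride_w = 1;
  int stride_h = width * stride_w;         // 5
  int stride_c = height * stride_h;        // 20
  int stride_n = feature_maps * stride_c;  // 60 values per image
  int n = 1, c = 2, h = 3, w = 4;
  int offset = n * stride_n + c * stride_c + h * stride_h + w * stride_w;
  std::printf("%d\n", offset);  // 60 + 40 + 15 + 4 = 119, last element of image 1
  return 0;
}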
void hl_tensor_reshape(hl_tensor_descriptor image_desc,
@@ -386,42 +374,41 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
int cStride,
int hStride,
int wStride) {
- CHECK_NOTNULL(image_desc);
-
- cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
- CHECK_NOTNULL(hl_desc->desc);
-
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
- hl_desc->data_type,
- batch_size,
- feature_maps,
- height,
- width,
- nStride,
- cStride,
- hStride,
- wStride));
-
- hl_desc->batch_size = batch_size;
- hl_desc->feature_maps = feature_maps;
- hl_desc->height = height;
- hl_desc->width = width;
+ CHECK_NOTNULL(image_desc);
+
+ cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+ CHECK_NOTNULL(hl_desc->desc);
+
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
+ hl_desc->data_type,
+ batch_size,
+ feature_maps,
+ height,
+ width,
+ nStride,
+ cStride,
+ hStride,
+ wStride));
+
+ hl_desc->batch_size = batch_size;
+ hl_desc->feature_maps = feature_maps;
+ hl_desc->height = height;
+ hl_desc->width = width;
}
void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
- CHECK_NOTNULL(image_desc);
+ CHECK_NOTNULL(image_desc);
- cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
- CHECK_NOTNULL(hl_desc->desc);
+ cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+ CHECK_NOTNULL(hl_desc->desc);
- CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
+ CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
- hl_desc->desc = NULL;
+ hl_desc->desc = NULL;
- free(image_desc);
+ free(image_desc);
}
-
void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
hl_pooling_mode_t mode,
int height,
@@ -430,63 +417,61 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
int width_padding,
int stride_height,
int stride_width) {
- cudnnPoolingMode_t cudnn_mode;
- switch (mode) {
- case HL_POOLING_MAX:
- cudnn_mode = CUDNN_POOLING_MAX;
- break;
- case HL_POOLING_AVERAGE:
- cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
- break;
- case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
- cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
- break;
- default:
- LOG(FATAL) << "parameter mode error";
- }
-
- CHECK_NOTNULL(pooling_desc);
-
- cudnn_pooling_descriptor hl_pooling_desc =
- (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
- CHECK_NOTNULL(hl_pooling_desc);
-
- CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
-
- CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(
- hl_pooling_desc->desc,
- cudnn_mode,
+ cudnnPoolingMode_t cudnn_mode;
+ switch (mode) {
+ case HL_POOLING_MAX:
+ cudnn_mode = CUDNN_POOLING_MAX;
+ break;
+ case HL_POOLING_AVERAGE:
+ cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+ break;
+ case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
+ cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+ break;
+ default:
+ LOG(FATAL) << "parameter mode error";
+ }
+
+ CHECK_NOTNULL(pooling_desc);
+
+ cudnn_pooling_descriptor hl_pooling_desc =
+ (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
+ CHECK_NOTNULL(hl_pooling_desc);
+
+ CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
+
+ CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc,
+ cudnn_mode,
#if CUDNN_VERSION >= 5000
- CUDNN_PROPAGATE_NAN,
+ CUDNN_PROPAGATE_NAN,
#endif
- height,
- width,
- height_padding,
- width_padding,
- stride_height,
- stride_width));
-
- hl_pooling_desc->mode = cudnn_mode;
- hl_pooling_desc->window_height = height;
- hl_pooling_desc->window_width = width;
- hl_pooling_desc->stride_height = stride_height;
- hl_pooling_desc->stride_width = stride_width;
-
- *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
+ height,
+ width,
+ height_padding,
+ width_padding,
+ stride_height,
+ stride_width));
+
+ hl_pooling_desc->mode = cudnn_mode;
+ hl_pooling_desc->window_height = height;
+ hl_pooling_desc->window_width = width;
+ hl_pooling_desc->stride_height = stride_height;
+ hl_pooling_desc->stride_width = stride_width;
+
+ *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
}
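A side note, not from the patch, on the two average-pooling modes mapped above: when a window overlaps zero padding, the INCLUDE_PADDING variant divides by the full window size, while EXCLUDE_PADDING divides only by the number of real input cells covered.

#include <cstdio>

int main() {
  // A 2x2 window hanging over the border covers one real value (6.0) and three padded zeros.
  float real_sum = 6.0f;
  int window_cells = 4, real_cells = 1;
  float avg_include_padding = real_sum / window_cells;  // 1.5  (COUNT_INCLUDE_PADDING)
  float avg_exclude_padding = real_sum / real_cells;    // 6.0  (COUNT_EXCLUDE_PADDING)
  std::printf("%g %g\n", avg_include_padding, avg_exclude_padding);
  return 0;
}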
void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
- CHECK_NOTNULL(pooling_desc);
+ CHECK_NOTNULL(pooling_desc);
- cudnn_pooling_descriptor hl_pooling =
- (cudnn_pooling_descriptor)pooling_desc;
+ cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
- CHECK_NOTNULL(hl_pooling->desc);
- CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
+ CHECK_NOTNULL(hl_pooling->desc);
+ CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
- hl_pooling->desc = NULL;
+ hl_pooling->desc = NULL;
- free(pooling_desc);
+ free(pooling_desc);
}
void hl_pooling_forward(hl_tensor_descriptor input,
@@ -494,31 +479,30 @@ void hl_pooling_forward(hl_tensor_descriptor input,
hl_tensor_descriptor output,
real* output_image,
hl_pooling_descriptor pooling) {
- cudnnPoolingDescriptor_t pooling_desc;
- cudnnTensorDescriptor_t input_desc;
- cudnnTensorDescriptor_t output_desc;
-
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(pooling);
- CHECK_NOTNULL(input_image);
- CHECK_NOTNULL(output_image);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- input_desc = ((cudnn_tensor_descriptor)input)->desc;
- output_desc = ((cudnn_tensor_descriptor)output)->desc;
- pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
- CHECK_CUDNN(dynload::cudnnPoolingForward(
- t_resource.cudnn_handle,
- pooling_desc,
- &alpha,
- input_desc,
- input_image,
- &beta,
- output_desc,
- output_image));
- CHECK_SYNC("hl_pooling_forward failed");
+ cudnnPoolingDescriptor_t pooling_desc;
+ cudnnTensorDescriptor_t input_desc;
+ cudnnTensorDescriptor_t output_desc;
+
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(pooling);
+ CHECK_NOTNULL(input_image);
+ CHECK_NOTNULL(output_image);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ input_desc = ((cudnn_tensor_descriptor)input)->desc;
+ output_desc = ((cudnn_tensor_descriptor)output)->desc;
+ pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+ CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle,
+ pooling_desc,
+ &alpha,
+ input_desc,
+ input_image,
+ &beta,
+ output_desc,
+ output_image));
+ CHECK_SYNC("hl_pooling_forward failed");
}
void hl_pooling_backward(hl_tensor_descriptor input,
@@ -528,90 +512,86 @@ void hl_pooling_backward(hl_tensor_descriptor input,
real* output_image,
real* output_image_grad,
hl_pooling_descriptor pooling) {
- cudnnPoolingDescriptor_t pooling_desc;
- cudnnTensorDescriptor_t input_desc;
- cudnnTensorDescriptor_t output_desc;
-
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(pooling);
- CHECK_NOTNULL(input_image);
- CHECK_NOTNULL(input_image_grad);
- CHECK_NOTNULL(output_image);
- CHECK_NOTNULL(output_image_grad);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- input_desc = ((cudnn_tensor_descriptor)input)->desc;
- output_desc = ((cudnn_tensor_descriptor)output)->desc;
- pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
- CHECK_CUDNN(dynload::cudnnPoolingBackward(
- t_resource.cudnn_handle,
- pooling_desc,
- &alpha,
- output_desc,
- output_image,
- output_desc,
- output_image_grad,
- input_desc,
- input_image,
- &beta,
- input_desc,
- input_image_grad));
+ cudnnPoolingDescriptor_t pooling_desc;
+ cudnnTensorDescriptor_t input_desc;
+ cudnnTensorDescriptor_t output_desc;
+
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(pooling);
+ CHECK_NOTNULL(input_image);
+ CHECK_NOTNULL(input_image_grad);
+ CHECK_NOTNULL(output_image);
+ CHECK_NOTNULL(output_image_grad);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ input_desc = ((cudnn_tensor_descriptor)input)->desc;
+ output_desc = ((cudnn_tensor_descriptor)output)->desc;
+ pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+ CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle,
+ pooling_desc,
+ &alpha,
+ output_desc,
+ output_image,
+ output_desc,
+ output_image_grad,
+ input_desc,
+ input_image,
+ &beta,
+ input_desc,
+ input_image_grad));
CHECK_SYNC("hl_pooling_backward failed");
}
-
void hl_create_filter_descriptor(hl_filter_descriptor* filter,
int input_feature_maps,
int output_feature_maps,
int height,
int width) {
- CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(filter);
- cudnn_filter_descriptor hl_filter =
- (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
- CHECK_NOTNULL(hl_filter);
+ cudnn_filter_descriptor hl_filter =
+ (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
+ CHECK_NOTNULL(hl_filter);
- CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
+ CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(
- hl_filter->desc,
- data_type,
+ CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc,
+ data_type,
#if CUDNN_VERSION >= 5000
- CUDNN_TENSOR_NCHW,
+ CUDNN_TENSOR_NCHW,
#endif
- output_feature_maps,
- input_feature_maps,
- height,
- width));
-
- hl_filter->data_type = data_type;
- hl_filter->output_feature_maps = output_feature_maps;
- hl_filter->input_feature_maps = input_feature_maps;
- hl_filter->filter_height = height;
- hl_filter->filter_width = width;
-
- *filter = (hl_filter_descriptor)hl_filter;
+ output_feature_maps,
+ input_feature_maps,
+ height,
+ width));
+
+ hl_filter->data_type = data_type;
+ hl_filter->output_feature_maps = output_feature_maps;
+ hl_filter->input_feature_maps = input_feature_maps;
+ hl_filter->filter_height = height;
+ hl_filter->filter_width = width;
+
+ *filter = (hl_filter_descriptor)hl_filter;
}
-
void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
- CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(filter);
- cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
- CHECK_NOTNULL(hl_filter->desc);
+ cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
+ CHECK_NOTNULL(hl_filter->desc);
- CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
+ CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
- hl_filter->desc = NULL;
+ hl_filter->desc = NULL;
- free(filter);
+ free(filter);
}
void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
@@ -621,36 +601,35 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_width,
int stride_height,
int stride_width) {
- CHECK_NOTNULL(conv);
-
- cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)
- malloc(sizeof(_cudnn_convolution_descriptor));
-
- CHECK_NOTNULL(hl_conv);
- CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
-
- cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
- CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
- hl_conv->desc,
- padding_height,
- padding_width,
- stride_height,
- stride_width,
- 1,
- 1,
- mode));
-
- hl_conv->input_image = image;
- hl_conv->filter = filter;
- hl_conv->padding_height = padding_height;
- hl_conv->padding_width = padding_width;
- hl_conv->stride_height = stride_height;
- hl_conv->stride_width = stride_width;
- hl_conv->upscalex = 1;
- hl_conv->upscaley = 1;
- hl_conv->mode = mode;
-
- *conv = (hl_convolution_descriptor)hl_conv;
+ CHECK_NOTNULL(conv);
+
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
+ sizeof(_cudnn_convolution_descriptor));
+
+ CHECK_NOTNULL(hl_conv);
+ CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
+
+ cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+ CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
+ padding_height,
+ padding_width,
+ stride_height,
+ stride_width,
+ 1,
+ 1,
+ mode));
+
+ hl_conv->input_image = image;
+ hl_conv->filter = filter;
+ hl_conv->padding_height = padding_height;
+ hl_conv->padding_width = padding_width;
+ hl_conv->stride_height = stride_height;
+ hl_conv->stride_width = stride_width;
+ hl_conv->upscalex = 1;
+ hl_conv->upscaley = 1;
+ hl_conv->mode = mode;
+
+ *conv = (hl_convolution_descriptor)hl_conv;
}
void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
@@ -660,44 +639,43 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_width,
int stride_height,
int stride_width) {
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(image);
- CHECK_NOTNULL(filter);
-
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
- CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
- conv_desc,
- padding_height,
- padding_width,
- stride_height,
- stride_width,
- 1,
- 1,
- mode));
-
- cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
- hl_conv->input_image = image;
- hl_conv->filter = filter;
- hl_conv->padding_height = padding_height;
- hl_conv->padding_width = padding_width;
- hl_conv->stride_height = stride_height;
- hl_conv->stride_width = stride_width;
- hl_conv->upscalex = 1;
- hl_conv->upscaley = 1;
- hl_conv->mode = mode;
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(image);
+ CHECK_NOTNULL(filter);
+
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+ CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
+ padding_height,
+ padding_width,
+ stride_height,
+ stride_width,
+ 1,
+ 1,
+ mode));
+
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+ hl_conv->input_image = image;
+ hl_conv->filter = filter;
+ hl_conv->padding_height = padding_height;
+ hl_conv->padding_width = padding_width;
+ hl_conv->stride_height = stride_height;
+ hl_conv->stride_width = stride_width;
+ hl_conv->upscalex = 1;
+ hl_conv->upscaley = 1;
+ hl_conv->mode = mode;
}
void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
- CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(conv);
- cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
- CHECK_NOTNULL(hl_conv->desc);
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+ CHECK_NOTNULL(hl_conv->desc);
- CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
- hl_conv->desc = NULL;
+ CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
+ hl_conv->desc = NULL;
- free(conv);
+ free(conv);
}
void hl_convolution_forward(hl_tensor_descriptor input,
@@ -710,33 +688,33 @@ void hl_convolution_forward(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convFwdAlgo) {
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(input_data);
- CHECK_NOTNULL(output_data);
- CHECK_NOTNULL(filter_data);
- cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- real alpha = 1.0f;
- real beta = 1.0f;
- CHECK_CUDNN(dynload::cudnnConvolutionForward(
- t_resource.cudnn_handle,
- &alpha,
- src_desc,
- input_data,
- filter_desc,
- filter_data,
- conv_desc,
- static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
- gpuWorkSpace,
- sizeInBytes,
- &beta,
- dest_desc,
- output_data));
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(input_data);
+ CHECK_NOTNULL(output_data);
+ CHECK_NOTNULL(filter_data);
+ cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ CHECK_CUDNN(dynload::cudnnConvolutionForward(
+ t_resource.cudnn_handle,
+ &alpha,
+ src_desc,
+ input_data,
+ filter_desc,
+ filter_data,
+ conv_desc,
+ static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
+ &beta,
+ dest_desc,
+ output_data));
CHECK_SYNC("hl_convolution_forward failed");
}
@@ -744,27 +722,26 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
real* bias_data,
hl_tensor_descriptor output,
real* output_data) {
- CHECK_NOTNULL(bias);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(bias_data);
- CHECK_NOTNULL(output_data);
-
- cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
- real alpha = 1.0f;
- real beta = 1.0f;
-
- CHECK_CUDNN(dynload::cudnnAddTensor(
- t_resource.cudnn_handle,
+ CHECK_NOTNULL(bias);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(bias_data);
+ CHECK_NOTNULL(output_data);
+
+ cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+ real alpha = 1.0f;
+ real beta = 1.0f;
+
+ CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle,
#if CUDNN_VERSION < 4000
- CUDNN_ADD_SAME_C,
+ CUDNN_ADD_SAME_C,
#endif
- &alpha,
- bias_desc,
- bias_data,
- &beta,
- output_desc,
- output_data));
+ &alpha,
+ bias_desc,
+ bias_data,
+ &beta,
+ output_desc,
+ output_data));
CHECK_SYNC("hl_convolution_forward_add_bias failed");
}
@@ -772,23 +749,22 @@ void hl_convolution_backward_bias(hl_tensor_descriptor bias,
real* bias_grad_data,
hl_tensor_descriptor output,
real* output_grad_data) {
- CHECK_NOTNULL(bias);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(bias_grad_data);
- CHECK_NOTNULL(output_grad_data);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(
- t_resource.cudnn_handle,
- &alpha,
- diff_desc,
- output_grad_data,
- &beta,
- bias_desc,
- bias_grad_data));
+ CHECK_NOTNULL(bias);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(bias_grad_data);
+ CHECK_NOTNULL(output_grad_data);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle,
+ &alpha,
+ diff_desc,
+ output_grad_data,
+ &beta,
+ bias_desc,
+ bias_grad_data));
CHECK_SYNC("hl_convolution_backward_bias failed");
}
@@ -802,37 +778,37 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convBwdFilterAlgo) {
- CHECK_NOTNULL(input);
- CHECK_NOTNULL(output);
- CHECK_NOTNULL(filter);
- CHECK_NOTNULL(conv);
- CHECK_NOTNULL(input_data);
- CHECK_NOTNULL(output_grad_data);
- CHECK_NOTNULL(filter_grad_data);
-
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
- cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
- t_resource.cudnn_handle,
- &alpha,
- src_desc,
- input_data,
- diff_desc,
- output_grad_data,
- conv_desc,
+ CHECK_NOTNULL(input);
+ CHECK_NOTNULL(output);
+ CHECK_NOTNULL(filter);
+ CHECK_NOTNULL(conv);
+ CHECK_NOTNULL(input_data);
+ CHECK_NOTNULL(output_grad_data);
+ CHECK_NOTNULL(filter_grad_data);
+
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+ cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
+ t_resource.cudnn_handle,
+ &alpha,
+ src_desc,
+ input_data,
+ diff_desc,
+ output_grad_data,
+ conv_desc,
#if CUDNN_VERSION >= 4000
- static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
- gpuWorkSpace,
- sizeInBytes,
+ static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
#endif
- &beta,
- grad_desc,
- filter_grad_data));
+ &beta,
+ grad_desc,
+ filter_grad_data));
CHECK_SYNC("hl_convolution_backward_filter failed");
}
@@ -846,119 +822,111 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
void* gpuWorkSpace,
size_t sizeInBytes,
int convBwdDataAlgo) {
- real alpha = 1.0f;
- real beta = 1.0f;
- cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
- cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
- cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
- cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
- CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
- t_resource.cudnn_handle,
- &alpha,
- filter_desc,
- filter_data,
- diff_desc,
- output_grad_data,
- conv_desc,
+ real alpha = 1.0f;
+ real beta = 1.0f;
+ cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+ cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+ cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
+ cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+ CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
+ t_resource.cudnn_handle,
+ &alpha,
+ filter_desc,
+ filter_data,
+ diff_desc,
+ output_grad_data,
+ conv_desc,
#if CUDNN_VERSION >= 4000
- static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
- gpuWorkSpace,
- sizeInBytes,
+ static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
+ gpuWorkSpace,
+ sizeInBytes,
#endif
- &beta,
- grad_desc,
- input_data_grad));
+ &beta,
+ grad_desc,
+ input_data_grad));
CHECK_SYNC("hl_convolution_backward_data failed");
}
-
-void hl_softmax_forward(real *input,
- real *output,
- int height,
- int width) {
+void hl_softmax_forward(real* input, real* output, int height, int width) {
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- t_resource.cudnn_desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- height,
- width,
- 1,
- 1));
-
- real alpha = 1.0f;
- real beta = 0.0f;
- CHECK_CUDNN(dynload::cudnnSoftmaxForward(
- t_resource.cudnn_handle,
- CUDNN_SOFTMAX_ACCURATE,
- CUDNN_SOFTMAX_MODE_CHANNEL,
- &alpha,
- t_resource.cudnn_desc,
- input,
- &beta,
- t_resource.cudnn_desc,
- output));
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ height,
+ width,
+ 1,
+ 1));
+
+ real alpha = 1.0f;
+ real beta = 0.0f;
+ CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle,
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ t_resource.cudnn_desc,
+ input,
+ &beta,
+ t_resource.cudnn_desc,
+ output));
CHECK_SYNC("hl_softmax_forward failed");
}
-void hl_softmax_backward(real *output_value,
- real *output_grad,
+void hl_softmax_backward(real* output_value,
+ real* output_grad,
int height,
int width) {
#ifndef PADDLE_TYPE_DOUBLE
- cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+ cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#else
- cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+ cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
#endif
- CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
- t_resource.cudnn_desc,
- CUDNN_TENSOR_NCHW,
- data_type,
- height,
- width,
- 1,
- 1));
-
- real alpha = 1.0f;
- real beta = 0.0f;
- CHECK_CUDNN(dynload::cudnnSoftmaxBackward(
- t_resource.cudnn_handle,
- CUDNN_SOFTMAX_ACCURATE,
- CUDNN_SOFTMAX_MODE_CHANNEL,
- &alpha,
- t_resource.cudnn_desc,
- output_value,
- t_resource.cudnn_desc,
- output_grad,
- &beta,
- t_resource.cudnn_desc,
- output_grad));
+ CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+ CUDNN_TENSOR_NCHW,
+ data_type,
+ height,
+ width,
+ 1,
+ 1));
+
+ real alpha = 1.0f;
+ real beta = 0.0f;
+ CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle,
+ CUDNN_SOFTMAX_ACCURATE,
+ CUDNN_SOFTMAX_MODE_CHANNEL,
+ &alpha,
+ t_resource.cudnn_desc,
+ output_value,
+ t_resource.cudnn_desc,
+ output_grad,
+ &beta,
+ t_resource.cudnn_desc,
+ output_grad));
CHECK_SYNC("hl_softmax_backward failed");
}
void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outputDesc,
- real *output,
+ real* output,
hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
+ real* scale,
+ real* bias,
double factor,
- real *runningMean,
- real *runningInvVar,
+ real* runningMean,
+ real* runningInvVar,
double epsilon,
- real *savedMean,
- real *savedVar) {
+ real* savedMean,
+ real* savedVar) {
#if CUDNN_VERSION >= 4007
if ((NULL != runningMean && NULL == runningInvVar) ||
(NULL == runningMean && NULL != runningInvVar)) {
LOG(FATAL) << "runningMean and runningInvVar can be NULL "
- << "but only at the same time.";
+ << "but only at the same time.";
}
if ((NULL != savedMean && NULL == savedVar) ||
(NULL == savedMean && NULL != savedVar)) {
@@ -972,10 +940,24 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardTraining(
- t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
- input, yDesc, output, bnDesc, scale, bias, factor,
- runningMean, runningInvVar, epsilon, savedMean, savedVar));
+ CHECK_CUDNN(
+ dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ yDesc,
+ output,
+ bnDesc,
+ scale,
+ bias,
+ factor,
+ runningMean,
+ runningInvVar,
+ epsilon,
+ savedMean,
+ savedVar));
CHECK_SYNC("hl_batch_norm_forward_training failed");
#else
@@ -985,15 +967,15 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
}
void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
- real *input,
- hl_tensor_descriptor outputDesc,
- real *output,
- hl_tensor_descriptor bnParamDesc,
- real *scale,
- real *bias,
- real *estimatedMean,
- real *estimatedInvVar,
- double epsilon) {
+ real* input,
+ hl_tensor_descriptor outputDesc,
+ real* output,
+ hl_tensor_descriptor bnParamDesc,
+ real* scale,
+ real* bias,
+ real* estimatedMean,
+ real* estimatedInvVar,
+ double epsilon) {
#if CUDNN_VERSION >= 4007
cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
@@ -1001,10 +983,21 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardInference(
- t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
- input, yDesc, output, bnDesc, scale, bias,
- estimatedMean, estimatedInvVar, epsilon));
+ CHECK_CUDNN(
+ dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ yDesc,
+ output,
+ bnDesc,
+ scale,
+ bias,
+ estimatedMean,
+ estimatedInvVar,
+ epsilon));
CHECK_SYNC("hl_batch_norm_forward_inference failed");
#else
@@ -1014,18 +1007,18 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
}
void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
- real *input,
+ real* input,
hl_tensor_descriptor outGradDesc,
- real *outGrad,
+ real* outGrad,
hl_tensor_descriptor inGradDesc,
- real *inGrad,
+ real* inGrad,
hl_tensor_descriptor dBnParamDesc,
- real *scale,
- real *scaleGrad,
- real *biasGrad,
+ real* scale,
+ real* scaleGrad,
+ real* biasGrad,
double epsilon,
- real *savedMean,
- real *savedInvVar) {
+ real* savedMean,
+ real* savedInvVar) {
#if CUDNN_VERSION >= 4007
if ((NULL != savedMean && NULL == savedInvVar) ||
(NULL == savedMean && NULL != savedInvVar)) {
@@ -1040,12 +1033,25 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
- CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(
- t_resource.cudnn_handle, mode, &alpha, &beta,
- &alpha, &beta,
- xDesc, input, dyDesc, outGrad, dxDesc, inGrad,
- bnDesc, scale, scaleGrad, biasGrad, epsilon,
- savedMean, savedInvVar));
+ CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
+ mode,
+ &alpha,
+ &beta,
+ &alpha,
+ &beta,
+ xDesc,
+ input,
+ dyDesc,
+ outGrad,
+ dxDesc,
+ inGrad,
+ bnDesc,
+ scale,
+ scaleGrad,
+ biasGrad,
+ epsilon,
+ savedMean,
+ savedInvVar));
CHECK_SYNC("hl_batch_norm_backward failed");
#else
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index ca19f210c5c9d5151b01ce81a4f44663e2df97cc..745be35b56278ed2e0033d5fd2806320d3164d7c 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include
@@ -27,7 +26,7 @@ limitations under the License. */
namespace dynload {
std::once_flag curand_dso_flag;
-void* curand_dso_handle = nullptr;
+void *curand_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -37,34 +36,35 @@ void* curand_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- curandStatus_t operator()(Args... args) { \
- typedef curandStatus_t (*curandFunc)(Args...); \
- std::call_once(curand_dso_flag, GetCurandDsoHandle, \
- &curand_dso_handle); \
- void* p_##__name = dlsym(curand_dso_handle, #__name); \
- return reinterpret_cast<curandFunc>(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ curandStatus_t operator()(Args... args) { \
+ typedef curandStatus_t (*curandFunc)(Args...); \
+ std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
+ void *p_##__name = dlsym(curand_dso_handle, #__name); \
+ return reinterpret_cast<curandFunc>(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- curandStatus_t operator()(Args... args) { \
- return __name(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ curandStatus_t operator()(Args... args) { \
+ return __name(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#endif
/* include all needed curand functions in HPPL */
+// clang-format off
#define CURAND_RAND_ROUTINE_EACH(__macro) \
__macro(curandCreateGenerator) \
__macro(curandSetStream) \
__macro(curandSetPseudoRandomGeneratorSeed)\
__macro(curandGenerateUniform) \
__macro(curandGenerateUniformDouble)
+// clang-format on
CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
@@ -72,7 +72,7 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
#undef DYNAMIC_LOAD_CURAND_WRAP
std::once_flag cudart_dso_flag;
-void* cudart_dso_handle = nullptr;
+void *cudart_dso_handle = nullptr;
/**
* The following macro definition can generate structs
@@ -82,28 +82,28 @@ void* cudart_dso_handle = nullptr;
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- using cudart_func = decltype(__name(args...))(*)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
- &cudart_dso_handle); \
- void* p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast<cudart_func>(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ using cudart_func = decltype(__name(args...)) (*)(Args...); \
+ std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+ void *p_##__name = dlsym(cudart_dso_handle, #__name); \
+ return reinterpret_cast<cudart_func>(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- auto operator()(Args... args) -> decltype(__name(args...)) { \
- return __name(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ auto operator()(Args... args) -> decltype(__name(args...)) { \
+ return __name(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
#endif
/* include all needed cuda functions in HPPL */
+// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaMalloc) \
__macro(cudaHostAlloc) \
@@ -134,57 +134,57 @@ void* cudart_dso_handle = nullptr;
__macro(cudaFuncSetCacheConfig) \
__macro(cudaRuntimeGetVersion) \
__macro(cudaGetErrorString)
+// clang-format on
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
#undef CUDA_ROUNTINE_EACH
#undef DYNAMIC_LOAD_CUDART_WRAP
-} /* namespace dynload */
+} /* namespace dynload */
/**
* @brief global resource.
*/
-int g_system_device_num = 0; /* system device number */
-int device_num = 0; /* use device number */
-hl_device_prop *g_device; /* device info table */
-__thread thread_device_resources *t_device; /* device resources table */
+int g_system_device_num = 0; /* system device number */
+int device_num = 0; /* use device number */
+hl_device_prop *g_device; /* device info table */
+__thread thread_device_resources *t_device; /* device resources table */
int g_cuda_lib_version = 0;
/* number of global stream */
-#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
/* number of thread stream */
-#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
/* sizeof of device memory */
-#define HPPL_GPU_MEMORY_SIZE (256*4)
+#define HPPL_GPU_MEMORY_SIZE (256 * 4)
/**
* Check build-in cuda function using glog and it **does not**
* support << operator for more details error info.
*/
-#define CHECK_CUDA(cudaFunc) \
- do { \
- cudaError_t cudaStat = cudaFunc; \
- CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
- << dynload::cudaGetErrorString(cudaStat); \
+#define CHECK_CUDA(cudaFunc) \
+ do { \
+ cudaError_t cudaStat = cudaFunc; \
+ CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \
+ << dynload::cudaGetErrorString(cudaStat); \
} while (0)
/**
* @brief thread resource.
*/
-__thread _hl_thread_resource t_resource = {
- {0}, /* stream */
- 0, /* handle */
- 0, /* gen */
- 0, /* cudnn_handle */
- 0, /* cudnn_desc */
- NULL, /* gen_mutex */
- NULL, /* gpu_mem */
- NULL, /* cpu_mem */
- 0, /* event */
- -1, /* device */
- 0, /* major */
- false}; /* is_init */
+__thread _hl_thread_resource t_resource = {{0}, /* stream */
+ 0, /* handle */
+ 0, /* gen */
+ 0, /* cudnn_handle */
+ 0, /* cudnn_desc */
+ NULL, /* gen_mutex */
+ NULL, /* gpu_mem */
+ NULL, /* cpu_mem */
+ 0, /* event */
+ -1, /* device */
+ 0, /* major */
+ false}; /* is_init */
__thread cudaStream_t default_stream = 0;
__thread bool g_sync_flag = true;
@@ -198,9 +198,9 @@ inline pid_t gettid() {
uint64_t tid;
pthread_threadid_np(NULL, &tid);
#else
- #ifndef __NR_gettid
- #define __NR_gettid 224
- #endif
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
pid_t tid = syscall(__NR_gettid);
#endif
CHECK_NE((int)tid, -1);
@@ -208,8 +208,7 @@ inline pid_t gettid() {
}
void hl_init(int device) {
- CHECK(hl_start_flag)
- << "[Init failed] hl_start() did not succeed.";
+ CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
/* thread has been initialized */
if (true == t_resource.is_init) {
@@ -220,16 +219,16 @@ void hl_init(int device) {
/* create thread devcie resources */
char *tmp;
thread_device_resources device_res;
- tmp = (char *)malloc(g_system_device_num*sizeof(thread_device_resources*) +
- device_num*sizeof(_thread_device_resources));
+ tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
+ device_num * sizeof(_thread_device_resources));
CHECK_NOTNULL(tmp);
- t_device = (thread_device_resources*)tmp;
- device_res = (thread_device_resources)((char*)tmp +
- g_system_device_num*sizeof(thread_device_resources*));
- memset(t_device, 0, g_system_device_num*sizeof(thread_device_resources*));
+ t_device = (thread_device_resources *)tmp;
+ device_res = (thread_device_resources)(
+ (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
+ memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
- char *tmp_stream = (char *)
- malloc(device_num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+ char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
+ sizeof(cudaStream_t));
CHECK_NOTNULL(tmp_stream);
int num = 0;
@@ -239,8 +238,9 @@ void hl_init(int device) {
}
t_device[dev] = &device_res[num];
- t_device[dev]->stream = (cudaStream_t*)(tmp_stream +
- num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+ t_device[dev]->stream =
+ (cudaStream_t *)(tmp_stream +
+ num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
hl_create_thread_resources(dev, t_device[dev]);
num++;
@@ -266,14 +266,14 @@ void hl_fini() {
t_resource.stream[i] = 0;
}
- char* tmp = (char*)t_device;
- char* tmp_stream = NULL;
+ char *tmp = (char *)t_device;
+ char *tmp_stream = NULL;
for (int dev = 0; dev < g_system_device_num; dev++) {
if (!t_device[dev]) {
continue;
}
if (!tmp_stream) {
- tmp_stream = (char*)t_device[dev]->stream;
+ tmp_stream = (char *)t_device[dev]->stream;
}
for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
@@ -290,9 +290,7 @@ void hl_fini() {
t_resource.is_init = false;
}
-int hl_get_device_count() {
- return device_num;
-}
+int hl_get_device_count() { return device_num; }
void hl_set_device(int device) {
if (device == t_resource.device) {
@@ -300,7 +298,7 @@ void hl_set_device(int device) {
}
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device: " << device << " is not specified in startup.";
+ << "Device: " << device << " is not specified in startup.";
CHECK_CUDA(dynload::cudaSetDevice(device));
@@ -312,11 +310,11 @@ void hl_set_device(int device) {
if (true == t_resource.is_init) {
for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
t_resource.stream[i] =
- t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
+ t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
}
t_resource.gpu_mem = t_device[device]->gpu_mem;
t_resource.cpu_mem = t_device[device]->cpu_mem;
- t_resource.event = t_device[device]->mem_event;
+ t_resource.event = t_device[device]->mem_event;
}
t_resource.handle = g_device[device]->device_resources->handle;
@@ -334,11 +332,11 @@ int hl_get_device() {
return device;
}
-void* hl_malloc_device(size_t size) {
+void *hl_malloc_device(size_t size) {
void *dest_d;
CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
- CHECK_CUDA(dynload::cudaMalloc((void**)&dest_d, size));
+ CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
return dest_d;
}
@@ -348,15 +346,15 @@ void hl_free_mem_device(void *dest_d) {
cudaError_t err = dynload::cudaFree(dest_d);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
- << hl_get_device_error_string();
+ << hl_get_device_error_string();
}
-void* hl_malloc_host(size_t size) {
+void *hl_malloc_host(size_t size) {
void *dest_h;
CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
- CHECK_CUDA(dynload::cudaHostAlloc(
- (void**)&dest_h, size, cudaHostAllocDefault));
+ CHECK_CUDA(
+ dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
return dest_h;
}
@@ -366,7 +364,7 @@ void hl_free_mem_host(void *dest_h) {
cudaError_t err = dynload::cudaFreeHost(dest_h);
CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
- << hl_get_device_error_string();
+ << hl_get_device_error_string();
}
void hl_memcpy(void *dst, void *src, size_t size) {
@@ -388,8 +386,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
}
CHECK_NOTNULL(src_h);
CHECK_NOTNULL(dest_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size,
- cudaMemcpyHostToDevice));
+ CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
}
void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -398,8 +395,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_h);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size,
- cudaMemcpyDeviceToHost));
+ CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
}
void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -408,8 +404,8 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
}
CHECK_NOTNULL(dest_d);
CHECK_NOTNULL(src_d);
- CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_d, size,
- cudaMemcpyDeviceToDevice));
+ CHECK_CUDA(
+ dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
}
void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -423,8 +419,8 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
CHECK_LT(stream, HPPL_STREAM_END);
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault,
- cu_stream));
+ CHECK_CUDA(
+ dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
}
void hl_start() {
@@ -435,8 +431,8 @@ void hl_start() {
bool hl_device_can_access_peer(int device, int peerDevice) {
int canAccessPeer;
- CHECK_CUDA(dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device,
- peerDevice));
+ CHECK_CUDA(
+ dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
if (canAccessPeer == 1) {
return true;
@@ -478,33 +474,32 @@ void hl_create_global_resources(hl_device_prop device_prop) {
/* create curand gen */
CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen,
- CURAND_RNG_PSEUDO_DEFAULT), CURAND_STATUS_SUCCESS)
- << "[Start failed] Curand init failed.";
+ CURAND_RNG_PSEUDO_DEFAULT),
+ CURAND_STATUS_SUCCESS)
+ << "[Start failed] Curand init failed.";
- CHECK_EQ(dynload::curandSetStream(device_res->gen,
- device_res->stream[0]), CURAND_STATUS_SUCCESS)
- << "[Start failed] Curand set stream failed!";
+ CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]),
+ CURAND_STATUS_SUCCESS)
+ << "[Start failed] Curand set stream failed!";
/* create cudnn handle */
hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]);
int seed = gettid();
- CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
- device_res->gen, seed+device), CURAND_STATUS_SUCCESS);
+ CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen,
+ seed + device),
+ CURAND_STATUS_SUCCESS);
- device_res->gen_mutex =
- (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
+ device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
pthread_mutex_init(device_res->gen_mutex, NULL);
CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
}
-int hl_get_cuda_version() {
- return g_cuda_lib_version;
-}
+int hl_get_cuda_version() { return g_cuda_lib_version; }
void hl_create_thread_resources(int device,
- thread_device_resources device_res) {
+ thread_device_resources device_res) {
CHECK_CUDA(dynload::cudaSetDevice(device));
/* create thread stream */
@@ -513,15 +508,15 @@ void hl_create_thread_resources(int device,
}
/* allocation device memory */
- device_res->gpu_mem = (real*)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
+ device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
/* allocation host memory */
- device_res->cpu_mem = (real*)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
+ device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
}
-void hl_specify_devices_start(int* device, int number) {
+void hl_specify_devices_start(int *device, int number) {
if (hl_start_flag) return;
/* 1. get the number of devices */
@@ -533,20 +528,19 @@ void hl_specify_devices_start(int* device, int number) {
/* 2. check device & create device property table */
CHECK_LE(number, g_system_device_num)
- << "[Start failed] System does not have enough device. "
- << "Device number: " << g_system_device_num
- << "Input number: " << number;
+ << "[Start failed] System does not have enough device. "
+ << "Device number: " << g_system_device_num << "Input number: " << number;
char *tmp;
hl_device_prop device_prop;
- tmp = (char *)malloc(g_system_device_num*sizeof(hl_device_prop*) +
- number*sizeof(_hl_device_prop));
+ tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) +
+ number * sizeof(_hl_device_prop));
CHECK(tmp) << "[Start failed] System memory is not enough.";
- g_device = (hl_device_prop*)tmp;
- device_prop = (hl_device_prop)((char*)tmp +
- g_system_device_num*sizeof(hl_device_prop*));
- memset(g_device, 0, g_system_device_num*sizeof(hl_device_prop*));
+ g_device = (hl_device_prop *)tmp;
+ device_prop = (hl_device_prop)(
+ (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
+ memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
int num = 0;
for (int i = 0; i < number; i++) {
int dev;
@@ -557,13 +551,13 @@ void hl_specify_devices_start(int* device, int number) {
}
CHECK_LT(dev, g_system_device_num)
- << "[Start failed] The specified device number is "
- << "out of range. Max device number: " << g_system_device_num - 1
- << " Specified devcie number: "<< dev;
+ << "[Start failed] The specified device number is "
+ << "out of range. Max device number: " << g_system_device_num - 1
+ << " Specified devcie number: " << dev;
if (g_device[dev]) {
/* Warning */
- LOG(WARNING) <<"[Warning] Repeat specify device: " << dev;
+ LOG(WARNING) << "[Warning] Repeat specify device: " << dev;
continue;
}
@@ -574,11 +568,11 @@ void hl_specify_devices_start(int* device, int number) {
device_num = num;
/* 3. create global device resources */
- char *tmp_res = (char *)malloc(device_num*sizeof(_global_device_resources));
+ char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources));
CHECK_NOTNULL(tmp_res);
- char *tmp_stream =
- (char *)malloc(device_num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+ char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM *
+ sizeof(cudaStream_t));
CHECK_NOTNULL(tmp_stream);
num = 0;
@@ -587,10 +581,11 @@ void hl_specify_devices_start(int* device, int number) {
continue;
}
- g_device[i]->device_resources = (global_device_resources)(tmp_res +
- num*sizeof(_global_device_resources));
- g_device[i]->device_resources->stream = (cudaStream_t*)(tmp_stream +
- num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+ g_device[i]->device_resources = (global_device_resources)(
+ tmp_res + num * sizeof(_global_device_resources));
+ g_device[i]->device_resources->stream =
+ (cudaStream_t *)(tmp_stream +
+ num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t));
hl_create_global_resources(g_device[i]);
num++;
@@ -600,9 +595,9 @@ void hl_specify_devices_start(int* device, int number) {
hl_start_flag = true;
/* set default device */
if (device == NULL) {
- hl_set_device(0);
+ hl_set_device(0);
} else {
- hl_set_device(device[0]);
+ hl_set_device(device[0]);
}
}
@@ -610,35 +605,31 @@ void hl_rand(real *dest_d, size_t num) {
pthread_mutex_lock(t_resource.gen_mutex);
CHECK_EQ(
#ifndef PADDLE_TYPE_DOUBLE
- dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
+ dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
#else
- dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
+ dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
#endif
- CURAND_STATUS_SUCCESS);
+ CURAND_STATUS_SUCCESS);
pthread_mutex_unlock(t_resource.gen_mutex);
CHECK_SYNC("hl_rand failed");
}
void hl_srand(unsigned int seed) {
pthread_mutex_lock(t_resource.gen_mutex);
- CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
- t_resource.gen, seed), CURAND_STATUS_SUCCESS);
+ CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed),
+ CURAND_STATUS_SUCCESS);
pthread_mutex_unlock(t_resource.gen_mutex);
}
-void hl_set_sync_flag(bool flag) {
- g_sync_flag = flag;
-}
+void hl_set_sync_flag(bool flag) { g_sync_flag = flag; }
-bool hl_get_sync_flag() {
- return g_sync_flag;
-}
+bool hl_get_sync_flag() { return g_sync_flag; }
void hl_stream_synchronize(hl_stream_t stream) {
cudaStream_t cu_stream;
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
@@ -647,8 +638,8 @@ void hl_stream_synchronize(hl_stream_t stream) {
void hl_create_event(hl_event_t *event) {
CHECK_NOTNULL(event);
- struct _hl_event_st* st_event =
- (struct _hl_event_st*)malloc(sizeof(struct _hl_event_st));
+ struct _hl_event_st *st_event =
+ (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
@@ -660,8 +651,8 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
CHECK_NOTNULL(start);
CHECK_NOTNULL(end);
- CHECK_CUDA(dynload::cudaEventElapsedTime(&time,
- start->cu_event, end->cu_event));
+ CHECK_CUDA(
+ dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
return time;
}
@@ -669,24 +660,22 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
cudaStream_t cu_stream;
CHECK_NOTNULL(event);
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaEventRecord(
- event->cu_event, cu_stream));
+ CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
}
void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
cudaStream_t cu_stream;
CHECK_NOTNULL(event);
- CHECK_LT(stream, HPPL_STREAM_END)
- << __func__ <<": the parameter stream is error.";
+ CHECK_LT(stream, HPPL_STREAM_END) << __func__
+ << ": the parameter stream is error.";
cu_stream = t_resource.stream[stream];
- CHECK_CUDA(dynload::cudaStreamWaitEvent(
- cu_stream, event->cu_event, 0));
+ CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
}
void hl_destroy_event(hl_event_t event) {
@@ -705,15 +694,15 @@ void hl_event_synchronize(hl_event_t event) {
void hl_get_device_name(char *name, int len, int device) {
CHECK_NOTNULL(name);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device <<") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
- strncpy(name, g_device[device]->device_name , len);
+ strncpy(name, g_device[device]->device_name, len);
}
void hl_get_device_memory(size_t *mem_size, int device) {
CHECK_NOTNULL(mem_size);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device <<") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
*mem_size = g_device[device]->device_mem;
}
@@ -722,31 +711,26 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
CHECK_NOTNULL(major);
CHECK_NOTNULL(minor);
CHECK(device >= 0 && device < g_system_device_num && g_device[device])
- << "Device("<< device << ") is not specified in startup.";
+ << "Device(" << device << ") is not specified in startup.";
*major = g_device[device]->major;
*minor = g_device[device]->minor;
}
-int hl_get_device_last_error() {
- return (int)dynload::cudaGetLastError();
-}
+int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
-const char* hl_get_device_error_string() {
+const char *hl_get_device_error_string() {
cudaError_t err = dynload::cudaGetLastError();
return dynload::cudaGetErrorString(err);
}
-const char* hl_get_device_error_string(size_t err) {
+const char *hl_get_device_error_string(size_t err) {
return dynload::cudaGetErrorString((cudaError_t)err);
}
-void hl_device_synchronize() {
- CHECK_CUDA(dynload::cudaDeviceSynchronize());
-}
+void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
void hl_set_device_flags_block() {
- CHECK_CUDA(dynload::cudaSetDeviceFlags(
- cudaDeviceScheduleBlockingSync));
+ CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
}
bool hl_cuda_event_is_ready(hl_event_t event) {
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
index fe755b8c2606dffeeff2ea1549180ca8b134c251..ff6b830b7addc5c87af0d55070260c279a046a75 100644
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifdef PADDLE_USE_DSO
#include
@@ -29,26 +28,26 @@ limitations under the License. */
namespace dynload {
extern std::once_flag cudart_dso_flag;
-extern void* cudart_dso_handle;
+extern void *cudart_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cuda routine
* via operator overloading.
**/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \
- struct DynLoad__##__name { \
- template <typename... Args> \
- __type operator()(Args... args) { \
- typedef __type (*cudartFunc)(Args...); \
- std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
- &cudart_dso_handle); \
- void* p_##__name = dlsym(cudart_dso_handle, #__name); \
- return reinterpret_cast<cudartFunc>(p_##__name)(args...); \
- } \
- } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \
+ struct DynLoad__##__name { \
+ template <typename... Args> \
+ __type operator()(Args... args) { \
+ typedef __type (*cudartFunc)(Args...); \
+ std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+ void *p_##__name = dlsym(cudart_dso_handle, #__name); \
+ return reinterpret_cast<cudartFunc>(p_##__name)(args...); \
+ } \
+ } __name; /* struct DynLoad__##__name */
/* include all needed cuda functions in HPPL */
+// clang-format off
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaLaunch, cudaError_t) \
__macro(cudaSetupArgument, cudaError_t) \
@@ -61,16 +60,17 @@ extern void* cudart_dso_handle;
__macro(__cudaInitModule, char) \
__macro(__cudaRegisterTexture, void) \
__macro(__cudaRegisterSurface, void)
+// clang-format on
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
#if CUDART_VERSION >= 7000
- DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
+DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
#endif
#undef CUDA_ROUNTINE_EACH
-} /* namespace dynload */
+} /* namespace dynload */
#if CUDART_VERSION >= 7000
__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
@@ -79,12 +79,11 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
void **args,
size_t sharedMem,
cudaStream_t stream) {
- return dynload::cudaLaunchKernel(func, gridDim, blockDim,
- args, sharedMem, stream);
+ return dynload::cudaLaunchKernel(
+ func, gridDim, blockDim, args, sharedMem, stream);
}
#endif /* CUDART_VERSION >= 7000 */
-
__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
return dynload::cudaLaunch(func);
}
@@ -99,13 +98,12 @@ __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
dim3 blockDim,
size_t sharedMem,
cudaStream_t stream) {
- return dynload::cudaConfigureCall(gridDim, blockDim,
- sharedMem, stream);
+ return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
}
extern "C" {
-void** CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
+void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
return dynload::__cudaRegisterFatBinary(fatCubin);
}
@@ -113,86 +111,87 @@ void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
}
-void CUDARTAPI __cudaRegisterFunction(
- void **fatCubinHandle,
- const char *hostFun,
- char *deviceFun,
- const char *deviceName,
- int thread_limit,
- uint3 *tid,
- uint3 *bid,
- dim3 *bDim,
- dim3 *gDim,
- int *wSize
-) {
- return dynload::__cudaRegisterFunction(
- fatCubinHandle, hostFun, deviceFun, deviceName,
- thread_limit, tid, bid, bDim, gDim, wSize);
+void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
+ const char *hostFun,
+ char *deviceFun,
+ const char *deviceName,
+ int thread_limit,
+ uint3 *tid,
+ uint3 *bid,
+ dim3 *bDim,
+ dim3 *gDim,
+ int *wSize) {
+ return dynload::__cudaRegisterFunction(fatCubinHandle,
+ hostFun,
+ deviceFun,
+ deviceName,
+ thread_limit,
+ tid,
+ bid,
+ bDim,
+ gDim,
+ wSize);
}
-void CUDARTAPI __cudaRegisterVar(
- void **fatCubinHandle,
- char *hostVar,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global
-) {
- return dynload::__cudaRegisterVar(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, ext, size, constant, global);
+void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
+ char *hostVar,
+ char *deviceAddress,
+ const char *deviceName,
+ int ext,
+ int size,
+ int constant,
+ int global) {
+ return dynload::__cudaRegisterVar(fatCubinHandle,
+ hostVar,
+ deviceAddress,
+ deviceName,
+ ext,
+ size,
+ constant,
+ global);
}
-
-
-extern void CUDARTAPI __cudaRegisterManagedVar(
- void **fatCubinHandle,
- void **hostVarPtrAddress,
- char *deviceAddress,
- const char *deviceName,
- int ext,
- int size,
- int constant,
- int global
-) {
- return dynload::__cudaRegisterManagedVar(
- fatCubinHandle, hostVarPtrAddress, deviceAddress,
- deviceName, ext, size, constant, global);
+extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
+ void **hostVarPtrAddress,
+ char *deviceAddress,
+ const char *deviceName,
+ int ext,
+ int size,
+ int constant,
+ int global) {
+ return dynload::__cudaRegisterManagedVar(fatCubinHandle,
+ hostVarPtrAddress,
+ deviceAddress,
+ deviceName,
+ ext,
+ size,
+ constant,
+ global);
}
-char CUDARTAPI __cudaInitModule(
- void **fatCubinHandle
-) {
+char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
return dynload::__cudaInitModule(fatCubinHandle);
}
-void CUDARTAPI __cudaRegisterTexture(
- void **fatCubinHandle,
- const struct textureReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int norm,
- int ext
-) {
+void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
+ const struct textureReference *hostVar,
+ const void **deviceAddress,
+ const char *deviceName,
+ int dim,
+ int norm,
+ int ext) {
return dynload::__cudaRegisterTexture(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, dim, norm, ext);
+ fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
}
-void CUDARTAPI __cudaRegisterSurface(
- void **fatCubinHandle,
- const struct surfaceReference *hostVar,
- const void **deviceAddress,
- const char *deviceName,
- int dim,
- int ext
-) {
+void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
+ const struct surfaceReference *hostVar,
+ const void **deviceAddress,
+ const char *deviceName,
+ int dim,
+ int ext) {
return dynload::__cudaRegisterSurface(
- fatCubinHandle, hostVar, deviceAddress,
- deviceName, dim, ext);
+ fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
}
} /* extern "C" */
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index 5cb16cfbb372209a6cac83cdaace9afbf590e0fe..1a3ce08619fc3a5787576b30e9f4c13336990e74 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "hl_dso_loader.h"
-#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"
-P_DEFINE_string(cudnn_dir, "",
+P_DEFINE_string(cudnn_dir,
+ "",
"Specify path for loading libcudnn.so. For instance, "
- "/usr/local/cudnn/lib64. If empty [default], dlopen "
+ "/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH");
-P_DEFINE_string(cuda_dir, "",
+P_DEFINE_string(cuda_dir,
+ "",
"Specify path for loading cuda library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. (Note: "
"libcudart can not be specified by cuda_dir, since some "
@@ -33,7 +34,6 @@ static inline std::string join(const std::string& part1,
const std::string& part2) {
// directory separator
const char sep = '/';
-
if (!part2.empty() && part2.front() == sep) {
return part2;
}
@@ -47,100 +47,115 @@ static inline std::string join(const std::string& part1,
return ret;
}
-static inline void GetDsoHandleFromDefaultPath(
- std::string& dso_path, void** dso_handle, int dynload_flags) {
- VLOG(3) << "Try to find cuda library: " << dso_path
- << " from default system path.";
- // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
+ void** dso_handle,
+ int dynload_flags) {
+ VLOG(3) << "Try to find cuda library: " << dso_path
+ << " from default system path.";
+ // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+ *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
+// bring System Integrity Projection (SIP), if dso_handle
+// is null, search from default package path in Mac OS.
+#if defined(__APPLE__) || defined(__OSX__)
+ if (nullptr == *dso_handle) {
+ dso_path = join("/usr/local/cuda/lib/", dso_path);
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-
- // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
- // bring System Integrity Projection (SIP), if dso_handle
- // is null, search from default package path in Mac OS.
- #if defined(__APPLE__) || defined(__OSX__)
if (nullptr == *dso_handle) {
- dso_path = join("/usr/local/cuda/lib/", dso_path);
- *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
- if (nullptr == *dso_handle) {
- if (dso_path == "libcudnn.dylib") {
- LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
- << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT
- << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h " // NOLINT
- << "/usr/local/cuda/lib/libcudnn*";
- }
- }
+ if (dso_path == "libcudnn.dylib") {
+ LOG(FATAL)
+ << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
+ << "For instance, sudo tar -xzf "
+ "cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT
+ << "/usr/local \n sudo chmod a+r "
+ "/usr/local/cuda/include/cudnn.h " // NOLINT
+ << "/usr/local/cuda/lib/libcudnn*";
+ }
}
- #endif
+ }
+#endif
}
-static inline void GetDsoHandleFromSearchPath(
- const std::string& search_root,
- const std::string& dso_name,
- void** dso_handle) {
- int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
- *dso_handle = nullptr;
-
- std::string dlPath = dso_name;
- if (search_root.empty()) {
- GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
- } else {
- // search xxx.so from custom path
- dlPath = join(search_root, dso_name);
- *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
- // if not found, search from default path
- if (nullptr == dso_handle) {
- LOG(WARNING) << "Failed to find cuda library: " << dlPath;
- dlPath = dso_name;
- GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
- }
+static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
+ const std::string& dso_name,
+ void** dso_handle) {
+ int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+ *dso_handle = nullptr;
+
+ std::string dlPath = dso_name;
+ if (search_root.empty()) {
+ GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+ } else {
+ // search xxx.so from custom path
+ dlPath = join(search_root, dso_name);
+ *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+ // if not found, search from default path
+ if (nullptr == *dso_handle) {
+ LOG(WARNING) << "Failed to find cuda library: " << dlPath;
+ dlPath = dso_name;
+ GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
}
+ }
- CHECK(nullptr != *dso_handle)
- << "Failed to find cuda library: " << dlPath << std::endl
- << "Please specify its path correctly using one of the following ways: \n" // NOLINT
-
- << "Method 1. set cuda and cudnn lib path at runtime. "
- << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n" // NOLINT
- << "For instance, issue command: paddle train --use_gpu=1 "
- << "--cuda_dir=/usr/local/cuda/lib64 --cudnn_dir=/usr/local/cudnn/lib ...\n" // NOLINT
-
- << "Method 2. set environment variable LD_LIBRARY_PATH on Linux or "
- << "DYLD_LIBRARY_PATH on Mac OS. \n"
- << "For instance, issue command: export LD_LIBRARY_PATH=... \n"
-
- << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
- << "unless System Integrity Protection (SIP) is disabled. However, method 1 " // NOLINT
- << "always work well.";
+ CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath
+ << std::endl
+ << "Please specify its path correctly using "
+ "one of the following ways: \n" // NOLINT
+
+ << "Method 1. set cuda and cudnn lib path at "
+ "runtime. "
+ << "http://www.paddlepaddle.org/doc/ui/"
+ "cmd_argument/"
+ "argument_outline.html \n" // NOLINT
+ << "For instance, issue command: paddle train "
+ "--use_gpu=1 "
+ << "--cuda_dir=/usr/local/cuda/lib64 "
+ "--cudnn_dir=/usr/local/cudnn/lib "
+ "...\n" // NOLINT
+
+ << "Method 2. set environment variable "
+ "LD_LIBRARY_PATH on Linux or "
+ << "DYLD_LIBRARY_PATH on Mac OS. \n"
+ << "For instance, issue command: export "
+ "LD_LIBRARY_PATH=... \n"
+
+ << "Note: After Mac OS 10.11, using the "
+ "DYLD_LIBRARY_PATH is impossible "
+ << "unless System Integrity Protection (SIP) "
+ "is disabled. However, "
+ "method 1 " // NOLINT
+ << "always work well.";
}
void GetCublasDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
#endif
}
void GetCudnnDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
#endif
}
void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
+ GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
+ GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
#endif
}
void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
#else
- GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+ GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
#endif
}
diff --git a/paddle/cuda/src/hl_math.cc b/paddle/cuda/src/hl_math.cc
index 76d48c4a9b94d402cf84c57bd240e03a1a83b1a0..f4bf888bab4e92dd940714ef1b7aeee9242eb817 100644
--- a/paddle/cuda/src/hl_math.cc
+++ b/paddle/cuda/src/hl_math.cc
@@ -12,24 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "avx_mathfun.h"
namespace hppl {
-__m256 exp(__m256 a) {
- return exp256_ps(a);
-}
+__m256 exp(__m256 a) { return exp256_ps(a); }
-__m256 log(__m256 a) {
- return log256_ps(a);
-}
+__m256 log(__m256 a) { return log256_ps(a); }
-__m256 sin(__m256 a) {
- return sin256_ps(a);
-}
+__m256 sin(__m256 a) { return sin256_ps(a); }
-__m256 cos(__m256 a) {
- return cos256_ps(a);
-}
+__m256 cos(__m256 a) { return cos256_ps(a); }
} // namespace hppl
diff --git a/paddle/cuda/src/hl_time.cc b/paddle/cuda/src/hl_time.cc
index adc88d60dd8d547cedcae5fd088b2fa581d8e5be..d52b2a1df07374f632def12eb52e10e10ca86028 100644
--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include
@@ -21,8 +20,7 @@ limitations under the License. */
using std::chrono::high_resolution_clock;
int64_t getCurrentTimeStick() {
- high_resolution_clock::time_point tp = high_resolution_clock::now();
- high_resolution_clock::duration dtn = tp.time_since_epoch();
- return dtn.count();
+ high_resolution_clock::time_point tp = high_resolution_clock::now();
+ high_resolution_clock::duration dtn = tp.time_since_epoch();
+ return dtn.count();
}
-
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 27eed75d4d76c351e381a3b71dc44a3254fb1a4d..f1bb94216c44b3e915f87a3ae49bdfd3ef812916 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -51,12 +51,14 @@ static ClassRegistrar gActivationRegistrar;
* @brief Macro for registering a derived activation class
*/
#define END_DEFINE_ACTIVATION(ACTIVATION_NAME) \
- }; \
+ } \
+ ; \
const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
#ACTIVATION_NAME; \
static InitFunction __reg_activation__##ACTIVATION_NAME([] { \
- gActivationRegistrar.registerClass< \
- ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(#ACTIVATION_NAME); \
+ gActivationRegistrar \
+        .registerClass<ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>( \
+ #ACTIVATION_NAME); \
});
/**
@@ -111,14 +113,22 @@ void backward(Argument& act) {
outputG->softmaxBackward(*outputV);
} else {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(sftMaxDot_, outputG->getHeight(),
+ Matrix::resizeOrCreate(sftMaxDot_,
+ outputG->getHeight(),
outputG->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
- Matrix::resizeOrCreate(sftMaxSum_, outputG->getHeight(), 1,
- /* trans */ false, useGpu(act.deviceId));
+ /* trans */ false,
+ useGpu(act.deviceId));
+ Matrix::resizeOrCreate(sftMaxSum_,
+ outputG->getHeight(),
+ 1,
+ /* trans */ false,
+ useGpu(act.deviceId));
if (!one_ || one_->getWidth() != outputG->getWidth()) {
- Matrix::resizeOrCreate(one_, 1, outputG->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(one_,
+ 1,
+ outputG->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
one_->one();
}
@@ -130,7 +140,6 @@ void backward(Argument& act) {
}
END_DEFINE_ACTIVATION(softmax)
-
/**
* @brief Sequence_softmax Activation
* @note Softmax on all frames of one sequence.
@@ -146,10 +155,16 @@ void forward(Argument& act) {
CHECK_EQ(act.value->getWidth(), 1UL);
if (!argument_.value) {
- argument_.value = Matrix::create(nullptr, /* height= */ 1, 1,
- /* trans= */ false, useGpu(act.deviceId));
- argument_.grad = Matrix::create(nullptr, /* height= */ 1, 1,
- /* trans= */ false, useGpu(act.deviceId));
+ argument_.value = Matrix::create(nullptr,
+ /* height= */ 1,
+ 1,
+ /* trans= */ false,
+ useGpu(act.deviceId));
+ argument_.grad = Matrix::create(nullptr,
+ /* height= */ 1,
+ 1,
+ /* trans= */ false,
+ useGpu(act.deviceId));
}
auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
@@ -267,8 +282,11 @@ END_DEFINE_ACTIVATION(softrelu)
BEGIN_DEFINE_ACTIVATION(abs)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->abs(*act.value);
@@ -286,8 +304,11 @@ END_DEFINE_ACTIVATION(abs)
BEGIN_DEFINE_ACTIVATION(square)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->square(*act.value);
@@ -317,8 +338,11 @@ END_DEFINE_ACTIVATION(exponential)
BEGIN_DEFINE_ACTIVATION(log)
void forward(Argument& act) {
SetDevice device(act.deviceId);
- Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
- /* trans */ false, useGpu(act.deviceId));
+ Matrix::resizeOrCreate(act.in,
+ act.value->getHeight(),
+ act.value->getWidth(),
+ /* trans */ false,
+ useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->log(*act.value);
@@ -333,11 +357,9 @@ ActivationFunction* ActivationFunction::create(const std::string& type) {
std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
  std::vector<std::string> types;
- gActivationRegistrar.forEachType([&](const std::string& type) {
- types.push_back(type);
- });
+ gActivationRegistrar.forEachType(
+ [&](const std::string& type) { types.push_back(type); });
return types;
}
-
} // namespace paddle
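
The ActivationFunction hunks above are pure reformatting, but they pass through the registration machinery: END_DEFINE_ACTIVATION emits a static InitFunction whose lambda registers the derived class under its string name, and ActivationFunction::create() later resolves that name back to a constructor. A minimal, self-contained sketch of that name-to-factory pattern follows; the Registrar and Activation types here are simplified stand-ins, not Paddle's actual ClassRegistrar API.

    #include <functional>
    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>

    struct Activation {
      virtual ~Activation() = default;
      virtual const std::string& name() const = 0;
    };

    // Maps a type name to a constructor callback, playing the role of
    // gActivationRegistrar in the diff above.
    class Registrar {
    public:
      template <class T>
      void registerClass(const std::string& type) {
        creators_[type] = [] { return std::unique_ptr<Activation>(new T()); };
      }
      std::unique_ptr<Activation> create(const std::string& type) const {
        auto it = creators_.find(type);
        if (it == creators_.end()) return nullptr;
        return it->second();
      }

    private:
      std::map<std::string, std::function<std::unique_ptr<Activation>()>> creators_;
    };

    static Registrar gRegistrar;

    struct SoftmaxActivation : Activation {
      static const std::string kName;
      const std::string& name() const override { return kName; }
    };
    const std::string SoftmaxActivation::kName = "softmax";

    // Plays the role of the static InitFunction emitted by the macro: a static
    // initializer registers the class under its name before any lookup happens.
    static bool reg_softmax =
        (gRegistrar.registerClass<SoftmaxActivation>("softmax"), true);

    int main() {
      auto act = gRegistrar.create("softmax");
      std::cout << (act ? act->name() : "not found") << "\n";
    }
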
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index c483372256c035e39bfdbcaa4193a1a2e7fd80b8..e9ed5c619ab5e4dd9c52c0dac24478c2a57aa1bf 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
#include
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 2cfb5a3a18c8a63d69bf0598eeee2807376340bc..e6cc4a246a8494d287f8638674f4ae213f38f657 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "DataProvider.h"
#include "paddle/utils/Util.h"
@@ -57,7 +56,7 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
}
}
-DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
+DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
bool useGpu,
int64_t batchSize) {
batchSize_ = batchSize;
@@ -155,7 +154,7 @@ void DoubleBuffer::startAsyncLoad() {
}
 ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
-DataProvider::registrar_;
+ DataProvider::registrar_;
DataProvider* DataProvider::create(const DataConfig& config,
const ModelConfig& modelConfig,
@@ -182,7 +181,8 @@ int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
for (int i = 0; i < config_.constant_slots_size(); ++i) {
MemoryHandlePtr handle =
constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
- Matrix::resizeOrCreate(constantSlots[i], batchSize,
+ Matrix::resizeOrCreate(constantSlots[i],
+ batchSize,
1, // = width
false, // = trans
useGpu_); // = useGpu
@@ -216,7 +216,8 @@ void DataProvider::initAsyncLoader() {
}
SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
- bool useGpu, bool withInfo)
+ bool useGpu,
+ bool withInfo)
: DataProvider(config, useGpu) {
/* initialize the size of a sample, and the buffer */
sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
@@ -337,7 +338,8 @@ int64_t SimpleDataProviderBase::fillBuffer() {
sampleNumInBuf_ =
n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
hInputLabelBuf_->getData() + n,
- hInputInfoBuf_->getData() + n, bufferCapacity_ - n);
+ hInputInfoBuf_->getData() + n,
+ bufferCapacity_ - n);
/* for stachastic gradient training */
if (!skipShuffle_) {
@@ -357,11 +359,14 @@ SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
SimpleDataProvider::~SimpleDataProvider() {}
-int64_t SimpleDataProvider::fillBufferImp(real* data, int* label, int* info,
+int64_t SimpleDataProvider::fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size) {
(void)info;
int64_t n = std::min(labels_.size() - currentSampleIndex_, size);
- memcpy(data, &data_[currentSampleIndex_ * sampleDim_],
+ memcpy(data,
+ &data_[currentSampleIndex_ * sampleDim_],
n * sampleDim_ * sizeof(real));
memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
currentSampleIndex_ += n;
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 112e45de1cb232097ed63b120d5ac631b37952e9..8b7fb27f821a47d830413eced79b3352a6969c90 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -44,15 +43,15 @@ namespace paddle {
* @brief Macro for registering a data provider. The class type should contain
* a consturctor with parameter (DataConfig, bool).
*/
-#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\
- static InitFunction __reg_type_##__type_name([]() {\
- DataProvider::registrar_.registerClass(\
- #__type_name, \
- [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
- DataProvider* dp = new __class_name (conf, useGpu);\
- return dp;\
- });\
-})
+#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \
+ static InitFunction __reg_type_##__type_name([]() { \
+ DataProvider::registrar_.registerClass( \
+ #__type_name, \
+ [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
+ DataProvider* dp = new __class_name(conf, useGpu); \
+ return dp; \
+ }); \
+ })
/**
* @def REGISTER_DATA_PROVIDER_EX
@@ -61,8 +60,8 @@ namespace paddle {
*/
#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \
static InitFunction __reg_type_##__type_name([] { \
- DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
-})
+ DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
+ })
class DataBatch;
class BufferBatch;
@@ -181,7 +180,8 @@ public:
* @param[in] size DataBatch.getSize()
* @param[in] dataId sub dataprovider id (in MultiDataProvider)
*/
-  void appendArguments(const std::vector<Argument>& argus, int size,
+  void appendArguments(const std::vector<Argument>& argus,
+ int size,
int dataId) {
size_ += size;
for (const auto& argu : argus) {
@@ -259,9 +259,7 @@ typedef Queue<BufferBatch*> BufferBatchQueue;
class DoubleBuffer {
public:
- DoubleBuffer(DataProvider* dataPool,
- bool useGpu,
- int64_t batchSize = 0);
+ DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
virtual ~DoubleBuffer();
void removeOneBatch(DataBatch* dataBatch);
@@ -310,7 +308,7 @@ public:
/**
* @brief create only used for unittest.
*/
- inline static DataProvider* create(const DataConfig &config,
+ inline static DataProvider* create(const DataConfig& config,
bool useGpu = FLAGS_use_gpu) {
return create(config, ModelConfig(), useGpu);
}
@@ -462,7 +460,9 @@ protected:
*
* label[n] is the label for the n-th sample.
*/
- virtual int64_t fillBufferImp(real* data, int* label, int* info,
+ virtual int64_t fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size) = 0;
};
@@ -475,7 +475,9 @@ public:
protected:
void loadData(const std::string& fileName);
void loadDataFile(const std::string& fileName);
- virtual int64_t fillBufferImp(real* data, int* label, int* info,
+ virtual int64_t fillBufferImp(real* data,
+ int* label,
+ int* info,
int64_t size);
protected:
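
The DataProvider.h changes above are likewise formatting-only; the DoubleBuffer they touch exists so that an async loading thread can prepare the next DataBatch while the trainer consumes the current one. Below is a generic, self-contained double-buffering sketch in the same spirit; the Batch type, queue depth, and class names are invented for illustration and are not Paddle's DoubleBuffer/BufferBatchQueue implementation.

    #include <condition_variable>
    #include <deque>
    #include <iostream>
    #include <mutex>
    #include <thread>
    #include <vector>

    using Batch = std::vector<int>;

    // Bounded queue: the producer blocks when the queue is full, the consumer
    // blocks when it is empty, so at most kDepth batches are in flight.
    class BatchQueue {
    public:
      void push(Batch b) {
        std::unique_lock<std::mutex> lock(mtx_);
        notFull_.wait(lock, [this] { return queue_.size() < kDepth; });
        queue_.push_back(std::move(b));
        notEmpty_.notify_one();
      }
      Batch pop() {
        std::unique_lock<std::mutex> lock(mtx_);
        notEmpty_.wait(lock, [this] { return !queue_.empty(); });
        Batch b = std::move(queue_.front());
        queue_.pop_front();
        notFull_.notify_one();
        return b;
      }

    private:
      static constexpr size_t kDepth = 2;  // "double" buffering
      std::deque<Batch> queue_;
      std::mutex mtx_;
      std::condition_variable notEmpty_, notFull_;
    };

    int main() {
      BatchQueue queue;
      // Producer: plays the role of the async loading thread.
      std::thread loader([&] {
        for (int i = 0; i < 5; ++i) queue.push(Batch(4, i));
      });
      // Consumer: plays the role of getNextBatch() in the training loop.
      for (int i = 0; i < 5; ++i) {
        Batch b = queue.pop();
        std::cout << "got batch of " << b.size() << " samples, value " << b[0] << "\n";
      }
      loader.join();
    }
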
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index 0689f90f3e7dd3d3e1df19f3958c821d53e69700..6c178e29ee714a6bd7f58861d7cf15716fee848d 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "DataProvider.h"
@@ -65,8 +64,8 @@ void DataProviderGroup::reset() {
provider_ = nullptr;
// shuffle file list
- std::shuffle(fileList_.begin(), fileList_.end(),
- ThreadLocalRandomEngine::get());
+ std::shuffle(
+ fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
startLoader();
DataProvider::reset();
@@ -113,8 +112,9 @@ void DataProviderGroup::startLoader() {
size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
     std::vector<std::string> fileVec(fileList_.begin() + startPos,
fileList_.begin() + endPos);
- loader_->addJob([this, fileVec]()
- -> ProviderPtrType { return this->loadFile(fileVec); });
+ loader_->addJob([this, fileVec]() -> ProviderPtrType {
+ return this->loadFile(fileVec);
+ });
}
loader_->stopAddJob();
}
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp
index 8e4f53978a0451f3bb6cd5da30f017708448f9ac..51fb1f26668c55dc1c2aecd5389f327e2569a52f 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "paddle/utils/Util.h"
#include "MultiDataProvider.h"
#include "paddle/utils/Logging.h"
@@ -59,10 +58,8 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config,
"MultiDataProvider";
subConfig.set_async_load_data(false);
}
- subDataProviders_[i] =
-      std::unique_ptr<DataProvider>(DataProvider::create(subConfig,
- modelConfig,
- useGpu_));
+    subDataProviders_[i] = std::unique_ptr<DataProvider>(
+ DataProvider::create(subConfig, modelConfig, useGpu_));
}
}
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
index b498ba6516c4320566b1b3cc2bd557ae016d7c39..876467c04f074cf37e48fdfa9b24f236fcfe8ba1 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ b/paddle/gserver/dataproviders/MultiDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "DataProvider.h"
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index 344644755f24045443b8cb3ebd08004a4b1cdcb5..0a7ff802461f2ded0e6e842c088bddf218361f79 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "ProtoDataProvider.h"
#include "paddle/utils/Util.h"
#include "paddle/utils/StringUtil.h"
@@ -23,7 +22,8 @@ limitations under the License. */
#include "paddle/utils/Logging.h"
#include "DataProviderGroup.h"
-P_DEFINE_double(memory_threshold_on_load_data, 1.0,
+P_DEFINE_double(memory_threshold_on_load_data,
+ 1.0,
"stop loading data when memory is not sufficient");
namespace paddle {
@@ -32,7 +32,8 @@ REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
 REGISTER_DATA_PROVIDER(proto_sequence_group,
                        DataProviderGroup<ProtoSequenceDataProvider>);
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config, bool useGpu,
+ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
if (loadDataAll) {
@@ -279,7 +280,8 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
}
slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
const unsigned int* ids = sample.vector_slots(i).ids().data();
- memcpy(slot.sparseNonValueData.data() + slot.indices.back(), ids,
+ memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
+ ids,
sizeof(*ids) * slotSize);
slot.indices.push_back(slot.indices.back() + slotSize);
if (subSlotSize) {
@@ -318,10 +320,11 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
slot.varDenseData[oldSize].data.resize(varDim);
const float* values = sample.vector_slots(i).values().data();
#ifdef PADDLE_TYPE_DOUBLE
- std::copy(values, values + varDim,
- slot.varDenseData[oldSize].data.data());
+ std::copy(
+ values, values + varDim, slot.varDenseData[oldSize].data.data());
#else
- memcpy(slot.varDenseData[oldSize].data.data(), values,
+ memcpy(slot.varDenseData[oldSize].data.data(),
+ values,
sizeof(real) * varDim);
#endif
slot.varDenseData[oldSize].dims.resize(
@@ -374,8 +377,9 @@ void ProtoDataProvider::reset() {
}
void ProtoDataProvider::shuffle() {
- std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
- ThreadLocalRandomEngine::get());
+ std::shuffle(shuffledSequenceIds_.begin(),
+ shuffledSequenceIds_.end(),
+ ThreadLocalRandomEngine::get());
}
/*
@@ -502,7 +506,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
if (!iidData()) {
ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
- numSequences + 1, /* useGpu= */ false);
+ numSequences + 1,
+ /* useGpu= */ false);
int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
int pos = 0;
int i = 0;
@@ -530,7 +535,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
switch (slotType) {
case SlotDef::VECTOR_DENSE: {
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -543,19 +550,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::VECTOR_SPARSE_NON_VALUE: {
if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value = Matrix::createSparseMatrix(
- size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE, SPARSE_CSR,
- false, useGpu_);
+ cpuArguments[slot].value =
+ Matrix::createSparseMatrix(size,
+ dim,
+ size /*DEFAULT_AVG_WIDTH = 1*/,
+ NO_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
- slots_[slot].sparseNonValueData.data(), HPPL_STREAM_1);
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
+ slots_[slot].sparseNonValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
slots_[slot].sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -571,19 +586,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::VECTOR_SPARSE_VALUE: {
if (!(cpuArguments[slot].value)) {
- cpuArguments[slot].value = Matrix::createSparseMatrix(
- size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, FLOAT_VALUE,
- SPARSE_CSR, false, useGpu_);
+ cpuArguments[slot].value =
+ Matrix::createSparseMatrix(size,
+ dim,
+ size /*DEFAULT_AVG_WIDTH = 1*/,
+ FLOAT_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slot].value;
mat->resize(size, dim);
if (std::dynamic_pointer_cast(mat)) {
- std::dynamic_pointer_cast(mat)->copyFrom(
- dataPos.data(), slots_[slot].indices.data(),
- slots_[slot].sparseFloatValueData.data(), HPPL_STREAM_1);
+ std::dynamic_pointer_cast(mat)
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
+ slots_[slot].sparseFloatValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+ ->copyFrom(dataPos.data(),
+ slots_[slot].indices.data(),
slots_[slot].sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -591,7 +614,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
break;
}
case SlotDef::INDEX: {
- IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ size,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
for (int i = 0; i < size; ++i) {
@@ -621,7 +645,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
if (oldWidth < height) {
totalDim = width * height * depth;
}
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, totalDim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ totalDim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -637,13 +663,13 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
}
}
} else {
- memcpy(buf, slots_[slot].varDenseData[dataPos[0]].data.data(),
+ memcpy(buf,
+ slots_[slot].varDenseData[dataPos[0]].data.data(),
sizeof(real) * totalDim);
}
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1, /* size == 1 currently */
+ /* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
@@ -653,16 +679,17 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
case SlotDef::VAR_MDIM_INDEX: {
CHECK_EQ(size, 1);
size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
- IVector::resizeOrCreate(cpuArguments[slot].ids, totalDim,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ totalDim,
/* useGpu= */ false);
int* buf = cpuArguments[slot].ids->getData();
- memcpy(buf, slots_[slot].varIndices[dataPos[0]].data(),
+ memcpy(buf,
+ slots_[slot].varIndices[dataPos[0]].data(),
sizeof(int) * totalDim);
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1, /* size == 1 currently */
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1, /* size == 1 currently */
+ /* useGpu= */ false);
int* bufStarts =
cpuArguments[slot].sequenceStartPositions->getMutableData(false);
bufStarts[0] = 0;
@@ -700,8 +727,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
gpuArguments[i].sequenceStartPositions =
cpuArguments[i].sequenceStartPositions;
} else {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
@@ -746,10 +773,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
sampleLoop(op, size);
// current slot: sequenceStartPositions
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].sequenceStartPositions,
- size + 1,
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+ size + 1,
+ /* useGpu= */ false);
switch (slotType) {
case SlotDef::VECTOR_SPARSE_VALUE:
@@ -821,10 +847,10 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
};
int subSize = subSampleLoop(op, size, slot);
ICpuGpuVector::resizeOrCreate(
- cpuArguments[slot].subSequenceStartPositions, subSize + 1,
- false);
+ cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
int* currPosOfArgumentSubSeqStart =
- cpuArguments[slot].subSequenceStartPositions->getMutableData(false);
+ cpuArguments[slot].subSequenceStartPositions->getMutableData(
+ false);
int64_t* subSeqs = dataSubPos.data();
int64_t* subIndexs = slots_[slot].subIndices.data();
int allSubSequenceLength = 0;
@@ -849,7 +875,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
}
case SlotDef::INDEX: {
// label slot
- IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+ IVector::resizeOrCreate(cpuArguments[slot].ids,
+ size,
/* useGpu= */ false);
// fill labels
int* buf = cpuArguments[slot].ids->getData();
@@ -863,7 +890,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
case SlotDef::VECTOR_DENSE: {
// copy values
size_t dim = header_.slot_defs(slot).dim();
- Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+ Matrix::resizeOrCreate(cpuArguments[slot].value,
+ size,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slot].value->getData();
@@ -887,8 +916,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < cpuArguments.size(); ++i) {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
*batch = gpuBatch;
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
index 846dd7673abe8b836be1b728bb690daa0e8acc20..ffdcc8fdc977f53e29dc9f03fa3cf7af56acb92f 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -48,7 +47,8 @@ namespace paddle {
*/
class ProtoDataProvider : public DataProvider {
public:
- ProtoDataProvider(const DataConfig& config, bool useGpu,
+ ProtoDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
virtual void reset();
@@ -161,14 +161,16 @@ protected:
};
/**
- * @brief Special use for Proto data: instances should contain sparse-non-value slots
+ * @brief Special use for Proto data: instances should contain sparse-non-value
+ * slots
* and label.
*
* @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
*/
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
- ProtoSequenceDataProvider(const DataConfig& config, bool useGpu,
+ ProtoSequenceDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
~ProtoSequenceDataProvider() {}
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
index 3b1eb7e9ef03c42df31c6efc9f0e0240d64e78df..b8fca3cd7f3c5efaea35dc8e09f7ca0ec250830f 100644
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -138,7 +137,8 @@ protected:
*
* @note this code depends on protobuf 2.4.0. There is nothing like
* CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many
- * bytes has the object readed so far. Therefore, we calculated bytes ourselves.
+ * bytes has the object readed so far. Therefore, we calculated bytes
+ * ourselves.
*/
int approximateReadedBytes_;
};
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index 1332c0ab635b6ebec05f25fd77b9703b39227bc1..bee6ca14a2ec3995a3b432fc5a39419a5dd8a8ce 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PyDataProvider.h"
#include "paddle/utils/PythonUtil.h"
#include
#include "paddle/utils/Util.h"
#include "paddle/utils/Excepts.h"
-
namespace paddle {
#ifndef PADDLE_NO_PYTHON
REGISTER_DATA_PROVIDER(py, PyDataProvider);
#endif
-PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
+PyDataProvider::PyDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll)
: DataProvider(config, useGpu), batchSize_(0) {
PyGuard guard;
@@ -50,8 +49,8 @@ void PyDataProvider::loadData(const std::vector& fileList) {
classInstance_ =
createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
CHECK(classInstance_) << "Create class instance failed.";
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
-                                      const_cast<char*>("getHeader"), NULL));
+  PyObjectPtr obj(PyObject_CallMethod(
+      classInstance_.get(), const_cast<char*>("getHeader"), NULL));
CHECK_PY(obj) << "Call function getHeader failed.";
std::string headerInfo =
std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -90,7 +89,8 @@ void PyDataProvider::resetSlots() {
}
}
-void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillDenseSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
unsigned int dim = slot.dim;
slot.sampleNum = readT(data, dataEnd);
@@ -102,14 +102,17 @@ void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
float* dat = reinterpret_cast(data);
std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin());
#else
- memcpyWithCheck(slot.denseData.data(), data,
- sizeof(real) * dim * slot.sampleNum, dataEnd);
+ memcpyWithCheck(slot.denseData.data(),
+ data,
+ sizeof(real) * dim * slot.sampleNum,
+ dataEnd);
#endif
// PyDataProvider always provide data in float
data += sizeof(float) * dim * slot.sampleNum;
}
-void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT(data, dataEnd);
unsigned int* indexPtr = (unsigned int*)data;
@@ -121,12 +124,15 @@ void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
length = readT(data, dataEnd);
slot.indices.push_back(length);
slot.sparseNonValueData.resize(length);
- memcpyWithCheck(slot.sparseNonValueData.data(), data,
- sizeof(unsigned int) * length, dataEnd);
+ memcpyWithCheck(slot.sparseNonValueData.data(),
+ data,
+ sizeof(unsigned int) * length,
+ dataEnd);
data += sizeof(unsigned int) * length;
}
-void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT(data, dataEnd);
unsigned int* indexPtr = (unsigned int*)data;
@@ -153,7 +159,8 @@ void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
}
}
-void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillIndexSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT(data, dataEnd);
CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
@@ -163,7 +170,8 @@ void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
data += sizeof(unsigned int) * slot.sampleNum;
}
-void PyDataProvider::fillStringSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillStringSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd) {
slot.sampleNum = readT(data, dataEnd);
for (unsigned int i = 0; i < slot.sampleNum; ++i) {
@@ -225,9 +233,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
}
for (size_t i = 0; i < sequenceNum; ++i) {
size_t begin = slot.sequenceStartPositions[i];
- size_t end = (i < sequenceNum - 1)
- ? slot.sequenceStartPositions[i + 1]
- : slot.sampleNum;
+ size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1]
+ : slot.sampleNum;
for (size_t ii = begin; ii < end; ++ii) {
slot.sampleSequenceIdVec.push_back(ii);
}
@@ -255,8 +262,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
void PyDataProvider::reset() {
{ // Invoke PyDataProvider Reset
PyGuard guard;
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
-                                        const_cast<char*>("reset"), NULL));
+    PyObjectPtr obj(PyObject_CallMethod(
+        classInstance_.get(), const_cast<char*>("reset"), NULL));
CHECK_PY(obj) << "Call function reset failed.";
}
@@ -270,15 +277,18 @@ void PyDataProvider::reset() {
void PyDataProvider::shuffle() {
// py shuffle
PyGuard guard;
- PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
-                                      const_cast<char*>("shuffle"), NULL));
+  PyObjectPtr obj(PyObject_CallMethod(
+      classInstance_.get(), const_cast<char*>("shuffle"), NULL));
CHECK_PY(obj) << "Call function shuffle failed.";
}
-void PyDataProvider::handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleDenseSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments) {
unsigned int dim = slot.dim;
- Matrix::resizeOrCreate(cpuArguments[slotIndex].value, slot.sampleNum, dim,
+ Matrix::resizeOrCreate(cpuArguments[slotIndex].value,
+ slot.sampleNum,
+ dim,
false, // trans = false
false); // useGpu = false
real* buf = cpuArguments[slotIndex].value->getData();
@@ -294,19 +304,27 @@ void PyDataProvider::handleSparseNonValueSlot(
ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) {
unsigned int dim = slot.dim;
if (!(cpuArguments[slotIndex].value)) {
- cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
- slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE,
- SPARSE_CSR, false, useGpu_);
+ cpuArguments[slotIndex].value =
+ Matrix::createSparseMatrix(slot.sampleNum,
+ dim,
+ slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+ NO_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slotIndex].value;
mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
- slot.sparseNonValueData.data(), HPPL_STREAM_1);
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
+ slot.sparseNonValueData.data(),
+ HPPL_STREAM_1);
} else if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
slot.sparseNonValueData.data());
} else {
LOG(FATAL) << "Not Supported";
@@ -317,28 +335,38 @@ void PyDataProvider::handleSparseValueSlot(
ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) {
unsigned int dim = slot.dim;
if (!(cpuArguments[slotIndex].value)) {
- cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
- slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
- FLOAT_VALUE, SPARSE_CSR, false, useGpu_);
+ cpuArguments[slotIndex].value =
+ Matrix::createSparseMatrix(slot.sampleNum,
+ dim,
+ slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+ FLOAT_VALUE,
+ SPARSE_CSR,
+ false,
+ useGpu_);
}
auto mat = cpuArguments[slotIndex].value;
mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
- slot.sparseFloatValueData.data(), HPPL_STREAM_DEFAULT);
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
+ slot.sparseFloatValueData.data(),
+ HPPL_STREAM_DEFAULT);
} else if (std::dynamic_pointer_cast(mat)) {
std::dynamic_pointer_cast(mat)
- ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+ ->copyFrom(slot.sampleSequenceIdVec.data(),
+ slot.indices.data(),
slot.sparseFloatValueData.data());
} else {
LOG(FATAL) << "Not Supported";
}
}
-void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleIndexSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments) {
- IVector::resizeOrCreate(cpuArguments[slotIndex].ids, slot.sampleNum,
+ IVector::resizeOrCreate(cpuArguments[slotIndex].ids,
+ slot.sampleNum,
/*useGpu_*/ false);
int* buf = cpuArguments[slotIndex].ids->getData();
for (size_t i = 0; i < slot.sampleNum; ++i) {
@@ -346,7 +374,8 @@ void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
}
}
-void PyDataProvider::handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleStringSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments) {
if (cpuArguments[slotIndex].strs) {
cpuArguments[slotIndex].strs->resize(slot.sampleNum);
@@ -364,7 +393,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
PyGuard guard;
PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
                                      const_cast<char*>("getNextBatch"),
-                                      const_cast<char*>("i"), size));
+                                      const_cast<char*>("i"),
+ size));
CHECK_PY(obj) << "Call function getNextBatch failed.";
const std::string& samples =
std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -381,23 +411,24 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
if (!iidData()) {
for (size_t j = 0; j < slotNum_; ++j) {
auto& slot = slots_[j];
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[j].sequenceStartPositions,
- slot.sequenceNum + 1, /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions,
+ slot.sequenceNum + 1,
+ /* useGpu= */ false);
int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false);
std::copy(slot.sequenceStartPositions.begin(),
- slot.sequenceStartPositions.end(), buf);
+ slot.sequenceStartPositions.end(),
+ buf);
buf[slot.sequenceStartPositions.size()] = slot.sampleNum;
if (slot.subSequenceStartPositions.size()) {
- ICpuGpuVector::resizeOrCreate(
- cpuArguments[j].subSequenceStartPositions,
- slot.subSequenceNum + 1,
- /* useGpu= */ false);
+ ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions,
+ slot.subSequenceNum + 1,
+ /* useGpu= */ false);
int* buf =
- cpuArguments[j].subSequenceStartPositions->getMutableData(false);
+ cpuArguments[j].subSequenceStartPositions->getMutableData(false);
std::copy(slot.subSequenceStartPositions.begin(),
- slot.subSequenceStartPositions.end(), buf);
+ slot.subSequenceStartPositions.end(),
+ buf);
buf[slot.subSequenceNum] = slot.sampleNum;
// check subSequenceStartPositions and sequenceStartPositions
cpuArguments[j].checkSubset();
@@ -452,8 +483,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
cpuArguments[i].subSequenceStartPositions;
}
} else {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
}
hl_stream_synchronize(HPPL_STREAM_1);
diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h
index 939d9cf725c2fe6e4989c17e1e768c9f8aedfc95..6bb7c831fdd451abc5241199d6a4d1b1ad814517 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.h
+++ b/paddle/gserver/dataproviders/PyDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -25,7 +24,8 @@ namespace paddle {
class PyDataProvider : public DataProvider {
public:
- PyDataProvider(const DataConfig& config, bool useGpu,
+ PyDataProvider(const DataConfig& config,
+ bool useGpu,
bool loadDataAll = true);
virtual void reset();
@@ -48,21 +48,27 @@ protected:
void parseHeaderData(const std::string& headerData);
void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
- void fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+ void fillSparseNonValueSlot(ProtoSlot& slot,
+ char*& data,
const char* dataEnd);
void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
void fillSlotsByStr(const std::string& samples);
- void handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleDenseSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
- void handleSparseNonValueSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleSparseNonValueSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
- void handleSparseValueSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleSparseValueSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
- void handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleIndexSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
- void handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+ void handleStringSlot(ProtoSlot& slot,
+ size_t slotIndex,
std::vector& cpuArguments);
void resetSlots();
void loadData(const std::vector& fileList);
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index 90391a7c307d8dff7e289d445cafd27dc5008547..967fc9026a39967477d606862e060b680512901a 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -34,7 +34,7 @@ namespace paddle {
namespace unittest {
 static std::unique_ptr<std::function<void(size_t)>>
- OnPoolFilled;
+ OnPoolFilled;
namespace pydp2 {
@@ -43,15 +43,11 @@ void setOnPoolFilledHook(const std::function<void(size_t)>& callback) {
*OnPoolFilled = callback;
}
-void clearOnPoolFilledHook() {
- OnPoolFilled.reset();
-}
+void clearOnPoolFilledHook() { OnPoolFilled.reset(); }
} // namespace pydp2
} // namespace unittest
-
-
/**
* Slot type
*/
@@ -65,17 +61,13 @@ enum SlotType {
/**
* Sequence type
*/
-enum SeqType {
- SQT_NONE = 0,
- SQT_SEQ,
- SQT_SUBSEQ
-};
+enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ };
/**
* Cache Type.
*/
enum CacheType {
- NO_CACHE = 0, // Each pass will load data from PyDataProvider2.
+ NO_CACHE = 0, // Each pass will load data from PyDataProvider2.
CACHE_PASS_IN_MEM = 1, // First pass will load data from PyDataProvider2,
// then cache all data in memory. Load data from
// memory in rest passes.
@@ -87,8 +79,8 @@ struct SlotHeader { // Slot Header will parse from python object's slots field.
SeqType seqType;
};
-inline std::ostream& operator << (std::ostream& os, const SlotHeader& header) {
- os <<"Dim = " << header.dim << " Type = " << header.slotType
+inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
+ os << "Dim = " << header.dim << " Type = " << header.slotType
<< " SeqType = " << header.seqType;
return os;
}
@@ -158,7 +150,6 @@ protected:
SlotHeader* headerPtr_;
};
-
/**
* Py Data Provider Cache Interface.
*/
@@ -209,17 +200,13 @@ public:
PyDataProvider2(const DataConfig& config,
const ModelConfig& modelConfig,
bool useGpu)
- :DataProvider(config, useGpu),
- callingContextCreated_(2) {
- if (PyArray_API == NULL)
- import_array();
+ : DataProvider(config, useGpu), callingContextCreated_(2) {
+ if (PyArray_API == NULL) import_array();
auto& args = config.load_data_args();
PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
if (!args.empty()) {
kwargs = callPythonFuncRetPyObj(
- "paddle.trainer.PyDataProvider2",
- "deserialize_args",
- {args});
+ "paddle.trainer.PyDataProvider2", "deserialize_args", {args});
}
py::DictHelper kwargsDict(kwargs);
@@ -245,40 +232,38 @@ public:
* Dtor
* @note will stop loading thread when destructing
*/
- virtual ~PyDataProvider2() {
- resetImpl(false);
- }
+ virtual ~PyDataProvider2() { resetImpl(false); }
private:
void createPyDataObj(const std::string& model,
const std::string& className,
const std::string& fileListName,
- PyObjectPtr && kwargs) {
- LOG(INFO) << "loading dataprovider " << model <<"::" << className;
+ PyObjectPtr&& kwargs // NOLINT
+ ) {
+ LOG(INFO) << "loading dataprovider " << model << "::" << className;
PyObjectPtr module = py::import(model);
PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
- PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
- className.c_str()));
+ PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str()));
CHECK_PY(cls) << "load class " << className.c_str() << "error";
// If there are multiple python instance share same module, the PyObjectPtr
// only for instance will make python reference-count error.
//
// So here, we increase reference count manually.
- if (gModuleClsPtrs_.find((uintptr_t) module.get())
- != gModuleClsPtrs_.end()) {
+ if (gModuleClsPtrs_.find((uintptr_t)module.get()) !=
+ gModuleClsPtrs_.end()) {
// Multi instance use same module
Py_XINCREF(module.get());
Py_XINCREF(moduleDict.get());
} else {
- gModuleClsPtrs_.insert((uintptr_t) module.get());
+ gModuleClsPtrs_.insert((uintptr_t)module.get());
}
- if (gModuleClsPtrs_.find((uintptr_t) cls.get()) != gModuleClsPtrs_.end()) {
+ if (gModuleClsPtrs_.find((uintptr_t)cls.get()) != gModuleClsPtrs_.end()) {
Py_XINCREF(cls.get());
} else {
- gModuleClsPtrs_.insert((uintptr_t) cls.get());
+ gModuleClsPtrs_.insert((uintptr_t)cls.get());
}
PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
@@ -294,8 +279,8 @@ private:
py::ObjectHelper self(this->instance_);
bool ok;
- this->skipShuffle_ = !self.getBoolAttr("should_shuffle",
- &ok /*isBoolType*/);
+ this->skipShuffle_ =
+ !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/);
if (!ok) {
this->skipShuffle_ = testing; // shuffle when is training, skip shuffle
// when is testing.
@@ -335,12 +320,12 @@ private:
PyObjectPtr headerPtrWrap(hdPtr);
py::ObjectHelper hd(headerPtrWrap);
header.dim = hd.getIntAttrWithError("dim");
- header.seqType = (SeqType) hd.getIntAttrWithError("seq_type");
- header.slotType = (SlotType) hd.getIntAttrWithError("type");
+ header.seqType = (SeqType)hd.getIntAttrWithError("seq_type");
+ header.slotType = (SlotType)hd.getIntAttrWithError("type");
}
DBG << "Data header size " << headers_.size();
- for (auto & header : headers_) {
+ for (auto& header : headers_) {
DBG << header;
}
cache_.reset(IPyDataProviderCache::create(
@@ -351,8 +336,7 @@ private:
loadFileList(fileListName, fileLists_);
PyObject* lst = PyList_New(fileLists_.size());
for (size_t i = 0; i < fileLists_.size(); ++i) {
- PyList_SET_ITEM(lst, i,
- PyString_FromString(fileLists_[i].c_str()));
+ PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str()));
}
return PyObjectPtr(lst);
}
@@ -414,11 +398,12 @@ private:
CHECK(ok) << "CalcBatchSize must return int or long";
}
- if (this->loadThread_){ // wait poolActualSize < poolSize;
+ if (this->loadThread_) { // wait poolActualSize < poolSize;
std::unique_lock l(mtx_);
- pushCV_.wait(l, [this, additionalBatchSize] {
- return this->poolActualSize_ < poolSize_;
- });
+ pushCV_.wait(l,
+ [this, additionalBatchSize] {
+ return this->poolActualSize_ < poolSize_;
+ });
}
{
@@ -487,14 +472,14 @@ private:
std::vector fileLists_;
std::vector headers_;
static PyObjectPtr zeroTuple_;
- static std::unordered_set gModuleClsPtrs_;
+ static std::unordered_set gModuleClsPtrs_;
class PositionRandom {
public:
- inline explicit PositionRandom(bool skipRand):
- eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
+ inline explicit PositionRandom(bool skipRand)
+ : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
- inline size_t operator() (size_t len) {
+ inline size_t operator()(size_t len) {
if (!skipRand_) {
if (!dist_ || dist_->b() != len - 1) {
        dist_.reset(new std::uniform_int_distribution<size_t>(0, len - 1));
@@ -525,32 +510,31 @@ public:
* Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random
* select data from datapool.
*/
- void shuffle() {
- }
+ void shuffle() {}
/**
* Not limited size.
*/
- int64_t getSize() {
- return -1;
- }
+ int64_t getSize() { return -1; }
/**
* Loading a batch of data.
*/
- int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
+ int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) {
std::lock_guard guard(mutexForReset_);
REGISTER_TIMER("PyDP2.getNextBatchInternal")
CHECK_GE(size_, 0);
- size_t size = (size_t) size_;
+ size_t size = (size_t)size_;
if (loadThread_) { // loading from thread should wait for data pool ready.
// but, loading from cache, cache object should ensure
// data pool ready.
std::unique_lock l(mtx_);
- pullCV_.wait(l, [this, &size] {
- return this->poolActualSize_ >= std::max(size, this->minPoolSize_)
- || callingContexts_.empty();
- });
+ pullCV_.wait(l,
+ [this, &size] {
+ return this->poolActualSize_ >=
+ std::max(size, this->minPoolSize_) ||
+ callingContexts_.empty();
+ });
if (unittest::OnPoolFilled) {
(*unittest::OnPoolFilled)(this->poolActualSize_);
@@ -633,35 +617,35 @@ public:
cpuBatch.setSize(bsize);
auto& inArgs = cpuBatch.getStreams();
inArgs.resize(headers_.size());
-  std::vector<std::unique_ptr<IFieldScanner> > scanners;
+  std::vector<std::unique_ptr<IFieldScanner>> scanners;
scanners.reserve(headers_.size());
for (auto& header : headers_) {
scanners.emplace_back(IFieldScanner::create(&header));
}
DBG << "Scanner created.";
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->startPrepare(inArgs[i]);
}
- for (auto & d : data) {
+ for (auto& d : data) {
py::SequenceHelper s(d);
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->prepare(inArgs[i], s[i]);
}
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->finishPrepare(inArgs[i]);
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->startFill(inArgs[i]);
}
- for (auto & d : data) {
+ for (auto& d : data) {
py::SequenceHelper s(d);
for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->fill(inArgs[i], s[i]);
}
}
- for (size_t i=0; i < headers_.size(); ++i) {
+ for (size_t i = 0; i < headers_.size(); ++i) {
scanners[i]->finishFill(inArgs[i]);
}
@@ -679,8 +663,8 @@ public:
gpuArguments.resize(cpuArguments.size());
gpuBatch.setSize(size);
for (size_t i = 0; i < headers_.size(); ++i) {
- gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
- HPPL_STREAM_1);
+ gpuArguments[i].resizeAndCopyFrom(
+ cpuArguments[i], useGpu_, HPPL_STREAM_1);
}
hl_stream_synchronize(HPPL_STREAM_1);
} else {
@@ -690,31 +674,28 @@ public:
}
};
-std::unordered_set PyDataProvider2::gModuleClsPtrs_;
+std::unordered_set PyDataProvider2::gModuleClsPtrs_;
PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
-
/**
* Scanner for dense slot.
*/
-class DenseScanner: public IFieldScanner {
+class DenseScanner : public IFieldScanner {
public:
- explicit DenseScanner(SlotHeader* ptr):IFieldScanner(ptr), height_(0) {}
+ explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
/**
* Prepare.
* @param argument target argument
* @param obj each timestep of a sample.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
- ++height_;
- }
+ virtual void prepare(Argument& argument, PyObject* obj) { ++height_; }
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreate(argument.value, height_, headerPtr_->dim,
- false, false);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreate(
+ argument.value, height_, headerPtr_->dim, false, false);
height_ = 0;
}
@@ -723,24 +704,23 @@ public:
* @param argument
* @param obj
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
real* dat = argument.value->getData() + height_ * headerPtr_->dim;
if (PyArray_Check(obj)) {
- auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
- if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
- real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
- auto sz = PyArray_SIZE((PyArrayObject*)obj);
- std::copy(data, data + sz, dat);
- } else {
- LOG(FATAL) << "You should yield float" << sizeof(real) * 8
- << " array";
- }
- } else {
- py::SequenceHelper s(obj);
- // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
- for (size_t i=0; i < headerPtr_->dim; ++i) {
- dat[i] = (real) s.getDouble(i);
- }
+ auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
+ if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
+ real* data = (real*)PyArray_DATA((PyArrayObject*)obj);
+ auto sz = PyArray_SIZE((PyArrayObject*)obj);
+ std::copy(data, data + sz, dat);
+ } else {
+ LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array";
+ }
+ } else {
+ py::SequenceHelper s(obj);
+ // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+ for (size_t i = 0; i < headerPtr_->dim; ++i) {
+ dat[i] = (real)s.getDouble(i);
+ }
}
++height_;
}
@@ -752,20 +732,18 @@ private:
/**
* Scanner for index slot
*/
-class IndexScanner: public IFieldScanner {
+class IndexScanner : public IFieldScanner {
public:
- explicit IndexScanner(SlotHeader* ptr):IFieldScanner(ptr), cnt_(0) {}
+ explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
/**
* Prepare memory space.
*
* @note obj is a single timestep of sample
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
- ++cnt_;
- }
+ virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; }
- virtual void finishPrepare(Argument &argument) {
+ virtual void finishPrepare(Argument& argument) {
IVector::resizeOrCreate(argument.ids, cnt_, false);
cnt_ = 0;
}
@@ -773,9 +751,9 @@ public:
/**
* Fill one index to argument.
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
bool ok;
- argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
+ argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
CHECK(ok) << "Cannot cast int " << py::repr(obj);
}
@@ -785,27 +763,25 @@ private:
class SparseNonValueScanner : public IFieldScanner {
public:
- explicit SparseNonValueScanner(SlotHeader* ptr): IFieldScanner(ptr),
- nnz_(0),
- height_(0) {}
+ explicit SparseNonValueScanner(SlotHeader* ptr)
+ : IFieldScanner(ptr), nnz_(0), height_(0) {}
/**
* Prepare memory space
* @note obj is a timestep of one sample.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
+ virtual void prepare(Argument& argument, PyObject* obj) {
++height_;
nnz_ += py::SequenceHelper(obj).size();
}
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
- headerPtr_->dim,
- nnz_, NO_VALUE);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreateSparseMatrix(
+ argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE);
}
- virtual void startFill(Argument & argument) {
- auto smat = (CpuSparseMatrix*) (argument.value.get());
+ virtual void startFill(Argument& argument) {
+ auto smat = (CpuSparseMatrix*)(argument.value.get());
smat->getRows()[0] = 0;
nnz_ = 0;
height_ = 1;
@@ -818,14 +794,14 @@ public:
virtual void fill(Argument& argument, PyObject* obj) {
py::SequenceHelper s(obj);
auto sz = s.size();
- auto smat = (CpuSparseMatrix*) (argument.value.get());
+ auto smat = (CpuSparseMatrix*)(argument.value.get());
int* row = smat->getRows();
int* col = smat->getCols();
real* dat = smat->getData();
- row[height_] = row[height_-1] + (int)sz;
+ row[height_] = row[height_ - 1] + (int)sz;
for (decltype(sz) i = 0; i < sz; ++i) {
- setData(col+nnz_, dat+nnz_, s[i]);
+ setData(col + nnz_, dat + nnz_, s[i]);
++nnz_;
}
++height_;
@@ -839,7 +815,7 @@ protected:
* @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong.
* For sparse_value is a Tuple (int, float).
*/
- virtual void setData(int* col, real * dat, PyObject* obj) {
+ virtual void setData(int* col, real* dat, PyObject* obj) {
bool ok;
*col = py::castInt(obj, &ok);
CHECK(ok);
@@ -851,26 +827,25 @@ protected:
class SparseValueScanner : public SparseNonValueScanner {
public:
- explicit SparseValueScanner(SlotHeader *ptr) : SparseNonValueScanner(ptr) {}
+ explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
- virtual void finishPrepare(Argument &argument) {
- Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
- headerPtr_->dim,
- nnz_, FLOAT_VALUE);
+ virtual void finishPrepare(Argument& argument) {
+ Matrix::resizeOrCreateSparseMatrix(
+ argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
}
protected:
- virtual void setData(int *col, real *dat, PyObject *obj) {
+ virtual void setData(int* col, real* dat, PyObject* obj) {
py::SequenceHelper s(obj);
SparseNonValueScanner::setData(col, dat, s[0]);
- *dat = (real) s.getDouble(1);
+ *dat = (real)s.getDouble(1);
}
};
/**
* Sequence Scanner. Scanner for sequence or sub-sequence.
*/
-class SequenceScanner: public IFieldScanner {
+class SequenceScanner : public IFieldScanner {
public:
/**
* Ctor
@@ -879,15 +854,18 @@ public:
* return a sequence start position or a sub-sequence
* start position.
*/
- SequenceScanner(std::unique_ptr<IFieldScanner>&& innerScanner,
- const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
- : IFieldScanner(nullptr), inner_(std::move(innerScanner)),
- cnt_(0), getSeqStartPos_(getSeqStartPos) {}
+ SequenceScanner(
+ std::unique_ptr<IFieldScanner>&& innerScanner,
+ const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
+ : IFieldScanner(nullptr),
+ inner_(std::move(innerScanner)),
+ cnt_(0),
+ getSeqStartPos_(getSeqStartPos) {}
/**
* Start prepare. Invoke inner->startPrepare too.
*/
- virtual void startPrepare(Argument &argument) {
+ virtual void startPrepare(Argument& argument) {
inner_->startPrepare(argument);
}
@@ -895,10 +873,10 @@ public:
* Prepare. obj is a list or tuple. it will invoke inner_->prepare for each
* element of sequence obj.
*/
- virtual void prepare(Argument &argument, PyObject *obj) {
+ virtual void prepare(Argument& argument, PyObject* obj) {
py::SequenceHelper s(obj);
++cnt_;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
inner_->prepare(argument, s[i]);
}
}
@@ -906,7 +884,7 @@ public:
/**
* Finish prepare. invoke inner_->finishPrepare too.
*/
- virtual void finishPrepare(Argument &argument) {
+ virtual void finishPrepare(Argument& argument) {
ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false);
inner_->finishPrepare(argument);
}
@@ -914,7 +892,7 @@ public:
/**
* Start fill. invoke inner->startFill too.
*/
- virtual void startFill(Argument &argument) {
+ virtual void startFill(Argument& argument) {
getSeqStartPos_(argument)->getMutableData(false)[0] = 0;
cnt_ = 1;
inner_->startFill(argument);
@@ -925,13 +903,13 @@ public:
* sequence obj. And set seqStartPos at same time. The seqStartPos will be
* calculated by getSeqStartPos callback passed in ctor.
*/
- virtual void fill(Argument &argument, PyObject *obj) {
+ virtual void fill(Argument& argument, PyObject* obj) {
getSeqStartPos_(argument)->getMutableData(false)[cnt_] =
- getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
- (int)getSize(obj);
+ getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
+ (int)getSize(obj);
py::SequenceHelper s(obj);
++cnt_;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
inner_->fill(argument, s[i]);
}
}
@@ -939,9 +917,7 @@ public:
/**
* Finish fill. will invoke inner->finishFill too.
*/
- virtual void finishFill(Argument &argument) {
- inner_->finishFill(argument);
- }
+ virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
protected:
size_t getSize(PyObject* obj) {
@@ -949,7 +925,7 @@ protected:
auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
if (sc) {
size_t sum = 0;
- for (size_t i=0; i < s.size(); ++i) {
+ for (size_t i = 0; i < s.size(); ++i) {
sum += sc->getSize(s[i]);
}
return sum;
@@ -964,8 +940,7 @@ private:
std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
};
-
-IFieldScanner* IFieldScanner::create(SlotHeader *header) {
+IFieldScanner* IFieldScanner::create(SlotHeader* header) {
IFieldScanner* retv = nullptr;
switch (header->slotType) {
case ST_DENSE:
@@ -989,15 +964,15 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
break;
case SQT_SUBSEQ:
retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
- [](Argument& arg) -> ICpuGpuVectorPtr& {
- return arg.subSequenceStartPositions;
- });
- // fall through, not break;
+ [](Argument& arg) -> ICpuGpuVectorPtr& {
+ return arg.subSequenceStartPositions;
+ });
+ // fall through, not break;
case SQT_SEQ:
retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
- [](Argument& arg) -> ICpuGpuVectorPtr& {
- return arg.sequenceStartPositions;
- });
+ [](Argument& arg) -> ICpuGpuVectorPtr& {
+ return arg.sequenceStartPositions;
+ });
break;
default:
LOG(FATAL) << "Not implemented";
@@ -1010,19 +985,13 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
* No Cache Strategy. Will destruct old data immediately and load data from
* python every pass.
*/
-class NoCacheStrategy: public IPyDataProviderCache {
+class NoCacheStrategy : public IPyDataProviderCache {
public:
- virtual bool reset() {
- return true;
- }
+ virtual bool reset() { return true; }
- virtual void drop(std::deque<PyObjectPtr> *data) {
- data->clear();
- }
+ virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
- virtual std::deque<PyObjectPtr>* load() {
- return nullptr;
- }
+ virtual std::deque<PyObjectPtr>* load() { return nullptr; }
};
/**
@@ -1033,9 +1002,9 @@ public:
*/
class CacheOnePassInMemory : public IPyDataProviderCache {
public:
- CacheOnePassInMemory() : objPool_(new std::deque<PyObjectPtr>()),
- droppedPool_(new std::deque<PyObjectPtr>())
- {}
+ CacheOnePassInMemory()
+ : objPool_(new std::deque<PyObjectPtr>()),
+ droppedPool_(new std::deque<PyObjectPtr>()) {}
virtual bool reset() {
if (objPool_->empty() && droppedPool_->empty()) {
@@ -1048,25 +1017,22 @@ public:
}
}
- virtual void drop(std::deque<PyObjectPtr> *data) {
+ virtual void drop(std::deque<PyObjectPtr>* data) {
size_t orgSize = droppedPool_->size();
droppedPool_->resize(orgSize + data->size());
- for (size_t i=0; i < data->size(); ++i) {
+ for (size_t i = 0; i < data->size(); ++i) {
std::swap((*droppedPool_)[orgSize + i], (*data)[i]);
}
data->clear();
}
- virtual std::deque<PyObjectPtr>* load() {
- return objPool_.get();
- }
+ virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
private:
- std::unique_ptr<std::deque<PyObjectPtr> > objPool_;
- std::unique_ptr<std::deque<PyObjectPtr> > droppedPool_;
+ std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
+ std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
};
-
IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) {
switch (ct) {
case NO_CACHE:
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index c2625bce9ab0cac7c42a20379c42debea0510c57..8f7d2fb80e9b6f2b4c83d90a04dab5219435d344 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "Evaluator.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
@@ -33,7 +32,8 @@ private:
str.clear();
int prevLabel = -1;
for (std::vector<int>::const_iterator label = path.begin();
- label != path.end(); label++) {
+ label != path.end();
+ label++) {
if (*label != blank_ &&
(str.empty() || *label != str.back() || prevLabel == blank_)) {
str.push_back(*label);
@@ -58,8 +58,11 @@ private:
/* "sp, dp, ip" is the weighting parameter of "substitution, deletion,
* insertion"
* in edit-distance error */
- real stringAlignment(std::vector<int>& gtStr, std::vector<int>& recogStr,
- bool backtrace = true, real sp = 1.0, real dp = 1.0,
+ real stringAlignment(std::vector<int>& gtStr,
+ std::vector<int>& recogStr,
+ bool backtrace = true,
+ real sp = 1.0,
+ real dp = 1.0,
real ip = 1.0) {
std::vector<std::vector<real>> matrix;
int substitutions, deletions, insertions;
@@ -165,8 +168,8 @@ private:
return distance / maxLen;
}
- real editDistance(real* output, int numTimes, int numClasses, int* labels,
- int labelsLen) {
+ real editDistance(
+ real* output, int numTimes, int numClasses, int* labels, int labelsLen) {
numTimes_ = numTimes;
numClasses_ = numClasses;
blank_ = numClasses_ - 1;
@@ -207,7 +210,8 @@ public:
real err = 0;
err = editDistance(
output.value->getData() + output.value->getWidth() * outputStarts[i],
- outputStarts[i+1] - outputStarts[i], output.value->getWidth(),
+ outputStarts[i + 1] - outputStarts[i],
+ output.value->getWidth(),
label.ids->getData() + labelStarts[i],
labelStarts[i + 1] - labelStarts[i]);
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 6f5d2b47c3a97d0c95fefd346add2f121ac51764..923e77fc9df919794902daed6113792e7f89a552 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -144,7 +144,8 @@ public:
size_t numSequences = sequenceStartPositions->getSize() - 1;
const int* starts = sequenceStartPositions->getData();
for (size_t i = 0; i < numSequences; ++i) {
- eval1(output->getData() + starts[i], label->getData() + starts[i],
+ eval1(output->getData() + starts[i],
+ label->getData() + starts[i],
starts[i + 1] - starts[i]);
}
return 0;
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index d43dceea7452724c1e45a1b7c5f5f1858d528df7..f5df2b18dedde9022d04b034912e59be00f15413 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "paddle/utils/Stat.h"
#include "paddle/gserver/evaluators/Evaluator.h"
@@ -74,17 +73,19 @@ public:
}
const MatrixPtr errorMat = Matrix::create(output->getHeight(),
- 1, /* trans= */ false, useGpu(arguments[0].deviceId));
+ 1,
+ /* trans= */ false,
+ useGpu(arguments[0].deviceId));
errorMat->zeroMem();
if (label != nullptr) {
errorMat->classificationError(output, label);
} else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
- errorMat->classificationErrorMulti(*output, *multiBinaryLabel,
- config_.classification_threshold());
+ errorMat->classificationErrorMulti(
+ *output, *multiBinaryLabel, config_.classification_threshold());
} else {
- errorMat->binaryClassificationError(0, *output, *multiBinaryLabel,
- config_.classification_threshold());
+ errorMat->binaryClassificationError(
+ 0, *output, *multiBinaryLabel, config_.classification_threshold());
}
if (supportWeight) {
@@ -126,8 +127,8 @@ public:
int errCounter = 0;
CpuVector errorVec(0, nullptr);
for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) {
- errorVec.subVecFrom(errorMat->getData(), starts[i],
- starts[i + 1] - starts[i]);
+ errorVec.subVecFrom(
+ errorMat->getData(), starts[i], starts[i + 1] - starts[i]);
if (errorVec.getSum() > 0) {
errCounter += 1;
}
@@ -330,8 +331,8 @@ public:
}
void distributeEval(ParameterClient2* client) {
- client->reduce(sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id,
- 0);
+ client->reduce(
+ sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0);
client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0);
}
@@ -379,8 +380,11 @@ real AucEvaluator::evalImp(std::vector