diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9385943da92bc8c44ca75b267a768ba8ea22bd8b..90c25e435083d78ad4c123999a588aaf9092f719 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,18 +7,14 @@
     hooks:
     -   id: yapf
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
+    sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
     hooks:
     -   id: check-added-large-files
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: detect-private-key
     -   id: end-of-file-fixer
-# TODO(yuyang): trailing-whitespace currently has some bugs with markdown
-# files, so please do not add it to the pre-commit hooks for now
-#    -   id: trailing-whitespace
-#
-# TODO(yuyang): debug-statements does not fit Paddle, because
-# not all of our Python code is runnable. Some of it is used for
-# documentation
-#    -   id: debug-statements
+-   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
+    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+    hooks:
+    -   id: clang-formater
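+# Note: these hooks are run by the pre-commit tool (https://pre-commit.com).
+# To use them locally: `pip install pre-commit`, then `pre-commit install` to
+# register the git hook, or `pre-commit run --all-files` to check the whole tree.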
diff --git a/README.md b/README.md
index bd47ed44bc808196b0e6598f28d72620422f3e1a..8a8e15841586ae6a01bb93e94f6074189f556f5a 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,11 @@
 # PaddlePaddle
 
 
-[](https://travis-ci.org/baidu/Paddle)
+[](https://travis-ci.org/PaddlePaddle/Paddle)
 [](http://www.paddlepaddle.org/)
 [](http://www.paddlepaddle.org/cn/index.html)
-[](https://coveralls.io/github/baidu/Paddle?branch=develop)
-[](https://github.com/baidu/Paddle/releases)
+[](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
+[](https://github.com/PaddlePaddle/Paddle/releases)
 [](LICENSE)
 
 
@@ -17,7 +17,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep
 learning to many products at Baidu.
 
 Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
+Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
 
 ## Features
 
@@ -92,7 +92,7 @@ Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://padd
 
 ## Ask Questions
 
-You are welcome to submit questions and bug reports as [Github Issues](https://github.com/baidu/paddle/issues).
+You are welcome to submit questions and bug reports as [GitHub Issues](https://github.com/PaddlePaddle/Paddle/issues).
 
 ## Copyright and License
 PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md
index b8f26f431eb7a04147fe791a8c805427c827fe09..e44fa0d38e9982e5d0ed159743994ce6acc51246 100644
--- a/doc/build/build_from_source.md
+++ b/doc/build/build_from_source.md
@@ -6,10 +6,10 @@ Installing from Sources
 * [3. Build on Ubuntu](#ubuntu)
 
 ## Download and Setup 
-You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle).
+You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
 
 ```bash
-git clone https://github.com/baidu/Paddle paddle
+git clone https://github.com/PaddlePaddle/Paddle paddle
 cd paddle
 ```
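+
+As a minimal sketch of the next steps (assuming the dependencies described later in this guide are already installed), an out-of-source CMake build looks like:
+
+```bash
+mkdir build && cd build
+cmake ..    # compile options such as -DWITH_GPU=OFF are described in the cmake docs
+make
+```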
 
diff --git a/doc_cn/build_and_install/cmake/cblas_settings.csv b/doc_cn/build_and_install/cmake/cblas_settings.csv
index d804c0a662cb652dbefb0d09fb18538308c20aec..a6356baf16a0d3d2499e39d2055d8ee878dcaef2 100644
--- a/doc_cn/build_and_install/cmake/cblas_settings.csv
+++ b/doc_cn/build_and_install/cmake/cblas_settings.csv
@@ -1,4 +1,5 @@
-MKL_ROOT,path to MKL; ${MKL_ROOT}/include must contain mkl.h and ${MKL_ROOT}/lib must contain the mkl_core, mkl_sequential and mkl_intel_lp64 libraries
-ATLAS_ROOT,path to the ATLAS library; ${ATLAS_ROOT}/include must contain cblas.h and ${ATLAS_ROOT}/lib must contain the cblas and atlas libraries
-OPENBLAS_ROOT,${OPENBLAS_ROOT}/include must contain cblas.h and ${OPENBLAS_ROOT}/lib must contain the openblas library
-REFERENCE_CBLAS_ROOT,${REFERENCE_CBLAS_ROOT}/include must contain cblas.h and ${REFERENCE_CBLAS_ROOT}/lib must contain the cblas library
\ No newline at end of file
+Option,Description,Notes
+MKL_ROOT,path to MKL,${MKL_ROOT}/include must contain mkl.h; ${MKL_ROOT}/lib must contain the mkl_core, mkl_sequential and mkl_intel_lp64 libraries.
+ATLAS_ROOT,path to ATLAS,${ATLAS_ROOT}/include must contain cblas.h; ${ATLAS_ROOT}/lib must contain the cblas and atlas libraries.
+OPENBLAS_ROOT,path to OpenBLAS,${OPENBLAS_ROOT}/include must contain cblas.h; ${OPENBLAS_ROOT}/lib must contain the openblas library.
+REFERENCE_CBLAS_ROOT,path to reference BLAS,${REFERENCE_CBLAS_ROOT}/include must contain cblas.h; ${REFERENCE_CBLAS_ROOT}/lib must contain the cblas library.
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.csv b/doc_cn/build_and_install/cmake/compile_options.csv
index 0b8015aaee4d7b9068cb4a8de5d9967569e37f0c..12b45eebb2822d77447fa1bc754360605971dcab 100644
--- a/doc_cn/build_and_install/cmake/compile_options.csv
+++ b/doc_cn/build_and_install/cmake/compile_options.csv
@@ -1,15 +1,14 @@
-Option,Description,Default
-WITH_GPU,Whether to build with GPU support.,Depends on whether the CUDA toolchain is found
-WITH_DOUBLE,Whether to use double-precision floating point.,No
-WITH_DSO,Whether to load the CUDA libraries dynamically at runtime instead of linking them statically.,Yes
-WITH_AVX,Whether to build PaddlePaddle binaries with the AVX instruction set,Yes
-WITH_PYTHON,Whether to embed a Python interpreter; convenient for embedded use.,Yes
-WITH_STYLE_CHECK,Whether to run code style checks at build time,Yes
-WITH_RDMA,Whether to enable RDMA support,No
-WITH_GLOG,Whether to use GLOG; if disabled a simplified logging implementation is used instead; convenient for embedded use.,Depends on whether GLOG is found
-WITH_GFLAGS,Whether to use GFLAGS; if disabled a simplified command-line parser is used instead; convenient for embedded use.,Depends on whether GFLAGS is found
-WITH_TIMER,Whether to enable timing; enabling it slows runs slightly and makes logs more verbose but helps debugging and benchmarking,No
-WITH_TESTING,Whether to enable unit tests,Depends on whether gtest is found
-WITH_DOC,Whether to build the English documentation,No
-WITH_DOC_CN,Whether to build the Chinese documentation,No
-WITH_SWIG_PY,Whether to build the Python SWIG interface; the SWIG interface is convenient for inference and customized training,Depends on whether SWIG is found
+Option,Description,Default
+WITH_GPU,Whether to support GPU.,Depends on whether the CUDA toolchain is found
+WITH_DOUBLE,Whether to use double-precision floating point.,No
+WITH_DSO,Whether to load the CUDA libraries dynamically at runtime instead of linking them statically.,Yes
+WITH_AVX,Whether to build PaddlePaddle binaries with the AVX instruction set,Yes
+WITH_PYTHON,Whether to embed a Python interpreter; this eases future embedded ports.,Yes
+WITH_STYLE_CHECK,Whether to run code style checks at build time,Yes
+WITH_RDMA,Whether to enable RDMA,No
+WITH_GLOG,Whether to enable GLOG; if disabled a simplified logging implementation is used instead; this also eases future embedded ports.,Depends on whether GLOG is found
+WITH_GFLAGS,Whether to use GFLAGS; if disabled a simplified command-line parser is used instead; this also eases future embedded ports.,Depends on whether GFLAGS is found
+WITH_TIMER,Whether to enable timing; enabling it slows runs slightly and makes logs more verbose but helps debugging and benchmarking,No
+WITH_TESTING,Whether to enable unit tests,Depends on whether gtest is found
+WITH_DOC,Whether to build the English and Chinese documentation,No
+WITH_SWIG_PY,Whether to build the Python SWIG interface; it can be used for inference and customized training,Depends on whether SWIG is found
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/compile_options.rst b/doc_cn/build_and_install/cmake/compile_options.rst
index bb5b18a073803662774cb6b7bcbdbafe3ad51112..f345ead2bf851bdad7be2fb8185d16fd2a318a66 100644
--- a/doc_cn/build_and_install/cmake/compile_options.rst
+++ b/doc_cn/build_and_install/cmake/compile_options.rst
@@ -1,62 +1,43 @@
-Setting PaddlePaddle's compile options
-======================================
-
-PaddlePaddle's compile options can be set when invoking cmake. cmake is a cross-platform build tool:
-invoking cmake turns cmake project files into makefiles for each platform. For details on using cmake,
-see `the official cmake documentation `_ .
-
-PaddlePaddle's compile options control whether CPU/GPU binaries are generated, which BLAS is linked,
-and so on. The full list of compile options follows.
-
-PaddlePaddle's compile options
-------------------------------
-
-Bool compile options
-++++++++++++++++++++
-These options can be set on the cmake command line with the -D flag, e.g.
-:code:`cmake -D WITH_GPU=OFF`
-
-..  csv-table:: PaddlePaddle's bool compile options
-    :widths: 1, 7, 2
-    :file: compile_options.csv
-
-BLAS-related compile options
-++++++++++++++++++++++++++++
-
-PaddlePaddle can use any one of the cblas implementations `MKL `_ ,
-`Atlas `_ ,
-`OpenBlas `_ and
-`reference Blas `_ .
-The BLAS to use is selected by specifying its path at compile time.
-
-When cmake runs, it first searches the system paths (/usr/lib\:/usr/local/lib) for these BLAS
-implementations, and also reads the following path variables during the search\:
-
-
-..  csv-table:: PaddlePaddle's cblas compile options
-    :widths: 1, 9
-    :header: "Option", "Description"
-    :file: cblas_settings.csv
-
-Each of these variables can be set with the -D flag, e.g. :code:`cmake -D MKL_ROOT=/opt/mkl/`.
-They can also be set as environment variables before invoking cmake, e.g.
-
-..  code-block:: bash
-
-    export MKL_ROOT=/opt/mkl
-    cmake
-
-Note that these variables only take effect the first time cmake runs. If you want to change them
-afterwards, it is recommended to wipe the build directory ( :code:`rm -rf` ) and then set them again.
-
-CUDA/cuDNN-related compile options
-++++++++++++++++++++++++++++++++++
-
-PaddlePaddle can be compiled and run with any cudnn version from cudnn v2 onward, but try to keep the
-cudnn used at compile time and the one used at runtime the same version. We recommend the latest
-version, cudnn v5.1.
-
-At cmake configuration time, :code:`CUDNN_ROOT` sets the cuDNN install path, again with the -D flag,
-e.g. :code:`cmake -D CUDNN_ROOT=/opt/cudnnv5` .
-
-Note that these variables only take effect the first time cmake runs. If you want to change them
-afterwards, it is recommended to wipe the build directory ( :code:`rm -rf` ) and then set them again.
+PaddlePaddle's compile options
+==============================
+
+PaddlePaddle's compile options control things such as whether CPU or GPU binaries are generated and which BLAS library is linked. They can be set when invoking cmake; for details on using cmake itself, see the `official documentation `_ .
+
+Bool compile options
+--------------------
+These options can be set on the cmake command line with the ``-D`` flag, e.g.
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool compile options
+    :widths: 1, 7, 2
+    :file: compile_options.csv
+
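+For instance, several options from the table above can be combined in a single invocation (the values here are only illustrative):
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=ON -DWITH_DOUBLE=OFF
+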
+BLAS/CUDA/cuDNN compile options
+-------------------------------
+BLAS
+++++
+
+PaddlePaddle supports any one of the following BLAS libraries: `MKL `_ , `ATLAS `_ , `OpenBLAS `_ and `reference BLAS `_ .
+
+..  csv-table:: BLAS path options
+    :widths: 1, 2, 7
+    :file: cblas_settings.csv
+
+CUDA/cuDNN
+++++++++++
+
+PaddlePaddle can be compiled and run with any cudnn version from cudnn v2 onward, but please keep the cudnn used at compile time and the one used at runtime the same version. We recommend the latest version, cudnn v5.1.
+
+Setting these options
++++++++++++++++++++++
+
+PaddlePaddle locates the BLAS/CUDA/cuDNN libraries through paths specified at compile time. When cmake runs, it first searches the system paths (/usr/lib\:/usr/local/lib) for these libraries, and also reads the related path variables during the search. The variables can be set with the ``-D`` flag, e.g.
+
+..  code-block:: bash
+
+    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
+
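+These variables can also be exported as environment variables before calling cmake, with the same effect, e.g.
+
+..  code-block:: bash
+
+    export MKL_ROOT=/opt/mkl
+    cmake ..
+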
+Note: these options only take effect the first time cmake runs. If you want to change them afterwards, it is recommended to wipe the whole build directory (``rm -rf``) and then set them again.
\ No newline at end of file
diff --git a/doc_cn/howto/how_to_write_docs/index.rst b/doc_cn/howto/how_to_write_docs/index.rst
index 869ef747f9f88c7dbb5efdf6e03111a3f76c4014..a1f983b3405fa40f436885e40fca2ebbb4695491 100644
--- a/doc_cn/howto/how_to_write_docs/index.rst
+++ b/doc_cn/howto/how_to_write_docs/index.rst
@@ -2,32 +2,19 @@
 How to contribute to and modify PaddlePaddle's documentation
 ############################################################
 
-PaddlePaddle's documentation is generated by `sphinx`_ driven by `cmake`_ . There are two sets of documentation, :code:`doc` and :code:`doc_cn` . Both are built as part of the `cmake`_ build, and the generated documentation is stored in the :code:`doc` and :code:`doc_cn` directories on the server.
+PaddlePaddle's documentation consists of an English part ``doc`` and a Chinese part ``doc_cn``. Both are generated by `sphinx`_ driven by `cmake`_ , and the generated documentation is stored in the ``doc`` and ``doc_cn`` subdirectories of the build directory.
 
-The sections below describe how to contribute to PaddlePaddle's documentation.
-
-How to write PaddlePaddle's documentation
-=========================================
-
-TBD
 
 How to build PaddlePaddle's documentation
 =========================================
 
-Building PaddlePaddle's documentation requires the full PaddlePaddle build environment. Preparing this environment is relatively complex, so this document provides two ways to build the documentation:
-
-* building the documentation with Docker
-* building the documentation directly.
-
-Of the two, we recommend building the documentation with Docker.
+PaddlePaddle's documentation can be built either directly or with Docker. Since the environment needed to build the documentation is relatively complex to prepare, we recommend building it with Docker.
 
 
 Building the documentation with Docker
 --------------------------------------
 
-To build PaddlePaddle's documentation with Docker, the Docker toolkit must first be installed on the system. For installing Docker, see `Docker's official site `_ .
-
-Once Docker is installed, you can build the documentation with the script in the source tree, i.e.
+Building PaddlePaddle's documentation with Docker requires the Docker toolkit to be installed on the system first; for installing Docker, see `Docker's official site `_ . Once Docker is installed, you can build the documentation with the script in the source tree:
 
 ..	code-block:: bash
 
@@ -35,10 +22,10 @@ TBD
 	cd paddle/scripts/tools/build_docs
 	bash build_docs.sh
 
-After this script finishes, two directories are generated under that directory, namely\:
+After the build finishes, the following two subdirectories are generated under that directory\:
 
-* the doc directory, where the English documentation lives
-* the doc_cn directory, where the Chinese documentation lives
+* doc, the English documentation directory
+* doc_cn, the Chinese documentation directory
 
 Open index.html in the corresponding directory in a browser to view the documentation locally.
 
@@ -52,6 +39,10 @@ TBD
 
 TBD
 
+How to write PaddlePaddle's documentation
+=========================================
+
+TBD
 
 How to update the documentation on www.paddlepaddle.org
 ========================================================
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 6f51d551200696ebafade2a46243b78086975265..b539374cd4aa5a9510cdb728c1b22edf65a9f880 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
 
@@ -112,7 +111,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx,
 }
 
 void Arguments::setSlotSubSequenceStartPositions(
-    size_t idx, IVector *vec) throw(RangeError) {
+    size_t idx, IVector* vec) throw(RangeError) {
   auto& a = m->getArg(idx);
   auto& v = m->cast(vec->getSharedPtr());
   a.subSequenceStartPositions = std::make_shared(v);
diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp
index 25d94f5a6a1255f3e2faff9816cfd003b20c0418..bc40d871d180a6bfe21200c866181dc161f5f078 100644
--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
 #include "paddle/trainer/Trainer.h"
@@ -44,8 +43,7 @@ TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
   return retv;
 }
 
-TrainerConfig* TrainerConfig::createFromProtoString(
-    const std::string& str) {
+TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
   auto retv = new TrainerConfig();
   paddle::TrainerConfig trainerConfigProto;
   auto conf = std::make_shared(trainerConfigProto);
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index bef499c67858b8e2d5432155a8defca56af6019c..9a4846d80980e23e97f89b6134e15af71207ae6b 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
 
@@ -27,7 +26,8 @@ GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
 GradientMachine::~GradientMachine() { delete m; }
 
 GradientMachine* GradientMachine::createFromPaddleModelPtr(
-    const void* confPtr, GradientMatchineCreateMode mode,
+    const void* confPtr,
+    GradientMatchineCreateMode mode,
     const std::vector& types) {
   auto& conf = *(const paddle::ModelConfig*)(confPtr);
   std::vector realTypes;
@@ -44,7 +44,8 @@ GradientMachine* GradientMachine::createFromPaddleModelPtr(
 }
 
 GradientMachine* GradientMachine::createByConfigProtoStr(
-    const std::string& protoStr, GradientMatchineCreateMode mode,
+    const std::string& protoStr,
+    GradientMatchineCreateMode mode,
     const std::vector& types) {
   paddle::ModelConfig conf;
   conf.ParseFromString(protoStr);
@@ -56,13 +57,15 @@ GradientMachine* GradientMachine::createByConfigProtoStr(
 }
 
 GradientMachine* GradientMachine::createByModelConfig(
-    ModelConfig* conf, GradientMatchineCreateMode mode,
+    ModelConfig* conf,
+    GradientMatchineCreateMode mode,
     const std::vector& types) {
   auto confPtr = &conf->m->conf->getModelConfig();
   return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
 }
 
-void GradientMachine::forward(const Arguments& inArgs, Arguments* outArgs,
+void GradientMachine::forward(const Arguments& inArgs,
+                              Arguments* outArgs,
                               PassType passType) {
   auto& in =
       m->cast>(inArgs.getInternalArgumentsPtr());
@@ -99,7 +102,8 @@ void GradientMachine::backward(const UpdateCallback& callback) {
 }
 
 void GradientMachine::forwardBackward(const Arguments& inArgs,
-                                      Arguments* outArgs, PassType passType,
+                                      Arguments* outArgs,
+                                      PassType passType,
                                       const UpdateCallback& callback) {
   auto& in =
       m->cast>(inArgs.getInternalArgumentsPtr());
@@ -129,7 +133,7 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
 void GradientMachine::randParameters() { m->machine->randParameters(); }
 
 Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
-  throw(UnsupportError) {
+    throw(UnsupportError) {
   auto nn = std::dynamic_pointer_cast(m->machine);
   if (nn) {
     auto mat = nn->getLayerOutput(layerName);
@@ -140,8 +144,11 @@ Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
 }
 
 SequenceGenerator* GradientMachine::asSequenceGenerator(
-    const std::vector& dict, size_t begin_id, size_t end_id,
-    size_t max_length, size_t beam_size) {
+    const std::vector& dict,
+    size_t begin_id,
+    size_t end_id,
+    size_t max_length,
+    size_t beam_size) {
   SequenceGenerator* r =
       SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
   r->setDict(dict);
diff --git a/paddle/api/Internal.h b/paddle/api/Internal.h
index b990f650be9fa401898a8c6d10c21d9c90eb728a..66a13bc603ed5098997f168d3f527160ac3822ef 100644
--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 
 #include "PaddleAPI.h"
@@ -23,7 +22,8 @@ limitations under the License. */
 template 
 void staticCastVector(std::vector* dest, const std::vector& src) {
   dest->resize(src.size());
-  std::transform(src.begin(), src.end(), dest->begin(), [](T1 t){
-    return static_cast(t);
-  });
+  std::transform(src.begin(),
+                 src.end(),
+                 dest->begin(),
+                 [](T1 t) { return static_cast(t); });
 }
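+
+// Usage sketch (the concrete element types are illustrative, since they are
+// template parameters): staticCastVector resizes *dest to src.size() and fills
+// it with static_cast'ed copies of src's elements.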
diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp
index e5493a381a6f9e3d135c14649a8e1e438494d363..f257ee65aa4a12dfcd1914ddbf0e16461a9b128c 100644
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "PaddleAPI.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
@@ -44,17 +43,21 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
   return m;
 }
 
-Matrix* Matrix::createDense(const std::vector& data, size_t height,
-                            size_t width, bool useGpu) {
+Matrix* Matrix::createDense(const std::vector& data,
+                            size_t height,
+                            size_t width,
+                            bool useGpu) {
   auto m = new Matrix();
   m->m->mat = paddle::Matrix::create(height, width, useGpu);
   m->m->mat->copyFrom(data.data(), data.size());
   return m;
 }
 
-Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
-                                      bool copy, bool useGpu)
-                                     throw (UnsupportError) {
+Matrix* Matrix::createDenseFromNumpy(float* data,
+                                     int dim1,
+                                     int dim2,
+                                     bool copy,
+                                     bool useGpu) throw(UnsupportError) {
   if (useGpu) {
     /// Gpu mode only supports copy=True
     if (!copy) {
@@ -66,7 +69,9 @@ Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
   }
 }
 
-Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+Matrix* Matrix::createCpuDenseFromNumpy(float* data,
+                                        int dim1,
+                                        int dim2,
                                         bool copy) {
   auto m = new Matrix();
   if (copy) {
@@ -85,12 +90,20 @@ Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
   return m;
 }
 
-Matrix* Matrix::createSparse(size_t height, size_t width, size_t nnz,
-                             bool isNonVal, bool isTrans, bool useGpu) {
+Matrix* Matrix::createSparse(size_t height,
+                             size_t width,
+                             size_t nnz,
+                             bool isNonVal,
+                             bool isTrans,
+                             bool useGpu) {
   auto m = new Matrix();
   m->m->mat = paddle::Matrix::createSparseMatrix(
-      height, width, nnz, isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
-      isTrans, useGpu);
+      height,
+      width,
+      nnz,
+      isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+      isTrans,
+      useGpu);
   return m;
 }
 
@@ -221,7 +234,8 @@ FloatArray Matrix::getData() const {
 }
 
 void Matrix::sparseCopyFrom(
-    const std::vector& rows, const std::vector& cols,
+    const std::vector& rows,
+    const std::vector& cols,
     const std::vector& vals) throw(UnsupportError) {
   auto cpuSparseMat =
       std::dynamic_pointer_cast(m->mat);
@@ -240,7 +254,8 @@ void Matrix::sparseCopyFrom(
 
 void* Matrix::getSharedPtr() const { return &m->mat; }
 
-void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
+void Matrix::toNumpyMatInplace(float** view_data,
+                               int* dim1,
                                int* dim2) throw(UnsupportError) {
   auto cpuMat = std::dynamic_pointer_cast(m->mat);
   if (cpuMat) {
@@ -251,7 +266,8 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
     throw UnsupportError();
   }
 }
-void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
+void Matrix::copyToNumpyMat(float** view_m_data,
+                            int* dim1,
                             int* dim2) throw(UnsupportError) {
   static_assert(sizeof(paddle::real) == sizeof(float),
                 "Currently PaddleAPI only support for single "
@@ -269,8 +285,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
     } else if (auto gpuMat = dynamic_cast(m->mat.get())) {
       auto src = gpuMat->getData();
       auto dest = *view_m_data;
-      hl_memcpy_device2host(dest, src,
-                            sizeof(paddle::real) * (*dim1) * (*dim2));
+      hl_memcpy_device2host(
+          dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
     } else {
       LOG(WARNING) << "Unexpected Situation";
       throw UnsupportError();
@@ -278,7 +294,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
   }
 }
 
-void Matrix::copyFromNumpyMat(float* data, int dim1,
+void Matrix::copyFromNumpyMat(float* data,
+                              int dim1,
                               int dim2) throw(UnsupportError, RangeError) {
   if (isSparse()) {
     throw UnsupportError();
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 5688ece44d2d58a2184a9f23d4af26c51c319579..c07facdb1292b34ac31247160a4347ea359e718b 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 
 #include 
@@ -61,8 +60,8 @@ class RangeError {};
 /// Not-supported error, such as accessing GPU memory directly, etc.
 class UnsupportError : public std::runtime_error {
 public:
-  UnsupportError() : std::runtime_error(" ") {};
-  UnsupportError(const std::string& message) : std::runtime_error(message) {};
+  UnsupportError() : std::runtime_error(" "){};
+  UnsupportError(const std::string& message) : std::runtime_error(message){};
 };
 
 /// This type will map to python's list of float.
@@ -112,7 +111,8 @@ public:
   /**
    * Create a Matrix with the given height and width, filled with zeros.
    */
-  static Matrix* createZero(size_t height, size_t width,
+  static Matrix* createZero(size_t height,
+                            size_t width,
                             bool useGpu = isUsingGpu());
 
   /**
@@ -124,8 +124,11 @@ public:
    *
    * @note the default sparse type is SPARSE_CSR.
    */
-  static Matrix* createSparse(size_t height, size_t width, size_t nnz,
-                              bool isNonVal = true, bool trans = false,
+  static Matrix* createSparse(size_t height,
+                              size_t width,
+                              size_t nnz,
+                              bool isNonVal = true,
+                              bool trans = false,
                               bool useGpu = isUsingGpu());
 
   /**
@@ -134,13 +137,17 @@ public:
    * @param data  a list of floats, passed in from Python.
    * @note        the values are copied into a new matrix.
    */
-  static Matrix* createDense(const std::vector& data, size_t height,
-                             size_t width, bool useGpu = isUsingGpu());
-
-  static Matrix* createDenseFromNumpy(float* data, int dim1, int dim2,
-                                      bool copy = true,
-                                      bool useGpu = isUsingGpu())
-                                      throw (UnsupportError);
+  static Matrix* createDense(const std::vector& data,
+                             size_t height,
+                             size_t width,
+                             bool useGpu = isUsingGpu());
+
+  static Matrix* createDenseFromNumpy(
+      float* data,
+      int dim1,
+      int dim2,
+      bool copy = true,
+      bool useGpu = isUsingGpu()) throw(UnsupportError);
 
   /**
    *  Create Cpu Dense Matrix from numpy matrix, dtype=float32
@@ -151,7 +158,9 @@ public:
    *  @param copy  true to copy the data into a new matrix; false to create
    *               the matrix in place.
    */
-  static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+  static Matrix* createCpuDenseFromNumpy(float* data,
+                                         int dim1,
+                                         int dim2,
                                          bool copy = false);
 
   /// Create Gpu Dense Matrix from numpy matrix, dtype=float32
@@ -171,11 +180,13 @@ public:
    * numpy_mat = m.toNumpyMat()
    * @endcode
    */
-  void toNumpyMatInplace(float** view_data, int* dim1,
+  void toNumpyMatInplace(float** view_data,
+                         int* dim1,
                          int* dim2) throw(UnsupportError);
 
   /// Copy To numpy mat.
-  void copyToNumpyMat(float** view_m_data, int* dim1,
+  void copyToNumpyMat(float** view_m_data,
+                      int* dim1,
                       int* dim2) throw(UnsupportError);
 
   /// Copy From Numpy Mat
@@ -248,15 +259,18 @@ public:
   static Vector* create(const std::vector& data,
                         bool useGpu = isUsingGpu());
 
-  static Vector* createVectorFromNumpy(float* data, int dim, bool copy = true,
-                                       bool useGpu = isUsingGpu())
-                                       throw (UnsupportError);
+  static Vector* createVectorFromNumpy(
+      float* data,
+      int dim,
+      bool copy = true,
+      bool useGpu = isUsingGpu()) throw(UnsupportError);
   /**
    * Create a Cpu Vector from a numpy array with dtype=float32.
    *
    * If copy is false, the vector is created in place.
    */
-  static Vector* createCpuVectorFromNumpy(float* data, int dim,
+  static Vector* createCpuVectorFromNumpy(float* data,
+                                          int dim,
                                           bool copy = false);
 
   /// Create Gpu Vector from numpy array, which dtype=float32
@@ -312,16 +326,19 @@ public:
   static IVector* create(const std::vector& data,
                          bool useGpu = isUsingGpu());
 
-  static IVector* createVectorFromNumpy(int* data, int dim, bool copy = true,
-                                        bool useGpu = isUsingGpu())
-                                        throw (UnsupportError);
+  static IVector* createVectorFromNumpy(
+      int* data,
+      int dim,
+      bool copy = true,
+      bool useGpu = isUsingGpu()) throw(UnsupportError);
 
   /**
    * Create a Cpu IVector from a numpy array with dtype=int32.
    *
    * If copy is false, the vector is created in place.
    */
-  static IVector* createCpuVectorFromNumpy(int* data, int dim,
+  static IVector* createCpuVectorFromNumpy(int* data,
+                                           int dim,
                                            bool copy = false);
   /**
    * Create a Gpu IVector from a numpy array with dtype=int32.
@@ -605,7 +622,8 @@ class ParameterTraverseCallback {
 public:
   ~ParameterTraverseCallback();
 
-  void apply(const std::vector& vecs, const ParameterConfig& config,
+  void apply(const std::vector& vecs,
+             const ParameterConfig& config,
              size_t sparseId);
 
 private:
@@ -638,7 +656,8 @@ public:
 
   void finishBatch();
 
-  void update(const std::vector& vecs, const ParameterConfig& conf,
+  void update(const std::vector& vecs,
+              const ParameterConfig& conf,
               size_t sparseId = NO_SPARSE_ID);
 
   std::vector getParameterTypes() const;
@@ -678,7 +697,8 @@ public:
    * model config by TrainerConfig
    */
   static GradientMachine* createByModelConfig(
-      ModelConfig* conf, GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
+      ModelConfig* conf,
+      GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
       const std::vector& parameterTypes = defaultParamTypes);
 
   /**
@@ -701,7 +721,8 @@ public:
   /**
    * Combine forward/backward
    */
-  void forwardBackward(const Arguments& inArgs, Arguments* outArgs,
+  void forwardBackward(const Arguments& inArgs,
+                       Arguments* outArgs,
                        PassType passType,
                        const UpdateCallback& callback = UpdateCallback());
 
@@ -722,14 +743,17 @@ public:
    */
   SequenceGenerator* asSequenceGenerator(
       const std::vector& dict = std::vector(),
-      size_t begin_id = 0UL, size_t end_id = 0UL, size_t max_length = 100UL,
+      size_t begin_id = 0UL,
+      size_t end_id = 0UL,
+      size_t max_length = 100UL,
       size_t beam_size = -1UL);
 
 private:
   GradientMachinePrivate* m;
 
   static GradientMachine* createFromPaddleModelPtr(
-      const void* confPtr, GradientMatchineCreateMode mode,
+      const void* confPtr,
+      GradientMatchineCreateMode mode,
       const std::vector& types);
 
   // We avoid C++ 11 initializer lists, so a static var serves as the function default arg.
@@ -751,8 +775,8 @@ public:
   /// Create A Trainer By TrainerConfig. using paddle command line.
   static Trainer* createByCommandLine() throw(IOError);
 
-  static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm)
-      throw(IOError);
+  static Trainer* create(TrainerConfig* optConfig,
+                         GradientMachine* gm) throw(IOError);
 
   /// Start training
   void startTrain();
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index 8b56adc97c2d6178a9e0b272a9af89732a3573f6..c5876bb1c71438578831ffffd85840c706b6224c 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "PaddleAPI.h"
 #include "paddle/parameter/Parameter.h"
 
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index b13761ab0900d57008c17094c5199ef31a040f54..21d031e4bcb897eb693e5cff56bc77a637dc6bd2 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
 #include "paddle/parameter/ParameterOptimizer.h"
@@ -32,17 +31,21 @@ struct ParameterTraverseCallbackPrivate {
       const paddle::ParameterOptimizer::TraverseCallback& callback)
       : callback(callback) {}
 
-  void apply(const std::vector& vecs, const ParameterConfig& conf,
+  void apply(const std::vector& vecs,
+             const ParameterConfig& conf,
              size_t sparseId) {
     std::vector real_vecs;
     real_vecs.resize(vecs.size());
-    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
-      if (v) {
-        return *(paddle::VectorPtr*)(v->getSharedPtr());
-      } else {
-        return paddle::VectorPtr();
-      }
-    });
+    std::transform(vecs.begin(),
+                   vecs.end(),
+                   real_vecs.begin(),
+                   [](Vector* v) {
+                     if (v) {
+                       return *(paddle::VectorPtr*)(v->getSharedPtr());
+                     } else {
+                       return paddle::VectorPtr();
+                     }
+                   });
 
     paddle::ParameterConfig& real_conf =
         *(paddle::ParameterConfig*)(const_cast(conf)
@@ -86,10 +89,12 @@ void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
 void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
 
 void ParameterOptimizer::update(const std::vector& vecs,
-                                const ParameterConfig& conf, size_t sparseId) {
-  ParameterTraverseCallbackPrivate invoker([&](
-      const paddle::VectorPtr _vecs[], const paddle::ParameterConfig& config,
-      size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
+                                const ParameterConfig& conf,
+                                size_t sparseId) {
+  ParameterTraverseCallbackPrivate invoker(
+      [&](const paddle::VectorPtr _vecs[],
+          const paddle::ParameterConfig& config,
+          size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
   invoker.apply(vecs, conf, sparseId);
 }
 
@@ -116,8 +121,9 @@ void ParameterTraverseCallback::apply(const std::vector& vecs,
 
 ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
     const ParameterConfig& config) const {
-  auto& param_config = *(paddle::ParameterConfig*)const_cast(
-                            config).getRawPtr();
+  auto& param_config =
+      *(paddle::ParameterConfig*)const_cast(config)
+           .getRawPtr();
   auto callback = m->optimizer->needSpecialTraversal(param_config);
   if (callback) {
     auto retCallback = new ParameterTraverseCallback();
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 9d353ccc8e281e72a207ba19f45517fd256d6df2..d51be78d45902967107f4bf0af995958faed931a 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "PaddleAPI.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/parameter/Argument.h"
@@ -42,8 +41,10 @@ struct Path {
 // position
 static void findNBest(paddle::GradientMachine* gradMachine,
                       std::vector& inArgs,
-                      std::vector& finalPaths, size_t bos_id,
-                      size_t eos_id, size_t max_length) {
+                      std::vector& finalPaths,
+                      size_t bos_id,
+                      size_t eos_id,
+                      size_t max_length) {
   std::vector paths;
   Path emptyPath;
   paths.push_back(emptyPath);
@@ -166,7 +167,8 @@ public:
     if (id < getSize()) {
       Path& p = (*path_)[id];
       std::ostringstream sout;
-      std::transform(p.ids.begin(), p.ids.end(),
+      std::transform(p.ids.begin(),
+                     p.ids.end(),
                      std::ostream_iterator(sout, split ? " " : ""),
                      [&](int id) { return (*dict_)[id]; });
       return sout.str();
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index b61f36f740d47fe785b30361f26059bf0b64829d..7a6aa69fb652313748b1fa787847ffd74fda7a22 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -64,12 +64,11 @@ Trainer* Trainer::createByCommandLine() throw(IOError) {
 
 Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
     : m(new TrainerPrivate()) {
-  m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr);
+  m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
 }
 
-Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm)
-    throw(IOError)
-{
+Trainer* Trainer::create(TrainerConfig* config,
+                         GradientMachine* gm) throw(IOError) {
   auto retv = new Trainer(config, gm);
   if (retv->m->getConfig().IsInitialized()) {
     return retv;
@@ -134,15 +133,17 @@ void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
 
 Matrix* Trainer::getLayerOutput(const std::string& layerName) {
   auto nn = std::dynamic_pointer_cast(
-          this->m->getGradientMachine());
+      this->m->getGradientMachine());
   CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
   auto m = nn->getLayerOutput(layerName);
   return Matrix::createByPaddleMatrixPtr(&m);
 }
 
-void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); }
+void Trainer::forwardOneBatch(size_t batchSize) {
+  m->forwardOneBatch(batchSize);
+}
 
-bool TrainerPrivate::forwardOneBatch(size_t batchSize)  {
+bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
   CHECK(dataProvider_) << "data_provider is not specified";
   paddle::DataBatch dataBatch;
   int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
@@ -156,7 +157,6 @@ bool TrainerPrivate::forwardOneBatch(size_t batchSize)  {
 
 void TrainerPrivate::forwardOneDataBatch(
     const std::vector& inArgs) {
-
   std::vector& outArgs = forwardOutput_;
 
   if (config_->getOptConfig().use_sparse_remote_updater()) {
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index a8932351a685474a756c3f5b0e5e8c42bbf58237..1bba1df2e1c0a2d3cd2d8307ed3a0d784bb949b4 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -37,13 +37,15 @@ FloatArray::FloatArray(const float* b, const size_t l)
 IntArray::IntArray(const int* b, const size_t l, bool f)
     : buf(b), length(l), needFree(f) {}
 
-IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l,
+IntWithFloatArray::IntWithFloatArray(const float* v,
+                                     const int* i,
+                                     size_t l,
                                      bool f)
     : valBuf(v), idxBuf(i), length(l), needFree(f) {}
 
-bool isUsingGpu() {return FLAGS_use_gpu;}
+bool isUsingGpu() { return FLAGS_use_gpu; }
 
-void setUseGpu(bool useGpu) {FLAGS_use_gpu = useGpu;}
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
 
 bool isGpuVersion() {
 #ifdef PADDLE_ONLY_CPU
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index d44cdefc35bd09e04412b52fb9981947caf89588..cc1c098223826a06fea291a95730d7fc1fd1beb3 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "PaddleAPI.h"
 
 #include "paddle/math/Vector.h"
@@ -39,8 +38,10 @@ IVector* IVector::create(const std::vector& data, bool useGpu) {
   return v;
 }
 
-IVector* IVector::createVectorFromNumpy(int* data, int dim, bool copy,
-                                        bool useGpu) throw (UnsupportError){
+IVector* IVector::createVectorFromNumpy(int* data,
+                                        int dim,
+                                        bool copy,
+                                        bool useGpu) throw(UnsupportError) {
   if (useGpu) {
     /// if use gpu only copy=true is supported
     if (!copy) {
@@ -137,8 +138,8 @@ void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
   if (auto cpuVec = dynamic_cast(m->vec.get())) {
     std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
   } else if (auto gpuVec = dynamic_cast(m->vec.get())) {
-    hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
-                          sizeof(int) * (*dim1));
+    hl_memcpy_device2host(
+        *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
   } else {
     LOG(INFO) << "Unexpected situation";
   }
@@ -201,8 +202,10 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) {
   }
 }
 
-Vector* Vector::createVectorFromNumpy(float* data, int dim, bool copy,
-                                      bool useGpu) throw (UnsupportError){
+Vector* Vector::createVectorFromNumpy(float* data,
+                                      int dim,
+                                      bool copy,
+                                      bool useGpu) throw(UnsupportError) {
   if (useGpu) {
     /// if use gpu only copy=True is supported
     if (!copy) {
@@ -251,8 +254,8 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
   if (auto cpuVec = dynamic_cast(m->vec.get())) {
     std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
   } else if (auto gpuVec = dynamic_cast(m->vec.get())) {
-    hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
-                          sizeof(float) * (*dim1));
+    hl_memcpy_device2host(
+        *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
   } else {
     LOG(INFO) << "Unexpected situation";
   }
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index c8aabc7844cd48d7ebdd0077684f9efa50f941a2..03e15b2223a50625c6999f6b081ae984e76b182b 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_ACTIVATION_FUNCTIONS_H_
 #define HL_ACTIVATION_FUNCTIONS_H_
 
@@ -21,11 +20,8 @@ limitations under the License. */
 /**
  * Active functions: sigmoid, relu, tanh and linear.
  */
-#define HPPL_ACTIVE_FUNCTION  {hppl::sigmoid,   \
-                               hppl::relu,      \
-                               hppl::tanh,      \
-                               hppl::linear     \
-                              }
+#define HPPL_ACTIVE_FUNCTION \
+  { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
 
 namespace hppl {
 
@@ -42,18 +38,18 @@ public:
 
 #ifdef __NVCC__
 namespace gpu {
-static __device__ Active::forward  forward[]  = HPPL_ACTIVE_FUNCTION;
+static __device__ Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static __device__ Active::backward backward[] = HPPL_ACTIVE_FUNCTION;
 }
 #else
 namespace cpu {
-static Active::forward  forward[] = HPPL_ACTIVE_FUNCTION;
+static Active::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static Active::backward backward[] = HPPL_ACTIVE_FUNCTION;
 }
 
 #ifdef __AVX__
 namespace avx {
-static Active<__m256>::forward  forward[] = HPPL_ACTIVE_FUNCTION;
+static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
 }
 #endif
diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/cuda/include/hl_aggregate.h
index db75809f5de195d41577ed6569e8508f48241b69..a6d9ff8483eee28b2c8a380f0aca097c7662a02e 100644
--- a/paddle/cuda/include/hl_aggregate.h
+++ b/paddle/cuda/include/hl_aggregate.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_AGGREGATE_H_
 #define HL_AGGREGATE_H_
 
diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/cuda/include/hl_avx_functions.h
index cf062dd969bf79554e00369367e3b85c2ae7fc0d..ed339e312a7639cf9b78f130a43d67a7446576bb 100644
--- a/paddle/cuda/include/hl_avx_functions.h
+++ b/paddle/cuda/include/hl_avx_functions.h
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_AVX_FUNCTIONS_H_
 #define HL_AVX_FUNCTIONS_H_
 
 #include 
 
 namespace hppl {
-  __m256 relu(const __m256 a);
-  __m256 sigmoid(const __m256 a);
-  __m256 tanh(const __m256 a);
-  __m256 linear(const __m256 a);
-
-  __m256 relu(const __m256 a, const __m256 b);
-  __m256 sigmoid(const __m256 a, const __m256 b);
-  __m256 tanh(const __m256 a, const __m256 b);
-  __m256 linear(const __m256 a, const __m256 b);
+__m256 relu(const __m256 a);
+__m256 sigmoid(const __m256 a);
+__m256 tanh(const __m256 a);
+__m256 linear(const __m256 a);
+
+__m256 relu(const __m256 a, const __m256 b);
+__m256 sigmoid(const __m256 a, const __m256 b);
+__m256 tanh(const __m256 a, const __m256 b);
+__m256 linear(const __m256 a, const __m256 b);
 }  // namespace hppl
 
 #endif  // HL_AVX_FUNCTIONS_H_
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 9f80898a1f927a0e8bbf86108567a04ccecc38f5..a076952467a5ce10dc1f58007dda2170aa694fbb 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
-
 #ifndef HL_BASE_H_
 #define HL_BASE_H_
 
@@ -33,36 +31,36 @@ limitations under the License. */
  *          HPPL_STREAM_DEFAULT is HPPL default stream.
  */
 typedef enum {
-    HPPL_STREAM_DEFAULT = 0,    /* Thread Default Stream*/
-    HPPL_STREAM_1 = 1,
-    HPPL_STREAM_2 = 2,
-    HPPL_STREAM_3 = 3,
-    HPPL_STREAM_4 = 4,
-    HPPL_THREAD_STREAM_1 = 5,
-    HPPL_THREAD_STREAM_2 = 6,
-    HPPL_THREAD_STREAM_3 = 7,
-    HPPL_THREAD_STREAM_4 = 8,
-    HPPL_STREAM_END
+  HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
+  HPPL_STREAM_1 = 1,
+  HPPL_STREAM_2 = 2,
+  HPPL_STREAM_3 = 3,
+  HPPL_STREAM_4 = 4,
+  HPPL_THREAD_STREAM_1 = 5,
+  HPPL_THREAD_STREAM_2 = 6,
+  HPPL_THREAD_STREAM_3 = 7,
+  HPPL_THREAD_STREAM_4 = 8,
+  HPPL_STREAM_END
 } hl_stream_t;
 
 /**
  * @brief HPPL activation mode.
  */
 typedef enum {
-    HL_ACTIVATION_SIGMOID   = 0,
-    HL_ACTIVATION_RELU      = 1,
-    HL_ACTIVATION_TANH      = 2,
-    HL_ACTIVATION_LINEAR    = 3,
-    HL_ACTIVATION_END
+  HL_ACTIVATION_SIGMOID = 0,
+  HL_ACTIVATION_RELU = 1,
+  HL_ACTIVATION_TANH = 2,
+  HL_ACTIVATION_LINEAR = 3,
+  HL_ACTIVATION_END
 } hl_activation_mode_t;
 
 /**
  * @brief Transpose type.
  */
 typedef enum {
-    HPPL_OP_N = 0, /* transpose */
-    HPPL_OP_T = 1, /* non transpose */
-    HPPL_OP_END
+  HPPL_OP_N = 0, /* non-transpose */
+  HPPL_OP_T = 1, /* transpose */
+  HPPL_OP_END
 } hl_trans_op_t;
 
 /**
@@ -148,23 +146,21 @@ typedef struct {
  * @brief  Sparse matrix value type.
  */
 typedef enum {
-    HL_NO_VALUE = 0,                       /* matrix values only 0 or 1 */
-    HL_FLOAT_VALUE = 1,
-    HL_VALUE_END
+  HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
+  HL_FLOAT_VALUE = 1,
+  HL_VALUE_END
 } hl_matrix_value_t;
 
-
 /**
  * @brief  HPPL matrix format.
  */
 typedef enum {
-    HL_SPARSE_CSR = 0,
-    HL_SPARSE_CSC = 1,
-    HL_SPARSE_END
+  HL_SPARSE_CSR = 0,
+  HL_SPARSE_CSC = 1,
+  HL_SPARSE_END
 } hl_matrix_format_t;
 
-
-typedef struct _hl_matrix_s * hl_matrix_s;
+typedef struct _hl_matrix_s *hl_matrix_s;
 
 /**
  * @brief   HPPL sparse matrix.
@@ -177,12 +173,12 @@ typedef struct _hl_matrix_s * hl_matrix_s;
  * @param  nnz        nonzero values of sparse matrix.
  */
 typedef struct {
-    hl_matrix_s             matrix;
-    hl_matrix_format_t      format;
-    hl_matrix_value_t       type;
-    int                     rows;
-    int                     cols;
-    size_t                  nnz;
+  hl_matrix_s matrix;
+  hl_matrix_format_t format;
+  hl_matrix_value_t type;
+  int rows;
+  int cols;
+  size_t nnz;
 } _hl_sparse_matrix_s, *hl_sparse_matrix_s;
 
 #ifndef PADDLE_TYPE_DOUBLE
@@ -195,7 +191,7 @@ typedef struct {
  *
  * HL_FLOAT_MIN: 1.17549435e-38F
  */
-#define HL_FLOAT_MAX        3.40282347e+38F
+#define HL_FLOAT_MAX 3.40282347e+38F
 /**
  * if real == double
  *
@@ -203,20 +199,18 @@ typedef struct {
  *
  * HL_FLOAT_MIN: 2.2250738585072014e-308
  */
-#define HL_FLOAT_MIN        1.17549435e-38F
+#define HL_FLOAT_MIN 1.17549435e-38F
 #else
-#define HL_FLOAT_MAX        1.7976931348623157e+308
-#define HL_FLOAT_MIN        2.2250738585072014e-308
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
 #endif
 
-
 /**
  * The maximum input value for exp, used to avoid overflow problem.
  *
  * Currently only used for tanh function.
  */
-#define EXP_MAX_INPUT       40.0
-
+#define EXP_MAX_INPUT 40.0
 
 /**
  * @brief DIVUP(x, y) is similar to ceil(x / y).
@@ -224,7 +218,7 @@ typedef struct {
  *        the size of blockDim.
  */
 #ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
 #endif
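+
+// e.g. DIVUP(10, 3) == 4: with a block size of y, DIVUP(x, y) blocks are
+// enough to cover x elements.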
 
 #ifdef __NVCC__
@@ -233,7 +227,7 @@ typedef struct {
 #include "hl_cuda.h"
 #include "cuda_runtime.h"
 
-extern  __thread bool g_sync_flag;
+extern __thread bool g_sync_flag;
 extern __thread cudaStream_t default_stream;
 #define STREAM_DEFAULT default_stream
 
@@ -241,16 +235,15 @@ extern __thread cudaStream_t default_stream;
  * @brief   Check cuda kernel execution.
  * @param   msg   error string
  */
-#define CHECK_SYNC(msg)                                   \
-  if (true == g_sync_flag) {                              \
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);           \
-    cudaError_t err                                       \
-      = (cudaError_t)hl_get_device_last_error();          \
-    CHECK_EQ(cudaSuccess, err) << "[" << msg << "] "      \
-      << "CUDA error: "                                   \
-      << hl_get_device_error_string((size_t)err);         \
+#define CHECK_SYNC(msg)                                               \
+  if (true == g_sync_flag) {                                          \
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);                       \
+    cudaError_t err = (cudaError_t)hl_get_device_last_error();        \
+    CHECK_EQ(cudaSuccess, err)                                        \
+        << "[" << msg << "] "                                         \
+        << "CUDA error: " << hl_get_device_error_string((size_t)err); \
   }
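+
+// Typical use right after launching a kernel (`myKernel` is hypothetical):
+//   myKernel<<<grid, block>>>(args);
+//   CHECK_SYNC("myKernel failed");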
 
-#endif  /* __NVCC__ */
+#endif /* __NVCC__ */
 
-#endif  /* HL_BASE_H_ */
+#endif /* HL_BASE_H_ */
diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/cuda/include/hl_batch_transpose.h
index 414c7996acee4ccbe2d7dbd093e25a23119fea3c..f3630e9762508fd39935e62e0007de04f9140fff 100644
--- a/paddle/cuda/include/hl_batch_transpose.h
+++ b/paddle/cuda/include/hl_batch_transpose.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_BATCH_TRANSPOSE_H_
 #define HL_BATCH_TRANSPOSE_H_
 
@@ -31,10 +30,7 @@ limitations under the License. */
  *          order. Each batch has height * width data, which are
  *          arranged in height-first (or row-first) manner.
  */
-extern void batchTranspose(const real* input,
-                           real* output,
-                           int width,
-                           int height,
-                           int batchSize);
+extern void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize);
 
 #endif  // HL_BATCH_TRANSPOSE_H_
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 70b5be6fda2509853029a68d31129df28d580942..cffaac634f0f64be5ddab961d549ae43775bb7b0 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_CNN_H_
 #define HL_CNN_H_
 
@@ -37,15 +36,21 @@ limitations under the License. */
  * @param[in]   alpha
  * @param[in]   beta
  */
-extern void hl_shrink_col2feature(
-    const real * dataCol, size_t channels,
-    size_t height, size_t width,
-    size_t blockH, size_t blockW,
-    size_t strideH, size_t strideW,
-    size_t paddingH, size_t paddingW,
-    size_t outputH, size_t outputW,
-    real* dataIm,
-    real alpha = 1.0f, real beta = 0.0f);
+extern void hl_shrink_col2feature(const real* dataCol,
+                                  size_t channels,
+                                  size_t height,
+                                  size_t width,
+                                  size_t blockH,
+                                  size_t blockW,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real* dataIm,
+                                  real alpha = 1.0f,
+                                  real beta = 0.0f);
 
 /**
  * @brief   Expand feature to column.
@@ -65,14 +70,19 @@ extern void hl_shrink_col2feature(
  * @param[out]  dataCol     expand data.
  *
  */
-extern void hl_expand_feature2col(
-    const real* dataIm, size_t channels,
-    size_t height, size_t width,
-    size_t blockH, size_t blockW,
-    size_t strideH, size_t strideW,
-    size_t paddingH, size_t paddingW,
-    size_t outputH, size_t outputW,
-    real* dataCol);
+extern void hl_expand_feature2col(const real* dataIm,
+                                  size_t channels,
+                                  size_t height,
+                                  size_t width,
+                                  size_t blockH,
+                                  size_t blockW,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real* dataCol);
 
 /**
  * @brief   Maximum pool forward.
@@ -94,15 +104,21 @@ extern void hl_expand_feature2col(
  * @param[in]   tgtStride   stride between output data samples.
  *
  */
-extern void hl_maxpool_forward(
-    const int frameCnt, const real* inputData,
-    const int channels,
-    const int height, const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    const int paddingH, const int paddingW,
-    real* tgtData, const int tgtStride);
+extern void hl_maxpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride);
 
 /**
  * @brief   Maximum pool backward.
@@ -125,20 +141,28 @@ extern void hl_maxpool_forward(
  * @param[in]   paddingH    padding height.
  * @param[in]   paddingW    padding width.
  * @param[out]  targetGrad  output grad.
- * @param[in]   outStride   stride between output data samples. 
+ * @param[in]   outStride   stride between output data samples.
  *
  */
-extern void hl_maxpool_backward(
-    const int frameCnt, const real* inputData,
-    const real* outData, const real* outGrad,
-    const int channels, const int height,
-    const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    const int paddingH, const int paddingW,
-    real scaleA, real scaleB,
-    real* targetGrad, const int outStride);
+extern void hl_maxpool_backward(const int frameCnt,
+                                const real* inputData,
+                                const real* outData,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                const int paddingH,
+                                const int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* targetGrad,
+                                const int outStride);
 
 /**
  * @brief   Average pool forward.
@@ -160,15 +184,21 @@ extern void hl_maxpool_backward(
  * @param[in]   tgtStride   stride between output data samples.
  *
  */
-extern void hl_avgpool_forward(
-    const int frameCnt, const real* inputData,
-    const int channels,
-    const int height, const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    const int paddingH, const int paddingW,
-    real* tgtData, const int tgtStride);
+extern void hl_avgpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride);
 
 /**
 * @brief   Average pool backward.
@@ -189,19 +219,26 @@ extern void hl_avgpool_forward(
  * @param[in]   scaleA      scale.
  * @param[in]   scaleB      scale.
  * @param[out]  backGrad    output grad.
- * @param[in]   outStride   stride between output data samples. 
+ * @param[in]   outStride   stride between output data samples.
  *
  */
-extern void hl_avgpool_backward(
-    const int frameCnt, const real* outGrad,
-    const int channels, const int height,
-    const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    int paddingH, int paddingW,
-    real scaleA, real scaleB,
-    real* backGrad, const int outStride);
+extern void hl_avgpool_backward(const int frameCnt,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                int paddingH,
+                                int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* backGrad,
+                                const int outStride);
 
 /**
 * @brief   Cross-map-response normalize forward.
@@ -218,10 +255,16 @@ extern void hl_avgpool_backward(
  * @param[in]   beta        scale.
  *
  */
-extern void hl_CMRNorm_forward(
-    size_t frameCnt, const real* in, real* scale, real* out,
-    size_t channels, size_t height, size_t width, size_t sizeX,
-    real alpha, real beta);
+extern void hl_CMRNorm_forward(size_t frameCnt,
+                               const real* in,
+                               real* scale,
+                               real* out,
+                               size_t channels,
+                               size_t height,
+                               size_t width,
+                               size_t sizeX,
+                               real alpha,
+                               real beta);
 
 /**
 * @brief   Cross-map-response normalize backward.
@@ -240,11 +283,18 @@ extern void hl_CMRNorm_forward(
  * @param[in]   beta        scale.
  *
  */
-extern void hl_CMRNorm_backward(
-    size_t frameCnt, const real* inV, const real* scale,
-    const real* outV, const real* outDiff, real *inDiff,
-    size_t channels, size_t height, size_t width, size_t sizeX,
-    real alpha, real beta);
+extern void hl_CMRNorm_backward(size_t frameCnt,
+                                const real* inV,
+                                const real* scale,
+                                const real* outV,
+                                const real* outDiff,
+                                real* inDiff,
+                                size_t channels,
+                                size_t height,
+                                size_t width,
+                                size_t sizeX,
+                                real alpha,
+                                real beta);
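
hl_CMRNorm_forward/backward implement cross-map (local response) normalization. For reference, the classic AlexNet-style forward formula for one spatial position is sketched below; whether HPPL folds alpha by sizeX or defines the channel window exactly this way is an assumption, not something this header states.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>

typedef float real;

// AlexNet-style cross-map normalization at one pixel (assumed reference):
//   scale[c] = 1 + alpha * sum over the sizeX-wide channel window of in^2
//   out[c]   = in[c] / scale[c]^beta
void cmr_norm_pixel(const real* in, real* scale, real* out,
                    size_t channels, size_t sizeX, real alpha, real beta) {
  for (size_t c = 0; c < channels; ++c) {
    size_t begin = c >= sizeX / 2 ? c - sizeX / 2 : 0;
    size_t end = std::min(c + sizeX / 2, channels - 1);
    real sum = 0;
    for (size_t i = begin; i <= end; ++i) sum += in[i] * in[i];
    scale[c] = 1 + alpha * sum;
    out[c] = in[c] / std::pow(scale[c], beta);
  }
}
```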
 
 /**
  * @brief   Bilinear interpolation forward.
@@ -278,24 +328,24 @@ extern void hl_bilinear_forward(const real* inData,
                                 const real ratioH,
                                 const real ratioW);
 
- /**
- * @brief   Bilinear interpolation backward.
- *
- * @param[out]  inGrad      input gradient.
- * @param[in]   inImgH      input image height.
- * @param[in]   inImgW      input image width.
- * @param[in]   inputH      input batchSize.
- * @param[in]   inputW      input image data dim.
- * @param[in]   outGrad     output gradient.
- * @param[in]   outImgH     output image height.
- * @param[in]   outImgW     output image width.
- * @param[in]   outputH     output batchSize.
- * @param[in]   outputW     output image data dim.
- * @param[in]   numChannels number of channels.
- * @param[in]   ratioH      inImgH / outImgH.
- * @param[in]   ratioW      inImgW / outImgW.
- *
- */                               
+/**
+* @brief   Bilinear interpolation backward.
+*
+* @param[out]  inGrad      input gradient.
+* @param[in]   inImgH      input image height.
+* @param[in]   inImgW      input image width.
+* @param[in]   inputH      input batchSize.
+* @param[in]   inputW      input image data dim.
+* @param[in]   outGrad     output gradient.
+* @param[in]   outImgH     output image height.
+* @param[in]   outImgW     output image width.
+* @param[in]   outputH     output batchSize.
+* @param[in]   outputW     output image data dim.
+* @param[in]   numChannels number of channels.
+* @param[in]   ratioH      inImgH / outImgH.
+* @param[in]   ratioW      inImgW / outImgW.
+*
+*/
 extern void hl_bilinear_backward(real* inGrad,
                                  const size_t inImgH,
                                  const size_t inImgW,
@@ -321,9 +371,13 @@ extern void hl_bilinear_backward(real* inGrad,
  * @param[in]   featLen     feature length = image height * image width.
  * @param[in]   groups      number of groups.
  */
-extern void hl_maxout_forward(
-    const real* inData, real* outData, int* idData,
-    size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_forward(const real* inData,
+                              real* outData,
+                              int* idData,
+                              size_t batchSize,
+                              size_t size,
+                              size_t featLen,
+                              size_t groups);
 
 /**
  * @brief   MaxOut backward.
@@ -336,8 +390,12 @@ extern void hl_maxout_forward(
  * @param[in]   featLen     feature length = image height * image width.
  * @param[in]   groups      number of groups.
  */
-extern void hl_maxout_backward(
-    real* inGrad, const real* outGrad, const int* idData,
-    size_t batchSize, size_t size, size_t featLen, size_t groups);
+extern void hl_maxout_backward(real* inGrad,
+                               const real* outGrad,
+                               const int* idData,
+                               size_t batchSize,
+                               size_t size,
+                               size_t featLen,
+                               size_t groups);
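
Maxout takes an element-wise maximum over `groups` consecutive feature maps and records the winning group in idData so the backward pass can route gradients. A CPU reference is sketched below under an assumed layout (input [batchSize, size * groups, featLen] contiguous, output [batchSize, size, featLen]; the header does not document `size`, read here as the number of output maps).

```cpp
#include <cstddef>

typedef float real;

// CPU reference for maxout forward; layout assumptions as stated above.
void maxout_forward_cpu(const real* inData, real* outData, int* idData,
                        size_t batchSize, size_t size, size_t featLen,
                        size_t groups) {
  for (size_t b = 0; b < batchSize; ++b) {
    for (size_t o = 0; o < size; ++o) {
      for (size_t f = 0; f < featLen; ++f) {
        size_t base = ((b * size + o) * groups) * featLen + f;
        real best = inData[base];
        size_t bestG = 0;
        for (size_t g = 1; g < groups; ++g) {
          real v = inData[base + g * featLen];
          if (v > best) {
            best = v;
            bestG = g;
          }
        }
        size_t outIdx = (b * size + o) * featLen + f;
        outData[outIdx] = best;       // winning value
        idData[outIdx] = (int)bestG;  // winning group, for backward routing
      }
    }
  }
}
```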
 
 #endif /* HL_CNN_H_ */
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h
index 3196db67f61fd2e6b75df4abb3652df4456a0366..357286e3188a6f3184bc56e75232bf2e1ec54e44 100644
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_CUDA_H_
 #define HL_CUDA_H_
 
@@ -22,8 +21,7 @@ limitations under the License. */
 /**
  * @brief   HPPL event.
  */
-typedef struct _hl_event_st *  hl_event_t;
-
+typedef struct _hl_event_st *hl_event_t;
 
 /**
  * @brief return cuda runtime api version.
@@ -42,7 +40,7 @@ extern void hl_start();
  *                      if device is NULL, will start all GPU.
  * @param[in]   number  number of devices.
  */
-extern void hl_specify_devices_start(int* device, int number);
+extern void hl_specify_devices_start(int *device, int number);
 
 /**
  * @brief   Queries if a device may directly access a peer device's memory.
@@ -126,7 +124,7 @@ extern int hl_get_device();
  *
  * @return      dest_d   pointer to device memory.
  */
-extern void* hl_malloc_device(size_t size);
+extern void *hl_malloc_device(size_t size);
 
 /**
  * @brief   Free device memory.
@@ -143,7 +141,7 @@ extern void hl_free_mem_device(void *dest_d);
  *
  * @return      dest_h   pointer to host memory.
  */
-extern void* hl_malloc_host(size_t size);
+extern void *hl_malloc_host(size_t size);
 
 /**
  * @brief   Free host page-lock memory.
@@ -228,9 +226,9 @@ extern void hl_srand(unsigned int seed);
  * @param[in]   stream  stream id.
  */
 extern void hl_memcpy_async(void *dst,
-                           void *src,
-                           size_t size,
-                           hl_stream_t stream);
+                            void *src,
+                            size_t size,
+                            hl_stream_t stream);
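
A typical host-to-device round trip with these primitives, as a sketch: the page-locked memory from hl_malloc_host is what makes the asynchronous copy effective. HPPL_STREAM_DEFAULT, hl_stream_synchronize, and hl_free_mem_host are assumed names, inferred from the surrounding (partly elided) declarations.

```cpp
#include "hl_cuda.h"

void async_copy_example(size_t n) {
  // Page-locked host buffer and device buffer (both declared above).
  real* host = (real*)hl_malloc_host(n * sizeof(real));
  real* dev = (real*)hl_malloc_device(n * sizeof(real));
  // ... fill `host` ...
  hl_memcpy_async(dev, host, n * sizeof(real), HPPL_STREAM_DEFAULT);
  hl_stream_synchronize(HPPL_STREAM_DEFAULT);  // assumed name, see above
  hl_free_mem_device(dev);
  hl_free_mem_host(host);  // assumed name, per "free host page-lock memory"
}
```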
 
 /**
  * @brief   Waits for stream tasks to complete.
@@ -261,8 +259,7 @@ extern void hl_destroy_event(hl_event_t event);
  *
  * @return      time   Time between start and end in ms.
  */
-extern float hl_event_elapsed_time(hl_event_t start,
-                                   hl_event_t end);
+extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end);
 
 /**
  * @brief   Records an event.
@@ -300,7 +297,7 @@ extern void hl_set_device_flags_block();
 /**
  * @brief   Returns the last error string from a cuda runtime call.
  */
-extern const char* hl_get_device_error_string();
+extern const char *hl_get_device_error_string();
 
 /**
  * @brief     Returns the last error string from a cuda runtime call.
@@ -309,7 +306,7 @@ extern const char* hl_get_device_error_string();
  *
  * @see       hl_get_device_last_error()
  */
-extern const char* hl_get_device_error_string(size_t err);
+extern const char *hl_get_device_error_string(size_t err);
 
 /**
  * @brief   Returns the last error number.
diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/cuda/include/hl_cuda_cublas.h
index d757317eb4a97559feef22d4fd8edf7c10ca6745..db8c03c2c01c67788622d37b5330e22c31e03f34 100644
--- a/paddle/cuda/include/hl_cuda_cublas.h
+++ b/paddle/cuda/include/hl_cuda_cublas.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_CUDA_CUBLAS_H_
 #define HL_CUDA_CUBLAS_H_
 
@@ -29,12 +28,8 @@ limitations under the License. */
  * @param[in]   ldc     the first dimension of C_d.
  *
  */
-extern void hl_matrix_transpose(real *A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldc);
+extern void hl_matrix_transpose(
+    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc);
 
 /*
  * @brief Matrix transpose, while lda = dimN, ldc = dimM.
@@ -45,10 +40,7 @@ extern void hl_matrix_transpose(real *A_d,
  * @param[in]   dimN    matrix width.
  *
  */
-extern void hl_matrix_transpose(real *A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN);
+extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN);
 
 /*
  * @brief Matrix inverse
@@ -60,11 +52,7 @@ extern void hl_matrix_transpose(real *A_d,
  * @param[in]   ldc    the first dimension of C_d
  *
  */
-extern void hl_matrix_inverse(real *A_d,
-                              real *C_d,
-                              int dimN,
-                              int lda,
-                              int ldc);
+extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc);
 
 /**
  * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -84,12 +72,19 @@ extern void hl_matrix_inverse(real *A_d,
  * @param[in]   ldc     the first dimension of C_d.
  *
  */
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
-                          real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+                          hl_trans_op_t transa,
+                          real *B_d,
+                          hl_trans_op_t transb,
                           real *C_d,
-                          int dimM, int dimN, int dimK,
-                          real alpha, real beta,
-                          int lda, int ldb, int ldc);
+                          int dimM,
+                          int dimN,
+                          int dimK,
+                          real alpha,
+                          real beta,
+                          int lda,
+                          int ldb,
+                          int ldc);
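
The comment above gives the GEMM contract, C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d. A plain product of a dimM x dimK and a dimK x dimN matrix, using the overload without explicit leading dimensions, could look like the sketch below; HPPL_OP_N is assumed to be the no-transpose flag, by analogy with the HPPL_OP_T mentioned elsewhere in these headers.

```cpp
#include "hl_cuda_cublas.h"

// C = A * B with densely packed dimM x dimK and dimK x dimN operands.
void gemm_example(real* A_d, real* B_d, real* C_d,
                  int dimM, int dimN, int dimK) {
  hl_matrix_mul(A_d, HPPL_OP_N, B_d, HPPL_OP_N, C_d,
                dimM, dimN, dimK,
                /*alpha=*/1.0, /*beta=*/0.0);
}
```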
 
 /**
  * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -106,11 +101,16 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
  * @param[in]   beta    scalar used for multiplication.
  *
  */
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
-                          real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+                          hl_trans_op_t transa,
+                          real *B_d,
+                          hl_trans_op_t transb,
                           real *C_d,
-                          int dimM, int dimN, int dimK,
-                          real alpha, real beta);
+                          int dimM,
+                          int dimN,
+                          int dimK,
+                          real alpha,
+                          real beta);
 
 /**
  * @brief   This function performs the matrix-vector multiplication.
@@ -132,11 +132,17 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
  *
  */
 
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
-                                 real *B_d, real *C_d,
-                                 int dimM, int dimN,
-                                 real alpha, real beta,
-                                 int lda, int incb, int incc);
+extern void hl_matrix_mul_vector(real *A_d,
+                                 hl_trans_op_t trans,
+                                 real *B_d,
+                                 real *C_d,
+                                 int dimM,
+                                 int dimN,
+                                 real alpha,
+                                 real beta,
+                                 int lda,
+                                 int incb,
+                                 int incc);
 
 /**
  * @brief   This function performs the matrix-vector multiplication.
@@ -154,9 +160,13 @@ extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
  * @param[in]     beta   scalar used for multiplication.
  *
  */
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
-                                 real *B_d, real *C_d,
-                                 int dimM, int dimN,
-                                 real alpha, real beta);
+extern void hl_matrix_mul_vector(real *A_d,
+                                 hl_trans_op_t trans,
+                                 real *B_d,
+                                 real *C_d,
+                                 int dimM,
+                                 int dimN,
+                                 real alpha,
+                                 real beta);
 
 #endif /* HL_CUDA_CUBLAS_H_ */
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index f256cb54dfe69e8df7cc7fcc0ed0a58f3574acd3..3a2f916210277145efa8f6d7663a2698ea546b0b 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_CUDA_CUDNN_H_
 #define HL_CUDA_CUDNN_H_
 
@@ -22,7 +21,7 @@ limitations under the License. */
  *  hppl pooling mode
  */
 typedef enum {
-  HL_POOLING_MAX     = 0,
+  HL_POOLING_MAX = 0,
   // average includes padded values
   HL_POOLING_AVERAGE = 1,
   // average does not include padded values
@@ -324,17 +323,16 @@ extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
  * @param[in]   sizeInBytes         gpu workspace size (bytes).
  * @param[in]   convBwdFilterAlgo   backward filter algorithm.
  */
-extern void hl_convolution_backward_filter(
-        hl_tensor_descriptor input,
-        real* input_data,
-        hl_tensor_descriptor output,
-        real* output_grad_data,
-        hl_filter_descriptor filter,
-        real* filter_grad_data,
-        hl_convolution_descriptor conv,
-        void* gpuWorkSpace,
-        size_t sizeInBytes,
-        int  convBwdFilterAlgo);
+extern void hl_convolution_backward_filter(hl_tensor_descriptor input,
+                                           real* input_data,
+                                           hl_tensor_descriptor output,
+                                           real* output_grad_data,
+                                           hl_filter_descriptor filter,
+                                           real* filter_grad_data,
+                                           hl_convolution_descriptor conv,
+                                           void* gpuWorkSpace,
+                                           size_t sizeInBytes,
+                                           int convBwdFilterAlgo);
 
 /**
  * @brief   convolution backward data(calculate input image grad data).
@@ -350,17 +348,16 @@ extern void hl_convolution_backward_filter(
  * @param[in]   sizeInBytes         gpu workspace size (bytes).
  * @param[in]   convBwdDataAlgo     backward data algorithm.
  */
-extern void hl_convolution_backward_data(
-        hl_tensor_descriptor input,
-        real* input_data_grad,
-        hl_tensor_descriptor output,
-        real* output_grad_data,
-        hl_filter_descriptor filter,
-        real* filter_data,
-        hl_convolution_descriptor conv,
-        void* gpuWorkSpace,
-        size_t sizeInBytes,
-        int convBwdDataAlgo);
+extern void hl_convolution_backward_data(hl_tensor_descriptor input,
+                                         real* input_data_grad,
+                                         hl_tensor_descriptor output,
+                                         real* output_grad_data,
+                                         hl_filter_descriptor filter,
+                                         real* filter_data,
+                                         hl_convolution_descriptor conv,
+                                         void* gpuWorkSpace,
+                                         size_t sizeInBytes,
+                                         int convBwdDataAlgo);
 
 /**
  * @brief   convolution backward bias(calculate bias grad data).
@@ -383,8 +380,8 @@ extern void hl_convolution_backward_bias(hl_tensor_descriptor bias,
  * @param[in]   height              matrix height.
  * @param[in]   width               matrix width.
  */
-extern void hl_softmax_forward(real *input,
-                               real *output,
+extern void hl_softmax_forward(real* input,
+                               real* output,
                                int height,
                                int width);
 
@@ -396,8 +393,8 @@ extern void hl_softmax_forward(real *input,
  * @param[in]   height              matrix height.
  * @param[in]   width               matrix width.
  */
-extern void hl_softmax_backward(real *output_value,
-                                real *output_grad,
+extern void hl_softmax_backward(real* output_value,
+                                real* output_grad,
                                 int height,
                                 int width);
 
@@ -426,18 +423,18 @@ extern void hl_softmax_backward(real *output_value,
  *
  */
 extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                           real *input,
+                                           real* input,
                                            hl_tensor_descriptor outputDesc,
-                                           real *output,
+                                           real* output,
                                            hl_tensor_descriptor bnParamDesc,
-                                           real *scale,
-                                           real *bias,
+                                           real* scale,
+                                           real* bias,
                                            double factor,
-                                           real *runningMean,
-                                           real *runningInvVar,
+                                           real* runningMean,
+                                           real* runningInvVar,
                                            double epsilon,
-                                           real *savedMean,
-                                           real *savedVar);
+                                           real* savedMean,
+                                           real* savedVar);
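
If this wraps cudnnBatchNormalizationForwardTraining, which the parameter set suggests but the header does not state, then `factor` is the exponential-averaging weight applied per channel to the running statistics:

```cpp
typedef float real;  // HPPL picks float or double per build

// Assumed per-channel update, matching cuDNN's exponentialAverageFactor
// semantics (an assumption; verify against the implementation):
inline void update_running_stat(real* running, real batchStat, double factor) {
  // runningStat = (1 - factor) * runningStat + factor * batchStat
  *running = (real)((1.0 - factor) * (*running) + factor * batchStat);
}
```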
 
 /**
  * @brief   cudnn batch norm forward.
@@ -463,14 +460,14 @@ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
  *
  */
 extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                            real *input,
+                                            real* input,
                                             hl_tensor_descriptor outputDesc,
-                                            real *output,
+                                            real* output,
                                             hl_tensor_descriptor bnParamDesc,
-                                            real *scale,
-                                            real *bias,
-                                            real *estimatedMean,
-                                            real *estimatedVar,
+                                            real* scale,
+                                            real* bias,
+                                            real* estimatedMean,
+                                            real* estimatedVar,
                                             double epsilon);
 
 /**
@@ -483,7 +480,8 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
  * @param[in]   inGradDesc      input tensor descriptor desc.
  * @param[in]   inGrad          input data.
  * @param[in]   dBnParamDesc    tensor descriptor desc.
- *                              bnScale, bnBias, running mean/var, save_mean/var.
+ *                              bnScale, bnBias, running mean/var,
+ * save_mean/var.
  * @param[in]   scale           batch normalization scale parameter (in original
  *                              paper scale is referred to as gamma).
  * @param[in]   scaleGrad       batch normalization scale parameter (in original
@@ -497,17 +495,17 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
  *
  */
 extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                                   real *input,
+                                   real* input,
                                    hl_tensor_descriptor outGradDesc,
-                                   real *outGrad,
+                                   real* outGrad,
                                    hl_tensor_descriptor inGradDesc,
-                                   real *inGrad,
+                                   real* inGrad,
                                    hl_tensor_descriptor dBnParamDesc,
-                                   real *scale,
-                                   real *scaleGrad,
-                                   real *biasGrad,
+                                   real* scale,
+                                   real* scaleGrad,
+                                   real* biasGrad,
                                    double epsilon,
-                                   real *savedMean,
-                                   real *savedInvVar);
+                                   real* savedMean,
+                                   real* savedInvVar);
 
 #endif  // HL_CUDA_CUDNN_H_
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index f36c724e2da3dce11696fcda7daf98f5cda36dd6..1eb9f9ca888d3a93f04621e10346b5f9ff34cdca 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_DSO_LOADER_H_
 #define HL_DSO_LOADER_H_
 
diff --git a/paddle/cuda/include/hl_functions.h b/paddle/cuda/include/hl_functions.h
index 65f366461ced0f9ee31ff9075f6dfaeb6c9b72a2..91ce9a0678463597df88c548aeac322ee19d95de 100644
--- a/paddle/cuda/include/hl_functions.h
+++ b/paddle/cuda/include/hl_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_FUNCTIONS_H_
 #define HL_FUNCTIONS_H_
 
@@ -21,30 +20,30 @@ limitations under the License. */
 /**
  * sigmoid threshold maximum
  */
-#define     SIGMOID_THRESHOLD_MIN   -40.0
+#define SIGMOID_THRESHOLD_MIN -40.0
 
 /**
  * sigmoid threshold minimum
  */
-#define     SIGMOID_THRESHOLD_MAX   13.0
+#define SIGMOID_THRESHOLD_MAX 13.0
 
 #ifndef __NVCC__
 namespace hppl {
-  /*
-   * forward activation
-   */
-  real relu(const real a);
-  real sigmoid(const real a);
-  real tanh(const real a);
-  real linear(const real a);
-
-  /*
-   * backward activation
-   */
-  real relu(const real a, const real b);
-  real sigmoid(const real a, const real b);
-  real tanh(const real a, const real b);
-  real linear(const real a, const real b);
+/*
+ * forward activation
+ */
+real relu(const real a);
+real sigmoid(const real a);
+real tanh(const real a);
+real linear(const real a);
+
+/*
+ * backward activation
+ */
+real relu(const real a, const real b);
+real sigmoid(const real a, const real b);
+real tanh(const real a, const real b);
+real linear(const real a, const real b);
 }  // namespace hppl
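
Each activation here pairs a one-argument forward with a two-argument backward overload. A common convention, assumed below, is that the backward form takes the forward output and the incoming gradient and returns the gradient with respect to the pre-activation:

```cpp
#include "hl_functions.h"

void activation_example(real x, real dy) {
  real y = hppl::sigmoid(x);       // forward
  real dx = hppl::sigmoid(y, dy);  // backward; argument convention assumed
  (void)dx;
}
```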
 
 #ifdef __AVX__
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index 05039663b6e9f5e4a72f15ab822d723635f9b282..3be0df3b93b69811fb9c36dae223cbd927b02559 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_GPU_H_
 #define HL_GPU_H_
 
diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/cuda/include/hl_lstm.h
index 1f95e318a1fe06050bbd31c2e276974f4a8bdc1e..7e527a79025969320f1aca75d313fd9d0194efd1 100644
--- a/paddle/cuda/include/hl_lstm.h
+++ b/paddle/cuda/include/hl_lstm.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_LSTM_H_
 #define HL_LSTM_H_
 
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index 6195e30b9974d3ad092b4cf604e6b74fa481835c..96648661e345d8fa5d50cb2aae3a56ee53921f90 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_MATRIX_H_
 #define HL_MATRIX_H_
 
@@ -30,13 +29,8 @@ limitations under the License. */
  * @param[in]   beta    scalar used for addition.
  *
  */
-extern void hl_matrix_add(real* A_d,
-                          real* B_d,
-                          real* C_d,
-                          int dimM,
-                          int dimN,
-                          real alpha,
-                          real beta);
+extern void hl_matrix_add(
+    real* A_d, real* B_d, real* C_d, int dimM, int dimN, real alpha, real beta);
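
Reading the collapsed signature together with the alpha/beta parameter docs above, hl_matrix_add appears to compute the element-wise combination C_d = alpha * A_d + beta * B_d over a dimM x dimN matrix; the exact combination is inferred, not stated. For example:

```cpp
#include "hl_matrix.h"

// C = A + B over a dimM x dimN matrix; all pointers are device memory.
void matrix_add_example(real* A_d, real* B_d, real* C_d, int dimM, int dimN) {
  hl_matrix_add(A_d, B_d, C_d, dimM, dimN, /*alpha=*/1.0, /*beta=*/1.0);
}
```
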
 /**
  * @brief   Matrix Softmax.
  *
@@ -46,7 +40,7 @@ extern void hl_matrix_add(real* A_d,
  * @param[in]   dimN    matrix width.
  *
  */
-extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
+extern void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN);
 
 /**
  * @brief   Matrix softmax derivative.
@@ -58,11 +52,8 @@ extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN);
  * @param[in]   dimN         matrix width.
  *
  */
-extern void hl_matrix_softmax_derivative(real* grad_d,
-                                         real* output_d,
-                                         real* sftmaxSum_d,
-                                         int dimM,
-                                         int dimN);
+extern void hl_matrix_softmax_derivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN);
 
 /**
  * @brief   Sequence softmax.
@@ -73,8 +64,8 @@ extern void hl_matrix_softmax_derivative(real* grad_d,
  * @param[in]   numSequence sequence number.
  *
  */
-extern void hl_sequence_softmax_forward(real *A_d,
-                                        real *C_d,
+extern void hl_sequence_softmax_forward(real* A_d,
+                                        real* C_d,
                                         const int* index,
                                         int numSequence);
 
@@ -88,11 +79,8 @@ extern void hl_sequence_softmax_forward(real *A_d,
  * @param[in]   dimN    matrix width.
  *
  */
-extern void hl_matrix_classification_error(real* A_d,
-                                           int* B_d,
-                                           real* C_d,
-                                           int dimM,
-                                           int dimN);
+extern void hl_matrix_classification_error(
+    real* A_d, int* B_d, real* C_d, int dimM, int dimN);
 
 /**
  * @brief   Matrix cross entropy.
@@ -104,11 +92,8 @@ extern void hl_matrix_classification_error(real* A_d,
  * @param[in]   dimN    matrix width.
  *
  */
-extern void hl_matrix_cross_entropy(real* A_d,
-                                    real* C_d,
-                                    int* label_d,
-                                    int dimM,
-                                    int dimN);
+extern void hl_matrix_cross_entropy(
+    real* A_d, real* C_d, int* label_d, int dimM, int dimN);
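
With one integer class label per row, the cross entropy these kernels compute is presumably the usual negative log-likelihood; a CPU sketch of that reading (semantics assumed, not stated in the header):

```cpp
#include <cmath>

typedef float real;

// Assumed semantics: A is dimM x dimN of probabilities, label holds one
// class index per row, C receives one loss value per row.
void cross_entropy_cpu(const real* A, real* C, const int* label,
                       int dimM, int dimN) {
  for (int i = 0; i < dimM; ++i) {
    C[i] = -std::log(A[i * dimN + label[i]]);
  }
}
```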
 
 /**
  * @brief   Matrix cross entropy back propagation.
@@ -120,11 +105,8 @@ extern void hl_matrix_cross_entropy(real* A_d,
  * @param[in]   dimN        matrix width.
  *
  */
-extern void hl_matrix_cross_entropy_bp(real* grad_d,
-                                       real* output_d,
-                                       int* label_d,
-                                       int dimM,
-                                       int dimN);
+extern void hl_matrix_cross_entropy_bp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN);
 
 /**
  * @brief  Matrix multi-binary label cross entropy
@@ -135,11 +117,8 @@ extern void hl_matrix_cross_entropy_bp(real* grad_d,
  * @param[in]   dimM      matrix height.
  * @param[in]   dimN      matrix width.
  */
-extern void hl_matrix_multi_binary_cross_entropy(real* output,
-                                                 real* entropy,
-                                                 hl_sparse_matrix_s mat,
-                                                 int dimM,
-                                                 int dimN);
+extern void hl_matrix_multi_binary_cross_entropy(
+    real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN);
 
 /**
  * @brief  Matrix multi-binary label cross entropy backprop
@@ -150,11 +129,8 @@ extern void hl_matrix_multi_binary_cross_entropy(real* output,
  * @param[in]   dimM      matrix height.
  * @param[in]   dimN      matrix width.
  */
-extern void hl_matrix_multi_binary_cross_entropy_bp(real* output,
-                                                    real* grad,
-                                                    hl_sparse_matrix_s mat,
-                                                    int dimM,
-                                                    int dimN);
+extern void hl_matrix_multi_binary_cross_entropy_bp(
+    real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN);
 
 /**
  * @brief  Matrix zero memory.
@@ -176,12 +152,8 @@ extern void hl_matrix_zero_mem(real* data, int num);
  * @param[in]  partial_sum
  */
 
-extern void hl_param_relu_forward(real* output,
-                                  real* input,
-                                  real* w,
-                                  int width,
-                                  int height,
-                                  int partial_sum);
+extern void hl_param_relu_forward(
+    real* output, real* input, real* w, int width, int height, int partial_sum);
 /**
  * @brief parameter relu backward w
  *
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 46d86b2982f065802eec83ca7554f787d1d02f3a..bb5124df44b492bd8fdeb2a0c75ebcf74d2c8157 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_SEQUENCE_H_
 #define HL_SEQUENCE_H_
 
@@ -32,7 +31,7 @@ limitations under the License. */
 extern void hl_max_sequence_forward(real* input,
                                     const int* sequence,
                                     real* output,
-                                    int *index,
+                                    int* index,
                                     int numSequences,
                                     int dim);
 
@@ -46,11 +45,8 @@ extern void hl_max_sequence_forward(real* input,
  * @param[in]   dim             input dimension.
  *
  */
-extern void hl_max_sequence_backward(real* outputGrad,
-                                     int *index,
-                                     real* inputGrad,
-                                     int numSequences,
-                                     int dim);
+extern void hl_max_sequence_backward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
 
 /**
  * @brief   Context projection forward.
@@ -63,7 +59,8 @@ extern void hl_max_sequence_backward(real* outputGrad,
  * @param[in]   inputDim        input sequence dimension.
  * @param[in]   contextLength   context length.
  * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the beginning.
+ * @param[in]   beginPad        number of extra timesteps added at the
+ * beginning.
  * @param[in]   isPadding       trainable padding.
  *
  */
@@ -109,7 +106,8 @@ extern void hl_context_projection_backward_data(real* outputGrad,
  * @param[in]   totalPad        number of extra timesteps.
  * @param[in]   contextLength   context length.
  * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the beginning.
+ * @param[in]   beginPad        number of extra timesteps added at the
+ * beginning.
  *
  */
 extern void hl_context_projection_backward_weight(real* outputGrad,
@@ -141,9 +139,9 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
  * @param[in]       seq2batch   copy direction.
  *
  */
-extern void hl_sequence2batch_copy(real *batch,
-                                   real *sequence,
-                                   const int *batchIndex,
+extern void hl_sequence2batch_copy(real* batch,
+                                   real* sequence,
+                                   const int* batchIndex,
                                    int seqWidth,
                                    int batchCount,
                                    bool seq2batch);
@@ -167,9 +165,9 @@ extern void hl_sequence2batch_copy(real *batch,
  * @param[in]       seq2batch   copy direction.
  *
  */
-extern void hl_sequence2batch_add(real *batch,
-                                  real *sequence,
-                                  int *batchIndex,
+extern void hl_sequence2batch_add(real* batch,
+                                  real* sequence,
+                                  int* batchIndex,
                                   int seqWidth,
                                   int batchCount,
                                   bool seq2batch);
diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/cuda/include/hl_sparse.h
index 9acdebdebf37761e1485e3441963586ead9f3c85..c4e0be23e2031cbcb124b532216a23d8a344668d 100644
--- a/paddle/cuda/include/hl_sparse.h
+++ b/paddle/cuda/include/hl_sparse.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_SPARSE_H_
 #define HL_SPARSE_H_
 
@@ -31,7 +30,7 @@ limitations under the License. */
  */
 extern void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
                                     hl_matrix_format_t format,
-                                    hl_matrix_value_t  value_type,
+                                    hl_matrix_value_t value_type,
                                     int dimM,
                                     int dimN,
                                     int nnz);
@@ -60,10 +59,10 @@ extern void hl_free_sparse_matrix(hl_sparse_matrix_s A_d);
  *
  */
 extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       void * dest_d,
+                                       void *dest_d,
                                        size_t size,
                                        hl_matrix_format_t format,
-                                       hl_matrix_value_t  value_type,
+                                       hl_matrix_value_t value_type,
                                        int dimM,
                                        int dimN,
                                        int nnz);
@@ -94,11 +93,11 @@ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
  *
  */
 extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       real* value_d,
-                                       int* rows_d,
-                                       int* cols_d,
+                                       real *value_d,
+                                       int *rows_d,
+                                       int *cols_d,
                                        hl_matrix_format_t format,
-                                       hl_matrix_value_t  value_type,
+                                       hl_matrix_value_t value_type,
                                        int dimM,
                                        int dimN,
                                        int nnz);
@@ -259,10 +258,14 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
  */
 extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
                                     hl_trans_op_t transa,
-                                    real *B_d, hl_trans_op_t transb,
+                                    real *B_d,
+                                    hl_trans_op_t transb,
                                     real *C_d,
-                                    int dimM, int dimN, int dimK,
-                                    real alpha, real beta);
+                                    int dimM,
+                                    int dimN,
+                                    int dimK,
+                                    real alpha,
+                                    real beta);
 
 /**
  * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
@@ -311,11 +314,16 @@ extern void hl_matrix_dense_mul_csc(real *A_d,
 * @note    transb does not support HPPL_OP_T.
  *
  */
-extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
-                                 real *B_d, hl_trans_op_t transb,
+extern void hl_sparse_matrix_mul(real *A_d,
+                                 hl_trans_op_t transa,
+                                 real *B_d,
+                                 hl_trans_op_t transb,
                                  hl_sparse_matrix_s C_d,
-                                 int dimM, int dimN, int dimK,
-                                 real alpha, real beta);
+                                 int dimM,
+                                 int dimN,
+                                 int dimK,
+                                 real alpha,
+                                 real beta);
 
 /**
  * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -336,12 +344,16 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
 * @note    transa does not support HPPL_OP_T.
  *
  */
-extern void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+extern void hl_matrix_dense_mul_csr(real *A_d,
+                                    hl_trans_op_t transa,
                                     hl_sparse_matrix_s B_d,
                                     hl_trans_op_t transb,
                                     real *C_d,
-                                    int dimM, int dimN, int dimK,
-                                    real alpha, real beta);
+                                    int dimM,
+                                    int dimN,
+                                    int dimK,
+                                    real alpha,
+                                    real beta);
 
 /**
  * @brief   Memcpy csc_matrix to host.
@@ -412,7 +424,6 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
                                       hl_sparse_matrix_s csr_matrix,
                                       hl_stream_t stream);
 
-
 /**
  * @brief   A_d[j] += B_d[i,j] for i in range(height)
  *
@@ -423,19 +434,13 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val,
  * @param[in]       scale  scale of B_d
  *
  */
-extern void hl_sparse_matrix_column_sum(real* A_d,
-                                        hl_sparse_matrix_s B_d,
-                                        int dimM,
-                                        int dimN,
-                                        real scale);
+extern void hl_sparse_matrix_column_sum(
+    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
 /**
 * @brief implementation of csr sparse matrix in hl_sparse_matrix_column_sum
  */
-extern void hl_matrix_csr_column_sum(real* A_d,
-                                     hl_sparse_matrix_s B_d,
-                                     int dimM,
-                                     int dimN,
-                                     real scale);
+extern void hl_matrix_csr_column_sum(
+    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
 
 /**
  * @brief   A_d[i,j] += B_d[j]
@@ -446,13 +451,13 @@ extern void hl_matrix_csr_column_sum(real* A_d,
  *
  */
 extern void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
-                                      real* B_d,
+                                      real *B_d,
                                       real scale);
 /**
  * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_bias
  */
 extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
-                                   real* B_d,
+                                   real *B_d,
                                    real scale);
 
 /**
@@ -470,7 +475,7 @@ extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
  *
  */
 extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
-                                       real* B_d,
+                                       real *B_d,
                                        int dimM,
                                        int dimN,
                                        real alpha,
@@ -479,7 +484,7 @@ extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
  * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_dense
  */
 extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
-                                    real* B_d,
+                                    real *B_d,
                                     int dimM,
                                     int dimN,
                                     real alpha,
@@ -493,7 +498,7 @@ extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
  * @return   return rows pointer, which is gpu address
  *
  */
-extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
 
 /**
 * @brief get cols pointer of GpuSparseMatrix
@@ -503,7 +508,7 @@ extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
  * @return   return cols pointer, which is gpu address
  *
  */
-extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
+extern int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
 
 /**
 * @brief get value pointer of GpuSparseMatrix
@@ -513,7 +518,6 @@ extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
  * @return   return value pointer, which is gpu address
  *
  */
-extern real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
-
+extern real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
 
 #endif /* HL_SPARSE_H_ */
diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/cuda/include/hl_table_apply.h
index 3c9428e9253d5ed563e4e9f62d8842667496b83c..b4ac83a66af13c2a843872faba2ebd972008a738 100644
--- a/paddle/cuda/include/hl_table_apply.h
+++ b/paddle/cuda/include/hl_table_apply.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_TABLE_APPLY_H_
 #define HL_TABLE_APPLY_H_
 
@@ -31,8 +30,10 @@ limitations under the License. */
  * @param[in]   dim             width of table.
  *
  */
-extern void hl_matrix_select_rows(real* output, int ldo,
-                                  real* table, int ldt,
+extern void hl_matrix_select_rows(real* output,
+                                  int ldo,
+                                  real* table,
+                                  int ldt,
                                   int* ids,
                                   int numSamples,
                                   int tableSize,
@@ -53,8 +54,10 @@ extern void hl_matrix_select_rows(real* output, int ldo,
  * @param[in]   dim             width of table.
  *
  */
-extern void hl_matrix_add_to_rows(real* table, int ldt,
-                                  real* input, int ldi,
+extern void hl_matrix_add_to_rows(real* table,
+                                  int ldt,
+                                  real* input,
+                                  int ldi,
                                   int* ids,
                                   int numSamples,
                                   int tableSize,
@@ -72,8 +75,7 @@ extern void hl_matrix_add_to_rows(real* table, int ldt,
  *
  */
 template <class T>
-extern void hl_vector_select_from(T* dst, int sized,
-                                  const T* src, int sizes,
-                                  const int* ids, int sizei);
+extern void hl_vector_select_from(
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei);
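
hl_vector_select_from is a gather: dst[i] = src[ids[i]]. A CPU equivalent, with the sized/sizes/sizei parameters read as buffer lengths (an assumption consistent with their names):

```cpp
// CPU equivalent of the gather performed by hl_vector_select_from.
template <class T>
void vector_select_from_cpu(T* dst, int sized, const T* src, int sizes,
                            const int* ids, int sizei) {
  for (int i = 0; i < sizei && i < sized; ++i) {
    if (ids[i] >= 0 && ids[i] < sizes) dst[i] = src[ids[i]];
  }
}
```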
 
-#endif  /* HL_TABLE_APPLY_H_ */
+#endif /* HL_TABLE_APPLY_H_ */
diff --git a/paddle/cuda/include/hl_time.h b/paddle/cuda/include/hl_time.h
index 4414b0b2d2ed4ab6a48294ffaed3a43a639e5950..b0a88c66a12fcfec6ea96b877423f907dac8dfa1 100644
--- a/paddle/cuda/include/hl_time.h
+++ b/paddle/cuda/include/hl_time.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_TIME_H_
 #define HL_TIME_H_
 
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h
index a38d4cf862278a060f72b970d723895dc3735d0a..e8cfebbf6a3bd27c10a71d7817238bc304681fa4 100644
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_TOP_K_H_
 #define HL_TOP_K_H_
 
@@ -31,9 +30,11 @@ limitations under the License. */
  * @param[in]   numSamples     height of input value.
  *
  */
-extern void hl_matrix_top_k(real* topVal, int ldv,
-                            int * topIds,
-                            real* src, int lds,
+extern void hl_matrix_top_k(real* topVal,
+                            int ldv,
+                            int* topIds,
+                            real* src,
+                            int lds,
                             int dim,
                             int beamSize,
                             int numSamples);
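
A call shape for the dense top-k: each of numSamples rows of src (length dim, leading dimension lds) yields its beamSize largest values and their column indices; densely packed outputs use ldv = beamSize. The buffer names below are hypothetical device pointers.

```cpp
#include "hl_top_k.h"

// topVal_d, topIds_d: numSamples x beamSize; src_d: numSamples x dim.
void top_k_example(real* topVal_d, int* topIds_d, real* src_d,
                   int dim, int beamSize, int numSamples) {
  hl_matrix_top_k(topVal_d, /*ldv=*/beamSize, topIds_d,
                  src_d, /*lds=*/dim, dim, beamSize, numSamples);
}
```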
@@ -50,8 +51,9 @@ extern void hl_matrix_top_k(real* topVal, int ldv,
  *
 * @note    Only supports the HL_SPARSE_CSR format.
  */
-extern void hl_sparse_matrix_top_k(real* topVal, int ldv,
-                                   int * topIds,
+extern void hl_sparse_matrix_top_k(real* topVal,
+                                   int ldv,
+                                   int* topIds,
                                    hl_sparse_matrix_s src,
                                    int beamSize,
                                    int numSamples);
diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/cuda/include/stub/hl_aggregate_stub.h
index 4c0c68f3c98fe95f01060b82c3a1b9822d2a3715..bb53fc581e09905aa7a9b2d8dfe44b04c677c40a 100644
--- a/paddle/cuda/include/stub/hl_aggregate_stub.h
+++ b/paddle/cuda/include/stub/hl_aggregate_stub.h
@@ -12,29 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_AGGREGATE_STUB_H_
 #define HL_AGGREGATE_STUB_H_
 
 #include "hl_aggregate.h"
 
-inline void hl_matrix_row_sum(real *A_d, real *C_d,
-                              int dimM, int dimN) {}
+inline void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {}
 
-inline void hl_matrix_row_max(real *A_d, real *C_d,
-                              int dimM, int dimN) {}
+inline void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {}
 
-inline void hl_matrix_row_min(real *A_d, real *C_d,
-                              int dimM, int dimN) {}
+inline void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {}
 
-inline void hl_matrix_column_sum(real *A_d, real *C_d,
-                                 int dimM, int dimN) {}
+inline void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {}
 
-inline void hl_matrix_column_max(real *A_d, real *C_d,
-                                 int dimM, int dimN) {}
+inline void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {}
 
-inline void hl_matrix_column_min(real *A_d, real *C_d,
-                                 int dimM, int dimN) {}
+inline void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {}
 
 inline void hl_vector_sum(real *A_d, real *C_h, int dimM) {}
 
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index c6f32ad337705ff938b7b370a4785dc7f4393041..2f73b9671edd3609996aebff2913f5262805f869 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -12,84 +12,134 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_CNN_STUB_H_
 #define HL_CNN_STUB_H_
 
 #include "hl_cnn.h"
 
-inline void hl_shrink_col2feature(
-    const real * dataCol, size_t channels,
-    size_t height, size_t width,
-    size_t blockH, size_t blockW,
-    size_t strideH, size_t strideW,
-    size_t paddingH, size_t paddingW,
-    size_t outputH, size_t outputW,
-    real* dataIm,
-    real alpha, real beta) {}
-
-inline void hl_expand_feature2col(
-    const real* dataIm, size_t channels,
-    size_t height, size_t width,
-    size_t blockH, size_t blockW,
-    size_t strideH, size_t strideW,
-    size_t paddingH, size_t paddingW,
-    size_t outputH, size_t outputW,
-    real* dataCol) {}
-
-inline void hl_maxpool_forward(
-    const int frameCnt, const real* inputData,
-    const int channels,
-    const int height, const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    const int paddingH, const int paddingW,
-    real* tgtData, const int tgtStride) {}
-
-inline void hl_maxpool_backward(
-    const int frameCnt, const real* inputData,
-    const real* outData, const real* outGrad,
-    const int channels, const int height,
-    const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    const int paddingH, const int paddingW,
-    real scaleA, real scaleB,
-    real* targetGrad, const int outStride) {}
-
-inline void hl_avgpool_forward(
-    const int frameCnt, const real* inputData,
-    const int channels,
-    const int height, const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    const int paddingH, const int paddingW,
-    real* tgtData, const int tgtStride) {}
-
-inline void hl_avgpool_backward(
-    const int frameCnt, const real* outGrad,
-    const int channels, const int height,
-    const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    int paddingH, int paddingW,
-    real scaleA, real scaleB,
-    real* backGrad, const int outStride) {}
-
-inline void hl_CMRNorm_forward(
-    size_t frameCnt, const real* in, real* scale, real* out,
-    size_t channels, size_t height, size_t width, size_t sizeX,
-    real alpha, real beta) {}
-
-inline void hl_CMRNorm_backward(
-    size_t frameCnt, const real* inV, const real* scale,
-    const real* outV, const real* outDiff, real *inDiff,
-    size_t channels, size_t height, size_t width, size_t sizeX,
-    real alpha, real beta) {}
+inline void hl_shrink_col2feature(const real* dataCol,
+                                  size_t channels,
+                                  size_t height,
+                                  size_t width,
+                                  size_t blockH,
+                                  size_t blockW,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real* dataIm,
+                                  real alpha,
+                                  real beta) {}
+
+inline void hl_expand_feature2col(const real* dataIm,
+                                  size_t channels,
+                                  size_t height,
+                                  size_t width,
+                                  size_t blockH,
+                                  size_t blockW,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real* dataCol) {}
+
+inline void hl_maxpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride) {}
+
+inline void hl_maxpool_backward(const int frameCnt,
+                                const real* inputData,
+                                const real* outData,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                const int paddingH,
+                                const int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* targetGrad,
+                                const int outStride) {}
+
+inline void hl_avgpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride) {}
+
+inline void hl_avgpool_backward(const int frameCnt,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                int paddingH,
+                                int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* backGrad,
+                                const int outStride) {}
+
+inline void hl_CMRNorm_forward(size_t frameCnt,
+                               const real* in,
+                               real* scale,
+                               real* out,
+                               size_t channels,
+                               size_t height,
+                               size_t width,
+                               size_t sizeX,
+                               real alpha,
+                               real beta) {}
+
+inline void hl_CMRNorm_backward(size_t frameCnt,
+                                const real* inV,
+                                const real* scale,
+                                const real* outV,
+                                const real* outDiff,
+                                real* inDiff,
+                                size_t channels,
+                                size_t height,
+                                size_t width,
+                                size_t sizeX,
+                                real alpha,
+                                real beta) {}
 
 inline void hl_bilinear_forward(const real* inData,
                                 const size_t inImgH,
@@ -106,25 +156,33 @@ inline void hl_bilinear_forward(const real* inData,
                                 const real ratioW) {}
 
 inline void hl_bilinear_backward(real* inGrad,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t inputH,
-                                const size_t inputW,
-                                const real* outGrad,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t outputH,
-                                const size_t outputW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {}
-
-inline void hl_maxout_forward(
-    const real* inData, real* outData, int* idData,
-    size_t batchSize, size_t size, size_t featLen, size_t group) {}
-
-inline void hl_maxout_backward(
-    real* inGrad, const real* outGrad, const int* idData,
-    size_t batchSize, size_t size, size_t featLen, size_t group) {}
+                                 const size_t inImgH,
+                                 const size_t inImgW,
+                                 const size_t inputH,
+                                 const size_t inputW,
+                                 const real* outGrad,
+                                 const size_t outImgH,
+                                 const size_t outImgW,
+                                 const size_t outputH,
+                                 const size_t outputW,
+                                 const size_t numChannels,
+                                 const real ratioH,
+                                 const real ratioW) {}
+
+inline void hl_maxout_forward(const real* inData,
+                              real* outData,
+                              int* idData,
+                              size_t batchSize,
+                              size_t size,
+                              size_t featLen,
+                              size_t group) {}
+
+inline void hl_maxout_backward(real* inGrad,
+                               const real* outGrad,
+                               const int* idData,
+                               size_t batchSize,
+                               size_t size,
+                               size_t featLen,
+                               size_t group) {}
 
 #endif  // HL_CNN_STUB_H_
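
Every function in these stub headers gets an empty inline body, so a CPU-only build compiles and links against the same `hl_*` API without pulling in CUDA. A minimal sketch of the pattern, with a hypothetical name and a hypothetical build flag (neither is from the source):

```cpp
// scale_ops.h -- illustrative only: one API, two definitions selected at
// configure time, mirroring how the hl_*_stub.h headers pair with the
// real CUDA headers.
#pragma once

#ifdef WITH_GPU
// Real implementation lives in a .cu translation unit.
void scale_buffer(float* data, int n, float alpha);
#else
// CPU-only build: an empty inline stub keeps callers compiling and linking.
inline void scale_buffer(float* data, int n, float alpha) {}
#endif
```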
diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
index 903dcbe8355d6f593d96bc1f9e686d54035a9366..85f7c390c47397127487b16fdc933f0afe2fb880 100644
--- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h
@@ -12,41 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_CUDA_CUBLAS_STUB_H_
 #define HL_CUDA_CUBLAS_STUB_H_
 
 #include "hl_cuda_cublas.h"
 
-inline void hl_matrix_transpose(real *A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldc) {}
-
-inline void hl_matrix_transpose(real *A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN) {}
-
-inline void hl_matrix_inverse(real *A_d,
-                              real *C_d,
-                              int dimN,
-                              int lda,
-                              int ldc) {}
-
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
-                          real *B_d, hl_trans_op_t transb,
-                          real *C_d,
-                          int dimM, int dimN, int dimK,
-                          real alpha, real beta,
-                          int lda, int ldb, int ldc) {}
+inline void hl_matrix_transpose(
+    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {}
 
-inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
-                          real *B_d, hl_trans_op_t transb,
+inline void hl_matrix_inverse(
+    real *A_d, real *C_d, int dimN, int lda, int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+                          hl_trans_op_t transa,
+                          real *B_d,
+                          hl_trans_op_t transb,
+                          real *C_d,
+                          int dimM,
+                          int dimN,
+                          int dimK,
+                          real alpha,
+                          real beta,
+                          int lda,
+                          int ldb,
+                          int ldc) {}
+
+inline void hl_matrix_mul(real *A_d,
+                          hl_trans_op_t transa,
+                          real *B_d,
+                          hl_trans_op_t transb,
                           real *C_d,
-                          int dimM, int dimN, int dimK,
-                          real alpha, real beta) {}
+                          int dimM,
+                          int dimN,
+                          int dimK,
+                          real alpha,
+                          real beta) {}
 
 #endif  // HL_CUDA_CUBLAS_STUB_H_
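
The `hl_matrix_mul` overloads above follow the usual GEMM contract, C = alpha * op(A) * op(B) + beta * C, with a dimM x dimK and a dimK x dimN operand and lda/ldb/ldc as leading dimensions. A naive reference for the non-transposed case, assuming row-major storage for brevity (the real call dispatches to cuBLAS, which is column-major):

```cpp
// Reference GEMM: C = alpha * A * B + beta * C.
// A is dimM x dimK, B is dimK x dimN, C is dimM x dimN; lda/ldb/ldc are
// treated as row strides here. Illustrative sketch, not the library code.
void gemm_ref(const float* A, const float* B, float* C,
              int dimM, int dimN, int dimK,
              float alpha, float beta, int lda, int ldb, int ldc) {
  for (int i = 0; i < dimM; ++i) {
    for (int j = 0; j < dimN; ++j) {
      float acc = 0.0f;
      for (int k = 0; k < dimK; ++k) {
        acc += A[i * lda + k] * B[k * ldb + j];
      }
      C[i * ldc + j] = alpha * acc + beta * C[i * ldc + j];
    }
  }
}
```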
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
index b96804afd86ba5e8c7b7eed7eb768295b4e23096..3beb0e5b5170261a6c453936b8b0347f3e97dbff 100644
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_CUDA_CUDNN_STUB_H_
 #define HL_CUDA_CUDNN_STUB_H_
 
 #include "hl_cuda_cudnn.h"
 
-inline int hl_get_cudnn_lib_version() {
-  return 0;
-}
+inline int hl_get_cudnn_lib_version() { return 0; }
 
 inline void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {}
 
@@ -68,41 +65,41 @@ inline void hl_pooling_backward(hl_tensor_descriptor input,
                                 hl_pooling_descriptor pooling) {}
 
 inline void hl_create_filter_descriptor(hl_filter_descriptor* filter,
-                                       int input_feature_maps,
-                                       int output_feature_maps,
-                                       int height,
-                                       int width) {}
+                                        int input_feature_maps,
+                                        int output_feature_maps,
+                                        int height,
+                                        int width) {}
 
 inline void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {}
 
 inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
-        hl_tensor_descriptor image,
-        hl_filter_descriptor filter,
-        int padding_height,
-        int padding_width,
-        int stride_height,
-        int stride_width) {}
+                                             hl_tensor_descriptor image,
+                                             hl_filter_descriptor filter,
+                                             int padding_height,
+                                             int padding_width,
+                                             int stride_height,
+                                             int stride_width) {}
 
 inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
-        hl_tensor_descriptor image,
-        hl_filter_descriptor filter,
-        int padding_height,
-        int padding_width,
-        int stride_height,
-        int stride_width) {}
+                                            hl_tensor_descriptor image,
+                                            hl_filter_descriptor filter,
+                                            int padding_height,
+                                            int padding_width,
+                                            int stride_height,
+                                            int stride_width) {}
 
 inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
 
 inline void hl_conv_workspace(hl_tensor_descriptor input,
-                       hl_tensor_descriptor output,
-                       hl_filter_descriptor filter,
-                       hl_convolution_descriptor conv,
-                       int* convFwdAlgo,
-                       size_t* fwdLimitBytes,
-                       int* convBwdDataAlgo,
-                       size_t* bwdDataLimitBytes,
-                       int* convBwdFilterAlgo,
-                       size_t* bwdFilterLimitBytes) {}
+                              hl_tensor_descriptor output,
+                              hl_filter_descriptor filter,
+                              hl_convolution_descriptor conv,
+                              int* convFwdAlgo,
+                              size_t* fwdLimitBytes,
+                              int* convBwdDataAlgo,
+                              size_t* bwdDataLimitBytes,
+                              int* convBwdFilterAlgo,
+                              size_t* bwdFilterLimitBytes) {}
 
 inline void hl_convolution_forward(hl_tensor_descriptor input,
                                    real* input_data,
@@ -116,86 +113,84 @@ inline void hl_convolution_forward(hl_tensor_descriptor input,
                                    int convFwdAlgo) {}
 
 inline void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
-        real* bias_data,
-        hl_tensor_descriptor output,
-        real* output_data) {}
-
-inline void hl_convolution_backward_filter(
-        hl_tensor_descriptor input,
-        real* input_data,
-        hl_tensor_descriptor output,
-        real* output_grad_data,
-        hl_filter_descriptor filter,
-        real* filter_grad_data,
-        hl_convolution_descriptor conv,
-        void* gpuWorkSpace,
-        size_t sizeInBytes,
-        int convBwdFilterAlgo) {}
-
-inline void hl_convolution_backward_data(
-        hl_tensor_descriptor input,
-        real* input_data_grad,
-        hl_tensor_descriptor output,
-        real* output_grad_data,
-        hl_filter_descriptor filter,
-        real* filter_data,
-        hl_convolution_descriptor conv,
-        void* gpuWorkSpace,
-        size_t sizeInBytes,
-        int convBwdDataAlgo) {}
+                                            real* bias_data,
+                                            hl_tensor_descriptor output,
+                                            real* output_data) {}
+
+inline void hl_convolution_backward_filter(hl_tensor_descriptor input,
+                                           real* input_data,
+                                           hl_tensor_descriptor output,
+                                           real* output_grad_data,
+                                           hl_filter_descriptor filter,
+                                           real* filter_grad_data,
+                                           hl_convolution_descriptor conv,
+                                           void* gpuWorkSpace,
+                                           size_t sizeInBytes,
+                                           int convBwdFilterAlgo) {}
+
+inline void hl_convolution_backward_data(hl_tensor_descriptor input,
+                                         real* input_data_grad,
+                                         hl_tensor_descriptor output,
+                                         real* output_grad_data,
+                                         hl_filter_descriptor filter,
+                                         real* filter_data,
+                                         hl_convolution_descriptor conv,
+                                         void* gpuWorkSpace,
+                                         size_t sizeInBytes,
+                                         int convBwdDataAlgo) {}
 
 inline void hl_convolution_backward_bias(hl_tensor_descriptor bias,
-                                        real* bias_grad_data,
-                                        hl_tensor_descriptor output,
-                                        real* output_grad_data) {}
+                                         real* bias_grad_data,
+                                         hl_tensor_descriptor output,
+                                         real* output_grad_data) {}
 
-inline void hl_softmax_forward(real *input,
-                              real *output,
-                              int height,
-                              int width) {}
-
-inline void hl_softmax_backward(real *output_value,
-                               real *output_grad,
+inline void hl_softmax_forward(real* input,
+                               real* output,
                                int height,
                                int width) {}
 
+inline void hl_softmax_backward(real* output_value,
+                                real* output_grad,
+                                int height,
+                                int width) {}
+
 inline void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                           real *input,
+                                           real* input,
                                            hl_tensor_descriptor outputDesc,
-                                           real *output,
+                                           real* output,
                                            hl_tensor_descriptor bnParamDesc,
-                                           real *scale,
-                                           real *bias,
+                                           real* scale,
+                                           real* bias,
                                            double factor,
-                                           real *runningMean,
-                                           real *runningInvVar,
+                                           real* runningMean,
+                                           real* runningInvVar,
                                            double epsilon,
-                                           real *savedMean,
-                                           real *savedVar) {}
+                                           real* savedMean,
+                                           real* savedVar) {}
 
 inline void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                            real *input,
+                                            real* input,
                                             hl_tensor_descriptor outputDesc,
-                                            real *output,
+                                            real* output,
                                             hl_tensor_descriptor bnParamDesc,
-                                            real *scale,
-                                            real *bias,
-                                            real *estimatedMean,
-                                            real *estimatedVar,
+                                            real* scale,
+                                            real* bias,
+                                            real* estimatedMean,
+                                            real* estimatedVar,
                                             double epsilon) {}
 
 inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                                   real *input,
+                                   real* input,
                                    hl_tensor_descriptor outGradDesc,
-                                   real *outGrad,
+                                   real* outGrad,
                                    hl_tensor_descriptor inGradDesc,
-                                   real *inGrad,
+                                   real* inGrad,
                                    hl_tensor_descriptor dBnParamDesc,
-                                   real *scale,
-                                   real *scaleGrad,
-                                   real *biasGrad,
+                                   real* scale,
+                                   real* scaleGrad,
+                                   real* biasGrad,
                                    double epsilon,
-                                   real *savedMean,
-                                   real *savedInvVar) {}
+                                   real* savedMean,
+                                   real* savedInvVar) {}
 
 #endif  // HL_CUDA_CUDNN_STUB_H_
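
`hl_batch_norm_forward_inference` wraps the cuDNN inference transform, y = scale * (x - mean) / sqrt(var + eps) + bias, applied per channel. A scalar sketch of that arithmetic, assuming NCHW layout (the helper name and layout are assumptions, not the stubbed API):

```cpp
#include <cmath>

// Per-channel batch-norm inference over an n x c x (h*w) tensor.
void batch_norm_infer(const float* x, float* y,
                      const float* scale, const float* bias,
                      const float* mean, const float* var,
                      int n, int c, int hw, double eps) {
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < c; ++j) {
      const float inv_std =
          1.0f / std::sqrt(var[j] + static_cast<float>(eps));
      const float* xi = x + (i * c + j) * hw;
      float* yi = y + (i * c + j) * hw;
      for (int k = 0; k < hw; ++k) {
        yi[k] = scale[j] * (xi[k] - mean[j]) * inv_std + bias[j];
      }
    }
  }
}
```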
diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h
index 675ac03b0e188e9b26038dd4e40264099618e17a..1f91068cdf8b3d472c4b403d1ec7d5293c28c07e 100644
--- a/paddle/cuda/include/stub/hl_cuda_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_CUDA_STUB_H_
 #define HL_CUDA_STUB_H_
 
@@ -24,29 +23,25 @@ inline void hl_specify_devices_start(int *device, int number) {}
 
 inline void hl_init(int device) {}
 
-inline int hl_get_cuda_lib_version(int device) {
-  return 0;
-}
+inline int hl_get_cuda_lib_version(int device) { return 0; }
 
 inline void hl_fini() {}
 
 inline void hl_set_sync_flag(bool flag) {}
 
-inline bool hl_get_sync_flag() {
-  return false;
-}
+inline bool hl_get_sync_flag() { return false; }
 
-inline int hl_get_device_count() { return 0;  }
+inline int hl_get_device_count() { return 0; }
 
 inline void hl_set_device(int device) {}
 
-inline int hl_get_device() { return 0;  }
+inline int hl_get_device() { return 0; }
 
-inline void* hl_malloc_device(size_t size) { return NULL; }
+inline void *hl_malloc_device(size_t size) { return NULL; }
 
 inline void hl_free_mem_device(void *dest_d) {}
 
-inline void* hl_malloc_host(size_t size) { return NULL;  }
+inline void *hl_malloc_host(size_t size) { return NULL; }
 
 inline void hl_free_mem_host(void *dest_h) {}
 
@@ -64,7 +59,9 @@ inline void hl_rand(real *dest_d, size_t num) {}
 
 inline void hl_srand(unsigned int seed) {}
 
-inline void hl_memcpy_async(void *dst, void *src, size_t size,
+inline void hl_memcpy_async(void *dst,
+                            void *src,
+                            size_t size,
                             hl_stream_t stream) {}
 
 inline void hl_stream_synchronize(hl_stream_t stream) {}
@@ -83,11 +80,11 @@ inline void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {}
 
 inline void hl_event_synchronize(hl_event_t event) {}
 
-inline int hl_get_device_last_error() { return 0;  }
+inline int hl_get_device_last_error() { return 0; }
 
-inline const char* hl_get_device_error_string() { return NULL; }
+inline const char *hl_get_device_error_string() { return NULL; }
 
-inline const char* hl_get_device_error_string(size_t err) { return NULL; }
+inline const char *hl_get_device_error_string(size_t err) { return NULL; }
 
 inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
 
diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/cuda/include/stub/hl_lstm_stub.h
index 2700bef02a5e1e40ee7603ccab7fec754196f8cd..7ccda032d26f2fbbe99136e8481416daea557a78 100644
--- a/paddle/cuda/include/stub/hl_lstm_stub.h
+++ b/paddle/cuda/include/stub/hl_lstm_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_LSTM_STUB_H_
 #define HL_LSTM_STUB_H_
 
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 76cac2e57769301fee2e5979e2685976daf35441..1bd78d23fbaf46e6265ba0db25ea399a204bd96f 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_MATRIX_STUB_H_
 #define HL_MATRIX_STUB_H_
 
@@ -26,48 +25,30 @@ inline void hl_matrix_add(real* A_d,
                           real alpha,
                           real beta) {}
 
-inline void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {}
+inline void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {}
 
-inline void hl_sequence_softmax_forward(real *A_d,
-                                        real *C_d,
+inline void hl_sequence_softmax_forward(real* A_d,
+                                        real* C_d,
                                         const int* index,
                                         int numSequence) {}
 
-inline void hl_matrix_softmax_derivative(real* grad_d,
-                                         real* output_d,
-                                         real* sftmaxSum_d,
-                                         int dimM,
-                                         int dimN) {}
-
-inline void hl_matrix_classification_error(real* A_d,
-                                           int* B_d,
-                                           real* C_d,
-                                           int dimM,
-                                           int dimN) {}
-
-inline void hl_matrix_cross_entropy(real* A_d,
-                                    real* C_d,
-                                    int* label_d,
-                                    int dimM,
-                                    int dimN) {}
-
-inline void hl_matrix_cross_entropy_bp(real* grad_d,
-                                       real* output_d,
-                                       int* label_d,
-                                       int dimM,
-                                       int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy(real* output,
-                                                 real* entropy,
-                                                 hl_sparse_matrix_s mat,
-                                                 int dimM,
-                                                 int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy_bp(real* output,
-                                                    real* grad,
-                                                    hl_sparse_matrix_s mat,
-                                                    int dimM,
-                                                    int dimN) {}
+inline void hl_matrix_softmax_derivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
+
+inline void hl_matrix_classification_error(
+    real* A_d, int* B_d, real* C_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy(
+    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_cross_entropy_bp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy(
+    real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN) {}
+
+inline void hl_matrix_multi_binary_cross_entropy_bp(
+    real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN) {}
 
 inline void hl_matrix_zero_mem(real* data, int num) {}
 
@@ -101,7 +82,6 @@ inline void hl_cossim(real* output,
                       int input2_height,
                       real scale) {}
 
-
 inline void hl_cossim_derivative(real* grad,
                                  real* output,
                                  real* prevOutX,
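
The signature of `hl_matrix_cross_entropy` suggests dimM rows of dimN class probabilities and one integer label per row. A hedged scalar reference of that reading (semantics inferred from the signature, not taken from the source):

```cpp
#include <cmath>

// Negative log-likelihood of the labelled class, one value per row.
void cross_entropy_ref(const float* A, float* C, const int* label,
                       int dimM, int dimN) {
  for (int i = 0; i < dimM; ++i) {
    C[i] = -std::log(A[i * dimN + label[i]]);
  }
}
```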
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index aabd956c37f7dce48a379b995ab88a53aa65c760..381f0a6f26c5669465f029e972c6ca8b0e6e1776 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_SEQUENCE_STUB_H_
 #define HL_SEQUENCE_STUB_H_
 
@@ -21,15 +20,12 @@ limitations under the License. */
 inline void hl_max_sequence_forward(real* input,
                                     const int* sequence,
                                     real* output,
-                                    int *index,
+                                    int* index,
                                     int numSequences,
                                     int dim) {}
 
-inline void hl_max_sequence_backward(real* outputGrad,
-                                     int *index,
-                                     real* inputGrad,
-                                     int numSequences,
-                                     int dim) {}
+inline void hl_max_sequence_backward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
 
 inline void hl_context_projection_forward(real* input,
                                           const int* sequence,
@@ -60,16 +56,16 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
                                                   int contextStart,
                                                   int beginPad) {}
 
-inline void hl_sequence2batch_copy(real *batch,
-                                   real *sequence,
-                                   const int *batchIndex,
+inline void hl_sequence2batch_copy(real* batch,
+                                   real* sequence,
+                                   const int* batchIndex,
                                    int seqWidth,
                                    int batchCount,
                                    bool seq2batch) {}
 
-inline void hl_sequence2batch_add(real *batch,
-                                  real *sequence,
-                                  int *batchIndex,
+inline void hl_sequence2batch_add(real* batch,
+                                  real* sequence,
+                                  int* batchIndex,
                                   int seqWidth,
                                   int batchCount,
                                   bool seq2batch) {}
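
Reading the `hl_sequence2batch_copy` signature, `batchIndex` appears to map each of the batchCount rows of the batch matrix to a row of the sequence matrix, with `seq2batch` picking the copy direction and seqWidth the row width. A sketch under that assumption only:

```cpp
// Row gather/scatter between sequence-major and batch-major layouts.
void sequence2batch_copy_ref(float* batch, float* sequence,
                             const int* batchIndex, int seqWidth,
                             int batchCount, bool seq2batch) {
  for (int i = 0; i < batchCount; ++i) {
    float* b = batch + i * seqWidth;
    float* s = sequence + batchIndex[i] * seqWidth;
    for (int j = 0; j < seqWidth; ++j) {
      if (seq2batch) {
        b[j] = s[j];
      } else {
        s[j] = b[j];
      }
    }
  }
}
```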
diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/cuda/include/stub/hl_sparse_stub.h
index 346a1900dda5825e9a4311a2c51e8a50e6e7df0b..d47bdd2c47d097c4c68b7b7e88ef888bc18270c2 100644
--- a/paddle/cuda/include/stub/hl_sparse_stub.h
+++ b/paddle/cuda/include/stub/hl_sparse_stub.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifndef HL_SPARSE_STUB_H_
 #define HL_SPARSE_STUB_H_
 
@@ -20,7 +19,7 @@ limitations under the License. */
 
 inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
                                     hl_matrix_format_t format,
-                                    hl_matrix_value_t  value_type,
+                                    hl_matrix_value_t value_type,
                                     int dimM,
                                     int dimN,
                                     int nnz) {}
@@ -28,20 +27,20 @@ inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
 inline void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {}
 
 inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       void * dest_d,
+                                       void *dest_d,
                                        size_t size,
                                        hl_matrix_format_t format,
-                                       hl_matrix_value_t  value_type,
+                                       hl_matrix_value_t value_type,
                                        int dimM,
                                        int dimN,
                                        int nnz) {}
 
 inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       real* value_d,
-                                       int* rows_d,
-                                       int* cols_d,
+                                       real *value_d,
+                                       int *rows_d,
+                                       int *cols_d,
                                        hl_matrix_format_t format,
-                                       hl_matrix_value_t  value_type,
+                                       hl_matrix_value_t value_type,
                                        int dimM,
                                        int dimN,
                                        int nnz) {}
@@ -87,10 +86,14 @@ inline void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
 
 inline void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
                                     hl_trans_op_t transa,
-                                    real *B_d, hl_trans_op_t transb,
+                                    real *B_d,
+                                    hl_trans_op_t transb,
                                     real *C_d,
-                                    int dimM, int dimN, int dimK,
-                                    real alpha, real beta) {}
+                                    int dimM,
+                                    int dimN,
+                                    int dimK,
+                                    real alpha,
+                                    real beta) {}
 
 inline void hl_matrix_dense_mul_csc(real *A_d,
                                     hl_trans_op_t transa,
@@ -103,18 +106,27 @@ inline void hl_matrix_dense_mul_csc(real *A_d,
                                     real alpha,
                                     real beta) {}
 
-inline void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa,
-                                 real *B_d, hl_trans_op_t transb,
+inline void hl_sparse_matrix_mul(real *A_d,
+                                 hl_trans_op_t transa,
+                                 real *B_d,
+                                 hl_trans_op_t transb,
                                  hl_sparse_matrix_s C_d,
-                                 int dimM, int dimN, int dimK,
-                                 real alpha, real beta) {}
+                                 int dimM,
+                                 int dimN,
+                                 int dimK,
+                                 real alpha,
+                                 real beta) {}
 
-inline void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
+inline void hl_matrix_dense_mul_csr(real *A_d,
+                                    hl_trans_op_t transa,
                                     hl_sparse_matrix_s B_d,
                                     hl_trans_op_t transb,
                                     real *C_d,
-                                    int dimM, int dimN, int dimK,
-                                    real alpha, real beta) {}
+                                    int dimM,
+                                    int dimN,
+                                    int dimK,
+                                    real alpha,
+                                    real beta) {}
 
 inline void hl_memcpy_from_csc_matrix(real *csc_val,
                                       size_t val_size,
@@ -134,49 +146,39 @@ inline void hl_memcpy_from_csr_matrix(real *csr_val,
                                       hl_sparse_matrix_s csr_matrix,
                                       hl_stream_t stream) {}
 
-inline void hl_sparse_matrix_column_sum(real* A_d,
-                                        hl_sparse_matrix_s B_d,
-                                        int dimM,
-                                        int dimN,
-                                        real scale) {}
+inline void hl_sparse_matrix_column_sum(
+    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
 
-inline void hl_matrix_csr_column_sum(real* A_d,
-                                     hl_sparse_matrix_s B_d,
-                                     int dimM,
-                                     int dimN,
-                                     real scale) {}
+inline void hl_matrix_csr_column_sum(
+    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
 
 inline void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
-                                      real* B_d,
+                                      real *B_d,
                                       real scale) {}
 
 inline void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
-                                   real* B_d,
+                                   real *B_d,
                                    real scale) {}
 
 inline void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
-                                       real* B_d,
+                                       real *B_d,
                                        int dimM,
                                        int dimN,
                                        real alpha,
                                        real beta) {}
 
 inline void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
-                                    real* B_d,
+                                    real *B_d,
                                     int dimM,
                                     int dimN,
                                     real alpha,
                                     real beta) {}
 
-inline int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
-  return NULL;
-}
+inline int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { return NULL; }
 
-inline int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
-  return NULL;
-}
+inline int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { return NULL; }
 
-inline real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
+inline real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
   return NULL;
 }
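
The copy routines above move a sparse matrix as three arrays: nnz values, nnz column (or row) indices, and compressed offsets. For CSR that means dimM + 1 row offsets; a reference y = A * x over that layout (a sketch of the storage convention, not the library's kernel):

```cpp
// CSR matrix-vector product: row_ptr has dimM + 1 entries, and
// row_ptr[i]..row_ptr[i+1] delimit row i's values and column indices.
void csr_matvec(const float* val, const int* row_ptr, const int* col_idx,
                const float* x, float* y, int dimM) {
  for (int i = 0; i < dimM; ++i) {
    float acc = 0.0f;
    for (int p = row_ptr[i]; p < row_ptr[i + 1]; ++p) {
      acc += val[p] * x[col_idx[p]];
    }
    y[i] = acc;
  }
}
```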
 
diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h
index 2922d4dc2937662d66fb2433f4883448ba21fa3f..2412ed5abc13b2a83521a75524f581e106788b60 100644
--- a/paddle/cuda/src/avx_mathfun.h
+++ b/paddle/cuda/src/avx_mathfun.h
@@ -32,32 +32,35 @@
 #include <immintrin.h>
 
 /* yes I know, the top of this file is quite ugly */
-# define ALIGN32_BEG
-# define ALIGN32_END __attribute__((aligned(32)))
+#define ALIGN32_BEG
+#define ALIGN32_END __attribute__((aligned(32)))
 
 /* __m128 is ugly to write */
-typedef __m256  v8sf; // vector of 8 float (avx)
-typedef __m256i v8si; // vector of 8 int   (avx)
-typedef __m128i v4si; // vector of 8 int   (avx)
+typedef __m256 v8sf;   // vector of 8 float (avx)
+typedef __m256i v8si;  // vector of 8 int   (avx)
+typedef __m128i v4si;  // vector of 4 int   (sse)
 
-#define _PI32AVX_CONST(Name, Val)                                            \
-  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val }
+#define _PI32AVX_CONST(Name, Val)                                 \
+  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \
+      Val, Val, Val, Val}
 
 _PI32AVX_CONST(1, 1);
 _PI32AVX_CONST(inv1, ~1);
 _PI32AVX_CONST(2, 2);
 _PI32AVX_CONST(4, 4);
 
-
 /* declare some AVX constants -- why can't I figure a better way to do that? */
-#define _PS256_CONST(Name, Val)                                            \
-  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PI32_CONST256(Name, Val)                                            \
-  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-#define _PS256_CONST_TYPE(Name, Type, Val)                                 \
-  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val }
-
-_PS256_CONST(1  , 1.0f);
+#define _PS256_CONST(Name, Val)                                   \
+  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
+      Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PI32_CONST256(Name, Val)                                  \
+  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
+      Val, Val, Val, Val, Val, Val, Val, Val}
+#define _PS256_CONST_TYPE(Name, Type, Val)                       \
+  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
+      Val, Val, Val, Val, Val, Val, Val, Val}
+
+_PS256_CONST(1, 1.0f);
 _PS256_CONST(0p5, 0.5f);
 /* the smallest non denormalized float number */
 _PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
@@ -76,14 +79,14 @@ _PI32_CONST256(0x7f, 0x7f);
 
 _PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
 _PS256_CONST(cephes_log_p0, 7.0376836292E-2);
-_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
 _PS256_CONST(cephes_log_p2, 1.1676998740E-1);
-_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
-_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
-_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
-_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
-_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
-_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
+_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
+_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
+_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
+_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
+_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
 _PS256_CONST(cephes_log_q1, -2.12194440e-4);
 _PS256_CONST(cephes_log_q2, 0.693359375);
 
@@ -94,50 +97,51 @@ typedef union imm_xmm_union {
   v4si xmm[2];
 } imm_xmm_union;
 
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) {    \
-    imm_xmm_union u __attribute__((aligned(32)));  \
-    u.imm = imm_;				   \
-    xmm0_ = u.xmm[0];                            \
-    xmm1_ = u.xmm[1];                            \
-}
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) {                       \
+#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_)       \
+  {                                               \
     imm_xmm_union u __attribute__((aligned(32))); \
-    u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
+    u.imm = imm_;                                 \
+    xmm0_ = u.xmm[0];                             \
+    xmm1_ = u.xmm[1];                             \
   }
 
+#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_)       \
+  {                                               \
+    imm_xmm_union u __attribute__((aligned(32))); \
+    u.xmm[0] = xmm0_;                             \
+    u.xmm[1] = xmm1_;                             \
+    imm_ = u.imm;                                 \
+  }
 
-#define AVX2_BITOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, int a) \
-{ \
-  /* use SSE2 instruction to perform the bitop AVX2 */ \
-  v4si x1, x2; \
-  v8si ret; \
-  COPY_IMM_TO_XMM(x, x1, x2); \
-  x1 = _mm_##fn(x1,a); \
-  x2 = _mm_##fn(x2,a); \
-  COPY_XMM_TO_IMM(x1, x2, ret); \
-  return(ret); \
-}
+#define AVX2_BITOP_USING_SSE2(fn)                        \
+  static inline v8si avx2_mm256_##fn(v8si x, int a) {    \
+    /* use SSE2 instruction to perform the bitop AVX2 */ \
+    v4si x1, x2;                                         \
+    v8si ret;                                            \
+    COPY_IMM_TO_XMM(x, x1, x2);                          \
+    x1 = _mm_##fn(x1, a);                                \
+    x2 = _mm_##fn(x2, a);                                \
+    COPY_XMM_TO_IMM(x1, x2, ret);                        \
+    return (ret);                                        \
+  }
 
 //#warning "Using SSE2 to perform AVX2 bitshift ops"
 AVX2_BITOP_USING_SSE2(slli_epi32)
 AVX2_BITOP_USING_SSE2(srli_epi32)
 
-#define AVX2_INTOP_USING_SSE2(fn) \
-static inline v8si avx2_mm256_##fn(v8si x, v8si y) \
-{ \
-  /* use SSE2 instructions to perform the AVX2 integer operation */ \
-  v4si x1, x2; \
-  v4si y1, y2; \
-  v8si ret; \
-  COPY_IMM_TO_XMM(x, x1, x2); \
-  COPY_IMM_TO_XMM(y, y1, y2); \
-  x1 = _mm_##fn(x1,y1); \
-  x2 = _mm_##fn(x2,y2); \
-  COPY_XMM_TO_IMM(x1, x2, ret); \
-  return(ret); \
-}
+#define AVX2_INTOP_USING_SSE2(fn)                                     \
+  static inline v8si avx2_mm256_##fn(v8si x, v8si y) {                \
+    /* use SSE2 instructions to perform the AVX2 integer operation */ \
+    v4si x1, x2;                                                      \
+    v4si y1, y2;                                                      \
+    v8si ret;                                                         \
+    COPY_IMM_TO_XMM(x, x1, x2);                                       \
+    COPY_IMM_TO_XMM(y, y1, y2);                                       \
+    x1 = _mm_##fn(x1, y1);                                            \
+    x2 = _mm_##fn(x2, y2);                                            \
+    COPY_XMM_TO_IMM(x1, x2, ret);                                     \
+    return (ret);                                                     \
+  }
 
 //#warning "Using SSE2 to perform AVX2 integer ops"
 AVX2_INTOP_USING_SSE2(and_si128)
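
On CPUs with AVX but not AVX2, these macros run each 256-bit integer op as two 128-bit SSE2 ops, shuttling the halves through an aligned union. The same split can be written with cast/extract/insert intrinsics; a sketch for the `add_epi32` case (an alternative formulation, not the file's macro):

```cpp
#include <immintrin.h>

// AVX1 + SSE2 emulation of _mm256_add_epi32: operate on the two
// 128-bit lanes separately, then reassemble the 256-bit result.
static inline __m256i add_epi32_avx1(__m256i x, __m256i y) {
  __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(x),
                             _mm256_castsi256_si128(y));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(x, 1),
                             _mm256_extractf128_si256(y, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
```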
@@ -157,84 +161,83 @@ AVX2_INTOP_USING_SSE2(add_epi32)
 #define avx2_mm256_add_epi32 _mm256_add_epi32
 #endif /* __AVX2__ */
 
-
-/* natural logarithm computed for 8 simultaneous float 
+/* natural logarithm computed for 8 simultaneous float
    return NaN for x <= 0
 */
 v8sf log256_ps(v8sf x) {
   v8si imm0;
-  v8sf one = *(v8sf*)_ps256_1;
+  v8sf one = *(v8sf *)_ps256_1;
 
-  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
+  // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
   v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
 
-  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */
+  x = _mm256_max_ps(
+      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
 
   // can be done with AVX2
   imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
 
   /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
+  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
 
   // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
   v8sf e = _mm256_cvtepi32_ps(imm0);
 
   e = _mm256_add_ps(e, one);
 
-  /* part2: 
+  /* part2:
      if( x < SQRTHF ) {
        e -= 1;
        x = x + x - 1.0;
      } else { x = x - 1.0; }
   */
-  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
+  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
   v8sf tmp = _mm256_and_ps(x, mask);
   x = _mm256_sub_ps(x, one);
   e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
   x = _mm256_add_ps(x, tmp);
 
-  v8sf z = _mm256_mul_ps(x,x);
+  v8sf z = _mm256_mul_ps(x, x);
 
-  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
+  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
   y = _mm256_mul_ps(y, x);
 
   y = _mm256_mul_ps(y, z);
-  
-  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
-  y = _mm256_add_ps(y, tmp);
 
+  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  y = _mm256_add_ps(y, tmp);
 
-  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
   y = _mm256_sub_ps(y, tmp);
 
-  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
+  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
   x = _mm256_add_ps(x, y);
   x = _mm256_add_ps(x, tmp);
-  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
+  x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
   return x;
 }
 
-_PS256_CONST(exp_hi,	88.3762626647949f);
-_PS256_CONST(exp_lo,	-88.3762626647949f);
+_PS256_CONST(exp_hi, 88.3762626647949f);
+_PS256_CONST(exp_lo, -88.3762626647949f);
 
 _PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
 _PS256_CONST(cephes_exp_C1, 0.693359375);
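
`log256_ps` above is the Cephes recipe vectorized: split x into mantissa and exponent, fold the mantissa into [sqrt(1/2), sqrt(2)) via the SQRTHF mask, evaluate a degree-8 polynomial, and add the exponent back through the split constants q1 and q2 (which sum to ln 2). A scalar transcription using the same `_ps256_cephes_log_*` coefficients (the sign/NaN mask from the vector code is omitted here):

```cpp
#include <cmath>

float logf_cephes(float x) {
  int e;
  float m = std::frexp(x, &e);              // x = m * 2^e, m in [0.5, 1)
  if (m < 0.707106781186547524f) {          // cephes_SQRTHF: fold m into
    m = m + m;                              // [sqrt(0.5), sqrt(2)) and
    e -= 1;                                 // compensate in the exponent
  }
  m -= 1.0f;                                // polynomial argument u = m - 1
  const float z = m * m;
  float y = 7.0376836292E-2f;               // cephes_log_p0 ...
  y = y * m - 1.1514610310E-1f;
  y = y * m + 1.1676998740E-1f;
  y = y * m - 1.2420140846E-1f;
  y = y * m + 1.4249322787E-1f;
  y = y * m - 1.6668057665E-1f;
  y = y * m + 2.0000714765E-1f;
  y = y * m - 2.4999993993E-1f;
  y = y * m + 3.3333331174E-1f;             // ... cephes_log_p8
  y = y * m * z;                            // u^3 * P(u)
  y += e * -2.12194440e-4f;                 // cephes_log_q1
  y -= 0.5f * z;                            // log(1+u) = u - u^2/2 + ...
  return m + y + e * 0.693359375f;          // cephes_log_q2
}
```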
@@ -250,45 +253,45 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
 v8sf exp256_ps(v8sf x) {
   v8sf tmp = _mm256_setzero_ps(), fx;
   v8si imm0;
-  v8sf one = *(v8sf*)_ps256_1;
+  v8sf one = *(v8sf *)_ps256_1;
 
-  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);
+  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
+  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
 
   /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);
+  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
+  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
 
   /* how to perform a floorf with SSE: just below */
-  //imm0 = _mm256_cvttps_epi32(fx);
-  //tmp  = _mm256_cvtepi32_ps(imm0);
-  
+  // imm0 = _mm256_cvttps_epi32(fx);
+  // tmp  = _mm256_cvtepi32_ps(imm0);
+
   tmp = _mm256_floor_ps(fx);
 
   /* if greater, subtract 1 */
-  //v8sf mask = _mm256_cmpgt_ps(tmp, fx);    
-  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);    
+  // v8sf mask = _mm256_cmpgt_ps(tmp, fx);
+  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
   mask = _mm256_and_ps(mask, one);
   fx = _mm256_sub_ps(tmp, mask);
 
-  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
+  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
+  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
   x = _mm256_sub_ps(x, tmp);
   x = _mm256_sub_ps(x, z);
 
-  z = _mm256_mul_ps(x,x);
-  
-  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
+  z = _mm256_mul_ps(x, x);
+
+  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
+  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
   y = _mm256_mul_ps(y, z);
   y = _mm256_add_ps(y, x);
   y = _mm256_add_ps(y, one);
@@ -296,7 +299,7 @@ v8sf exp256_ps(v8sf x) {
   /* build 2^n */
   imm0 = _mm256_cvttps_epi32(fx);
   // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
   imm0 = avx2_mm256_slli_epi32(imm0, 23);
   v8sf pow2n = _mm256_castsi256_ps(imm0);
   y = _mm256_mul_ps(y, pow2n);
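
`exp256_ps` reduces over exp(x) = 2^n * exp(g) with n = round(x * log2(e)) and g = x - n * ln(2), where ln(2) is split into cephes_exp_C1 + cephes_exp_C2 so the subtraction loses no bits; 2^n is then built by sliding n + 127 into the float exponent field (the add_epi32/slli_epi32 pair at the end). A scalar transcription, with ldexp standing in for the exponent-field trick:

```cpp
#include <cmath>

float expf_cephes(float x) {
  x = std::fmin(x, 88.3762626647949f);      // _ps256_exp_hi clamp
  x = std::fmax(x, -88.3762626647949f);     // _ps256_exp_lo clamp
  float n = std::floor(x * 1.44269504088896341f + 0.5f);  // cephes_LOG2EF
  x -= n * 0.693359375f;                    // cephes_exp_C1
  x -= n * -2.12194440e-4f;                 // cephes_exp_C2
  const float z = x * x;
  float y = 1.9875691500E-4f;               // cephes_exp_p0 ...
  y = y * x + 1.3981999507E-3f;
  y = y * x + 8.3334519073E-3f;
  y = y * x + 4.1665795894E-2f;
  y = y * x + 1.6666665459E-1f;
  y = y * x + 5.0000001201E-1f;             // ... cephes_exp_p5
  y = y * z + x + 1.0f;                     // exp(g) ~ 1 + g + g^2 * P(g)
  return std::ldexp(y, static_cast<int>(n));  // multiply by 2^n
}
```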
@@ -307,13 +310,12 @@ _PS256_CONST(minus_cephes_DP1, -0.78515625);
 _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
 _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
 _PS256_CONST(sincof_p0, -1.9515295891E-4);
-_PS256_CONST(sincof_p1,  8.3321608736E-3);
+_PS256_CONST(sincof_p1, 8.3321608736E-3);
 _PS256_CONST(sincof_p2, -1.6666654611E-1);
-_PS256_CONST(coscof_p0,  2.443315711809948E-005);
+_PS256_CONST(coscof_p0, 2.443315711809948E-005);
 _PS256_CONST(coscof_p1, -1.388731625493765E-003);
-_PS256_CONST(coscof_p2,  4.166664568298827E-002);
-_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
-
+_PS256_CONST(coscof_p2, 4.166664568298827E-002);
+_PS256_CONST(cephes_FOPI, 1.27323954473516);  // 4 / M_PI
 
 /* evaluation of 8 sines at once using AVX intrinsics
 
@@ -327,7 +329,7 @@ _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
    surprising but correct result.
 
 */
-v8sf sin256_ps(v8sf x) { // any x
+v8sf sin256_ps(v8sf x) {  // any x
   v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
   v8si imm0, imm2;
 
@@ -338,78 +340,78 @@ v8sf sin256_ps(v8sf x) { // any x
 
   sign_bit = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
   /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
-  
+  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
 
-  /*
-    Here we start a series of integer operations, which are in the
-    realm of AVX2.
-    If we don't have AVX, let's perform them using SSE2 directives
-  */
+/*
+  Here we start a series of integer operations, which are in the
+  realm of AVX2.
+  If we don't have AVX, let's perform them using SSE2 directives
+*/
 
 #ifdef __AVX2__
   /* store the integer part of y in mm0 */
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
+  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
+  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
   y = _mm256_cvtepi32_ps(imm2);
 
   /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
+  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  /* get the polynom selection mask 
+  /* get the polynom selection mask
      there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2 */

diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/cuda/src/hl_avx_functions.cc
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
 #include <immintrin.h>
 #include "hl_functions.h"
 
 namespace hppl {
 
-  extern __m256 exp(__m256 a);
+extern __m256 exp(__m256 a);
 
-  __m256 relu(const __m256 a) {
-    __m256 tmp = _mm256_set1_ps(0.0f);
-    return _mm256_max_ps(a, tmp);
-  }
+__m256 relu(const __m256 a) {
+  __m256 tmp = _mm256_set1_ps(0.0f);
+  return _mm256_max_ps(a, tmp);
+}
 
-  __m256 sigmoid(const __m256 a) {
-    __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
-    __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
-    __m256 tmp = _mm256_max_ps(a, min);
-    tmp = _mm256_min_ps(tmp, max);
-    tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
-    tmp = exp(tmp);
-    tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
-    tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
-    return tmp;
-  }
+__m256 sigmoid(const __m256 a) {
+  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+  __m256 tmp = _mm256_max_ps(a, min);
+  tmp = _mm256_min_ps(tmp, max);
+  tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
+  tmp = exp(tmp);
+  tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
+  tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
+  return tmp;
+}
 
-  __m256 tanh(const __m256 a) {
-    __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
-    __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
-    tmp = _mm256_min_ps(tmp, max);
-    tmp = exp(tmp);
-    return _mm256_sub_ps(
-        _mm256_div_ps(_mm256_set1_ps(2.0f),
-        _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), _mm256_set1_ps(1.0f));
-  }
+__m256 tanh(const __m256 a) {
+  __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
+  __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+  tmp = _mm256_min_ps(tmp, max);
+  tmp = exp(tmp);
+  return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
+                                     _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
+                       _mm256_set1_ps(1.0f));
+}
 
-  __m256 linear(const __m256 a) {
-    return a;
-  }
+__m256 linear(const __m256 a) { return a; }
 
-  __m256 relu(const __m256 a, const __m256 b) {
-    return _mm256_mul_ps(a,
+__m256 relu(const __m256 a, const __m256 b) {
+  return _mm256_mul_ps(
+      a,
       _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
-      _mm256_set1_ps(1.0f)));
-  }
+                    _mm256_set1_ps(1.0f)));
+}
 
-  __m256 sigmoid(const __m256 a, const __m256 b) {
-    return _mm256_mul_ps(_mm256_mul_ps(a, b),
-        _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
-  }
+__m256 sigmoid(const __m256 a, const __m256 b) {
+  return _mm256_mul_ps(_mm256_mul_ps(a, b),
+                       _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
+}
 
-  __m256 tanh(const __m256 a, const __m256 b) {
-    return _mm256_mul_ps(a,
-      _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
-  }
+__m256 tanh(const __m256 a, const __m256 b) {
+  return _mm256_mul_ps(
+      a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
+}
 
-  __m256 linear(const __m256 a, const __m256 b) {
-    return a;
-  }
+__m256 linear(const __m256 a, const __m256 b) { return a; }
 }  // namespace hppl
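
A scalar sketch of the math the AVX sigmoid above vectorizes: clamp the input
into [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX] so exp() cannot overflow,
then evaluate 1 / (1 + exp(-x)); each _mm256 intrinsic performs the same step
on eight floats at once. The threshold values below are illustrative
placeholders, not the actual constants from hl_functions.h:

    #include <algorithm>
    #include <cmath>

    // Placeholder bounds; the real values are defined in hl_functions.h.
    static const float kSigmoidMin = -40.0f;
    static const float kSigmoidMax = 13.0f;

    float sigmoid_scalar(float a) {
      // Clamp first, mirroring _mm256_max_ps/_mm256_min_ps above.
      float tmp = std::min(std::max(a, kSigmoidMin), kSigmoidMax);
      return 1.0f / (1.0f + std::exp(-tmp));
    }
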
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index b8352c2d537fba5ec9cd3237fe8f3fa9c25cbffe..af00f352e536bf342e15315d1f6804225b87eb0b 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -12,46 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include <math.h>
 #include "hl_functions.h"
 
 namespace hppl {
 
-  real relu(const real a) {
-    return a > 0.0f ? a : 0.0f;
-  }
-
-  real sigmoid(const real a) {
-    const real min = SIGMOID_THRESHOLD_MIN;
-    const real max = SIGMOID_THRESHOLD_MAX;
-    real tmp = (a < min) ? min : ((a > max) ? max : a);
-    return 1.0 / (1.0 + exp(-tmp));
-  }
-
-  real tanh(const real a) {
-    real tmp = -2.0 * a;
-    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-    return (2.0 / (1.0 + exp(tmp))) - 1.0;
-  }
-
-  real linear(const real a) {
-    return a;
-  }
-
-  real relu(const real a, const real b) {
-    return a * (b > 0.0f ? 1.0f : 0.0f);
-  }
-
-  real sigmoid(const real a, const real b) {
-    return a * b * (1 - b);
-  }
-
-  real tanh(const real a, const real b) {
-    return a * (1.0f - b * b);
-  }
-
-  real linear(const real a, const real b) {
-    return a;
-  }
+real relu(const real a) { return a > 0.0f ? a : 0.0f; }
+
+real sigmoid(const real a) {
+  const real min = SIGMOID_THRESHOLD_MIN;
+  const real max = SIGMOID_THRESHOLD_MAX;
+  real tmp = (a < min) ? min : ((a > max) ? max : a);
+  return 1.0 / (1.0 + exp(-tmp));
+}
+
+real tanh(const real a) {
+  real tmp = -2.0 * a;
+  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+  return (2.0 / (1.0 + exp(tmp))) - 1.0;
+}
+
+real linear(const real a) { return a; }
+
+real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); }
+
+real sigmoid(const real a, const real b) { return a * b * (1 - b); }
+
+real tanh(const real a, const real b) { return a * (1.0f - b * b); }
+
+real linear(const real a, const real b) { return a; }
 }  // namespace hppl
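
The two-argument overloads above are the backward counterparts of the
one-argument activations: a is the incoming gradient and b is the activation's
forward output, so each returns a times the derivative written in terms of
that output (sigmoid' = b * (1 - b), tanh' = 1 - b * b, relu' = (b > 0 ? 1 : 0)).
A small self-contained check of the sigmoid identity, using an arbitrarily
chosen test point and finite-difference step:

    #include <cmath>
    #include <cstdio>

    int main() {
      double x = 0.7;                         // arbitrary test point
      double y = 1.0 / (1.0 + std::exp(-x));  // forward sigmoid
      double analytic = y * (1.0 - y);        // derivative via the output
      double h = 1e-6;                        // finite-difference step
      double numeric = (1.0 / (1.0 + std::exp(-(x + h))) -
                        1.0 / (1.0 + std::exp(-(x - h)))) / (2.0 * h);
      std::printf("analytic=%.8f numeric=%.8f\n", analytic, numeric);
      return 0;
    }
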
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index abf6afadc218f615dc6b3cf734d09f072214be40..e8ba232d44b3f66254d4749d4abbcfbe46d1fd0e 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include <sys/time.h>
 #include <mutex>
 #include "hl_cuda.h"
@@ -24,7 +23,7 @@ limitations under the License. */
 namespace dynload {
 
 std::once_flag cublas_dso_flag;
-void* cublas_dso_handle = nullptr;
+void *cublas_dso_handle = nullptr;
 
 /**
  * The following macro definition can generate structs
@@ -34,31 +33,30 @@ void* cublas_dso_handle = nullptr;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                          \
-   struct DynLoad__##__name {                                     \
-    template <typename... Args>                 \
-    cublasStatus_t operator()(Args... args) {                     \
-        typedef cublasStatus_t (*cublasFunc)(Args...);            \
-        std::call_once(cublas_dso_flag, GetCublasDsoHandle,       \
-                      &cublas_dso_handle);                        \
-        void* p_##__name = dlsym(cublas_dso_handle, #__name);     \
-        return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
-    }                                                             \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    cublasStatus_t operator()(Args... args) {                                  \
+      typedef cublasStatus_t (*cublasFunc)(Args...);                           \
+      std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \
+      void *p_##__name = dlsym(cublas_dso_handle, #__name);                    \
+      return reinterpret_cast<cublasFunc>(p_##__name)(args...);                \
+    }                                                                          \
   } __name;  // struct DynLoad__##__name
 #else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                          \
-   struct DynLoad__##__name {                                     \
-    template <typename... Args>                 \
-    cublasStatus_t operator()(Args... args) {                     \
-      return __name(args...);                                     \
-    }                                                             \
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)      \
+  struct DynLoad__##__name {                  \
+    template <typename... Args>               \
+    cublasStatus_t operator()(Args... args) { \
+      return __name(args...);                 \
+    }                                         \
   } __name;  // struct DynLoad__##__name
 #endif
 
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
-  DYNAMIC_LOAD_CUBLAS_WRAP(__name)
+#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
 
 // include all needed cublas functions in HPPL
+// clang-format off
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
   __macro(cublasSgemv)                    \
   __macro(cublasDgemv)                    \
@@ -88,41 +86,41 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
 
 } /* namespace dynload */
 
-
+// clang-format on
 #ifndef PADDLE_TYPE_DOUBLE
-#define     CUBLAS_GEAM     dynload::cublasSgeam
-#define     CUBLAS_GEMV     dynload::cublasSgemv
-#define     CUBLAS_GEMM     dynload::cublasSgemm
-#define     CUBLAS_GETRF    dynload::cublasSgetrfBatched
-#define     CUBLAS_GETRI    dynload::cublasSgetriBatched
+#define CUBLAS_GEAM dynload::cublasSgeam
+#define CUBLAS_GEMV dynload::cublasSgemv
+#define CUBLAS_GEMM dynload::cublasSgemm
+#define CUBLAS_GETRF dynload::cublasSgetrfBatched
+#define CUBLAS_GETRI dynload::cublasSgetriBatched
 #else
-#define     CUBLAS_GEAM     dynload::cublasDgeam
-#define     CUBLAS_GEMV     dynload::cublasDgemv
-#define     CUBLAS_GEMM     dynload::cublasDgemm
-#define     CUBLAS_GETRF    dynload::cublasDgetrfBatched
-#define     CUBLAS_GETRI    dynload::cublasDgetriBatched
+#define CUBLAS_GEAM dynload::cublasDgeam
+#define CUBLAS_GEMV dynload::cublasDgemv
+#define CUBLAS_GEMM dynload::cublasDgemm
+#define CUBLAS_GETRF dynload::cublasDgetrfBatched
+#define CUBLAS_GETRI dynload::cublasDgetriBatched
 #endif
 
-const char* hl_cublas_get_error_string(cublasStatus_t status) {
+const char *hl_cublas_get_error_string(cublasStatus_t status) {
   switch (status) {
-     case CUBLAS_STATUS_NOT_INITIALIZED:
-        return "[cublas status]: not initialized";
-     case CUBLAS_STATUS_ALLOC_FAILED:
-        return "[cublas status]: allocate failed";
-     case CUBLAS_STATUS_INVALID_VALUE:
-        return "[cublas status]: invalid value";
-     case CUBLAS_STATUS_ARCH_MISMATCH:
-        return "[cublas status]: arch mismatch";
-     case CUBLAS_STATUS_MAPPING_ERROR:
-        return "[cublas status]: mapping error";
-     case CUBLAS_STATUS_EXECUTION_FAILED:
-        return "[cublas status]: execution failed";
-     case CUBLAS_STATUS_INTERNAL_ERROR:
-        return "[cublas status]: internal error";
-     case CUBLAS_STATUS_SUCCESS:
-        return "[cublas status]: success";
-     default:
-        return "[cublas status]: unknown error";
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "[cublas status]: not initialized";
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "[cublas status]: allocate failed";
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "[cublas status]: invalid value";
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "[cublas status]: arch mismatch";
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "[cublas status]: mapping error";
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "[cublas status]: execution failed";
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "[cublas status]: internal error";
+    case CUBLAS_STATUS_SUCCESS:
+      return "[cublas status]: success";
+    default:
+      return "[cublas status]: unknown error";
   }
 }
 
@@ -131,27 +129,21 @@ const char* hl_cublas_get_error_string(cublasStatus_t status) {
  * support the << operator for more detailed error info.
  */
 cublasStatus_t g_cublasStat;
-#define CHECK_CUBLAS(cublas_func)                 \
-  g_cublasStat = cublas_func;                     \
-  CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat)   \
-      << "Cublas Error: "                         \
-      << hl_cublas_get_error_string(g_cublasStat) \
-      << " "
+#define CHECK_CUBLAS(cublas_func)               \
+  g_cublasStat = cublas_func;                   \
+  CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
+      << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " "
 
 void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) {
   CHECK_CUBLAS(dynload::cublasCreate(cublas_handle))
-    << "[cublas init] Cublas create handle faild!";
+      << "[cublas init] Cublas create handle faild!";
 
   CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
-    << "[cublas init] Cublas set stream faild!";
+      << "[cublas init] Cublas set stream faild!";
 }
 
-void hl_matrix_transpose(real *A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN,
-                         int lda,
-                         int ldc) {
+void hl_matrix_transpose(
+    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {
   real alpha = 1.0;
   real beta = 0.0;
 
@@ -159,11 +151,18 @@ void hl_matrix_transpose(real *A_d,
   CHECK_NOTNULL(C_d);
 
   CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle,
-               CUBLAS_OP_T, CUBLAS_OP_N,
-               dimM, dimN,
-               &alpha, A_d, lda,
-               &beta, nullptr, dimM,
-               C_d, ldc));
+                           CUBLAS_OP_T,
+                           CUBLAS_OP_N,
+                           dimM,
+                           dimN,
+                           &alpha,
+                           A_d,
+                           lda,
+                           &beta,
+                           nullptr,
+                           dimM,
+                           C_d,
+                           ldc));
   CHECK_SYNC("hl_matrix_transpose failed");
 }
 
@@ -188,13 +187,13 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
      small-sized matrices. There may be a better way to reconstruct
      the API for better performance.
    */
-  CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
-      dimN, inout_d, lda, pivot_d, info_d, 1));
+  CHECK_CUBLAS(
+      CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1));
 
   int info_h;
   hl_memcpy(&info_h, info_d, sizeof(int));
   if (info_h != 0) {
-      LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
+    LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
   }
 
   /* Step 2: Compute the inverse of the matrix given its LU decomposition */
@@ -203,12 +202,18 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   hl_memcpy(out_d, out_h, sizeof(real *));
 
   CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
-      dimN, (const real **)inout_d, lda, pivot_d,
-      out_d, ldc, info_d, 1));
+                            dimN,
+                            (const real **)inout_d,
+                            lda,
+                            pivot_d,
+                            out_d,
+                            ldc,
+                            info_d,
+                            1));
 
   hl_memcpy(&info_h, info_d, sizeof(int));
   if (info_h != 0) {
-      LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
+    LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
   }
 
   hl_free_mem_device(inout_d);
@@ -218,12 +223,19 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
   CHECK_SYNC("hl_matrix_inverse failed");
 }
 
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
-                   real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+                   hl_trans_op_t transa,
+                   real *B_d,
+                   hl_trans_op_t transb,
                    real *C_d,
-                   int dimM, int dimN, int dimK,
-                   real alpha, real beta,
-                   int lda, int ldb, int ldc) {
+                   int dimM,
+                   int dimN,
+                   int dimK,
+                   real alpha,
+                   real beta,
+                   int lda,
+                   int ldb,
+                   int ldc) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);
@@ -231,8 +243,8 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
   if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) {
     int m = (transa == HPPL_OP_N) ? dimM : dimK;
     int n = (transa == HPPL_OP_N) ? dimK : dimM;
-    hl_matrix_mul_vector(A_d, transa, B_d, C_d, m, n,
-                         alpha, beta, lda, ldb, ldc);
+    hl_matrix_mul_vector(
+        A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc);
     return;
   }
 
@@ -240,8 +252,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
     int m = (transb == HPPL_OP_N) ? dimK : dimN;
     int n = (transb == HPPL_OP_N) ? dimN : dimK;
     hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N;
-    hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n,
-                         alpha, beta, ldb, 1, 1);
+    hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1);
     return;
   }
 
@@ -250,26 +261,47 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
     stat = CUBLAS_GEMM(t_resource.handle,
                        CUBLAS_OP_N,
                        CUBLAS_OP_N,
-                       dimN, dimM, dimK,
-                       &alpha, B_d, ldb,
-                       A_d, lda,
-                       &beta, C_d, ldc);
+                       dimN,
+                       dimM,
+                       dimK,
+                       &alpha,
+                       B_d,
+                       ldb,
+                       A_d,
+                       lda,
+                       &beta,
+                       C_d,
+                       ldc);
   } else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) {
     stat = CUBLAS_GEMM(t_resource.handle,
                        CUBLAS_OP_N,
                        CUBLAS_OP_T,
-                       dimN, dimM, dimK,
-                       &alpha, B_d, ldb,
-                       A_d, lda,
-                       &beta, C_d, ldc);
+                       dimN,
+                       dimM,
+                       dimK,
+                       &alpha,
+                       B_d,
+                       ldb,
+                       A_d,
+                       lda,
+                       &beta,
+                       C_d,
+                       ldc);
   } else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) {
     stat = CUBLAS_GEMM(t_resource.handle,
                        CUBLAS_OP_T,
                        CUBLAS_OP_N,
-                       dimN, dimM, dimK,
-                       &alpha, B_d, ldb,
-                       A_d, lda,
-                       &beta, C_d, ldc);
+                       dimN,
+                       dimM,
+                       dimK,
+                       &alpha,
+                       B_d,
+                       ldb,
+                       A_d,
+                       lda,
+                       &beta,
+                       C_d,
+                       ldc);
   } else {
     LOG(FATAL) << "parameter transa error!";
   }
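
A note on the operand order above: cuBLAS assumes column-major storage while
PaddlePaddle's buffers are row-major, so the HPPL_OP_N/HPPL_OP_N branch passes
B before A and swaps dimN/dimM, computing C^T = B^T * A^T in column-major
terms, which is exactly C = A * B for the row-major data with no explicit
transpose. A minimal sketch demonstrating the trick with a plain CPU
column-major GEMM (gemm_colmajor is a stand-in written for this example, not
a cuBLAS routine):

    #include <cstdio>

    // Column-major C = alpha * A * B + beta * C; A is m x k, B k x n, C m x n.
    static void gemm_colmajor(int m, int n, int k, float alpha, const float *A,
                              int lda, const float *B, int ldb, float beta,
                              float *C, int ldc) {
      for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i) {
          float acc = 0.f;
          for (int p = 0; p < k; ++p) acc += A[i + p * lda] * B[p + j * ldb];
          C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
        }
    }

    int main() {
      // Row-major A (2x3) and B (3x2); we want row-major C = A * B (2x2).
      float A[] = {1, 2, 3, 4, 5, 6};
      float B[] = {7, 8, 9, 10, 11, 12};
      float C[4] = {0};
      int dimM = 2, dimN = 2, dimK = 3;
      // Swap the operands and the m/n dimensions, as the branch above does.
      gemm_colmajor(dimN, dimM, dimK, 1.f, B, dimN, A, dimK, 0.f, C, dimN);
      std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);  // 58 64 139 154
      return 0;
    }
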
@@ -277,24 +309,46 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
   CHECK_SYNC("hl_matrix_mul failed");
 }
 
-void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
-                   real *B_d, hl_trans_op_t transb,
+void hl_matrix_mul(real *A_d,
+                   hl_trans_op_t transa,
+                   real *B_d,
+                   hl_trans_op_t transb,
                    real *C_d,
-                   int dimM, int dimN, int dimK,
-                   real alpha, real beta) {
+                   int dimM,
+                   int dimN,
+                   int dimK,
+                   real alpha,
+                   real beta) {
   int lda = (HPPL_OP_N == transa) ? dimK : dimM;
   int ldb = (HPPL_OP_N == transb) ? dimN : dimK;
   int ldc = dimN;
 
-  hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN,
-                dimK, alpha, beta, lda, ldb, ldc);
+  hl_matrix_mul(A_d,
+                transa,
+                B_d,
+                transb,
+                C_d,
+                dimM,
+                dimN,
+                dimK,
+                alpha,
+                beta,
+                lda,
+                ldb,
+                ldc);
 }
 
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
-                          real *B_d, real *C_d,
-                          int dimM, int dimN,
-                          real alpha, real beta,
-                          int lda, int incb, int incc) {
+void hl_matrix_mul_vector(real *A_d,
+                          hl_trans_op_t trans,
+                          real *B_d,
+                          real *C_d,
+                          int dimM,
+                          int dimN,
+                          real alpha,
+                          real beta,
+                          int lda,
+                          int incb,
+                          int incc) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);
@@ -303,21 +357,29 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
   if (HPPL_OP_N == trans) {
     stat = CUBLAS_GEMV(t_resource.handle,
                        CUBLAS_OP_T,
-                       dimN, dimM,
+                       dimN,
+                       dimM,
                        &alpha,
-                       A_d, lda,
-                       B_d, incb,
+                       A_d,
+                       lda,
+                       B_d,
+                       incb,
                        &beta,
-                       C_d, incc);
+                       C_d,
+                       incc);
   } else if (HPPL_OP_T == trans) {
     stat = CUBLAS_GEMV(t_resource.handle,
                        CUBLAS_OP_N,
-                       dimN, dimM,
+                       dimN,
+                       dimM,
                        &alpha,
-                       A_d, lda,
-                       B_d, incb,
+                       A_d,
+                       lda,
+                       B_d,
+                       incb,
                        &beta,
-                       C_d, incc);
+                       C_d,
+                       incc);
   } else {
     LOG(FATAL) << "parameter transa error!";
   }
@@ -326,10 +388,14 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
   CHECK_SYNC("hl_matrix_mul_vector");
 }
 
-void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
-                          real *B_d, real *C_d,
-                          int dimM, int dimN,
-                          real alpha, real beta) {
-  hl_matrix_mul_vector(A_d, trans, B_d, C_d, dimM, dimN,
-                       alpha, beta, dimN, 1, 1);
+void hl_matrix_mul_vector(real *A_d,
+                          hl_trans_op_t trans,
+                          real *B_d,
+                          real *C_d,
+                          int dimM,
+                          int dimN,
+                          real alpha,
+                          real beta) {
+  hl_matrix_mul_vector(
+      A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1);
 }
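
For reference, here is roughly what DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemv)
expands to when PADDLE_USE_DSO is defined: a functor that opens libcublas once
via std::call_once, resolves the symbol by name with dlsym, and forwards the
arguments. This is a hand-written sketch, not the literal preprocessor output,
and the trivial GetCublasDsoHandle below is a stand-in for the loader declared
in hl_dso_loader.h:

    #include <dlfcn.h>
    #include <mutex>
    #include <cublas_v2.h>  // for cublasStatus_t (CUDA toolkit header)

    namespace dynload {

    std::once_flag cublas_dso_flag;
    void *cublas_dso_handle = nullptr;

    // Stand-in loader; the real one searches the configured library paths.
    void GetCublasDsoHandle(void **handle) {
      *handle = dlopen("libcublas.so", RTLD_LAZY);
    }

    struct DynLoad__cublasSgemv {
      template <typename... Args>
      cublasStatus_t operator()(Args... args) {
        typedef cublasStatus_t (*cublasFunc)(Args...);
        // Open the shared library exactly once, even with concurrent callers.
        std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle);
        // Resolve the real cublasSgemv and forward the call to it.
        void *p_cublasSgemv = dlsym(cublas_dso_handle, "cublasSgemv");
        return reinterpret_cast<cublasFunc>(p_cublasSgemv)(args...);
      }
    } cublasSgemv;

    }  // namespace dynload
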
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 1829fe23ac594e63253df23b350b16cb28eaebc1..9d4ff08a78d641896e946e9bf04590d4ba93350f 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include <cudnn.h>
 #include <mutex>
 #include "hl_cuda_cudnn.h"
@@ -22,9 +21,10 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/CommandLineParser.h"
 
-P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
-                "Specify cuDNN max workspace limit, in units MB, "
-                "4096MB=4GB by default.");
+P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+               4096,
+               "Specify cuDNN max workspace limit, in units MB, "
+               "4096MB=4GB by default.");
 
 namespace dynload {
 
@@ -41,16 +41,15 @@ void* cudnn_dso_handle = nullptr;
 
 #ifdef PADDLE_USE_DSO
 
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                          \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      using cudnn_func = decltype(__name(args...))(*)(Args...);  \
-      std::call_once(cudnn_dso_flag, GetCudnnDsoHandle,          \
-                     &cudnn_dso_handle);                         \
-      void* p_##__name = dlsym(cudnn_dso_handle, #__name);       \
-      return reinterpret_cast<cudnn_func>(p_##__name)(args...);  \
-    }                                                            \
+#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                                     \
+  struct DynLoad__##__name {                                                \
+    template <typename... Args>                                             \
+    auto operator()(Args... args) -> decltype(__name(args...)) {            \
+      using cudnn_func = decltype(__name(args...)) (*)(Args...);            \
+      std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \
+      void* p_##__name = dlsym(cudnn_dso_handle, #__name);                  \
+      return reinterpret_cast<cudnn_func>(p_##__name)(args...);             \
+    }                                                                       \
   } __name; /* struct DynLoad__##__name */
 
 #else
@@ -69,6 +68,7 @@ void* cudnn_dso_handle = nullptr;
  * include all needed cudnn functions in HPPL
  * different cudnn version has different interfaces
  **/
+// clang-format off
 #define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
   __macro(cudnnSetTensor4dDescriptor)                     \
   __macro(cudnnSetTensor4dDescriptorEx)                   \
@@ -141,56 +141,53 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
 #undef CUDNN_DNN_ROUTINE_EACH
-
+// clang-format on
 } /* namespace dynload */
 
 /**
  * Check built-in cudnn functions using glog; note that it **does not**
  * support the << operator for more detailed error info.
  */
-#define CHECK_CUDNN(cudnnFunc)                               \
-  do {                                                       \
-    cudnnStatus_t cudnnStat = cudnnFunc;                     \
-    CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat)                \
-        << "Cudnn Error: "                                   \
-        << dynload::cudnnGetErrorString(cudnnStat);          \
+#define CHECK_CUDNN(cudnnFunc)                                         \
+  do {                                                                 \
+    cudnnStatus_t cudnnStat = cudnnFunc;                               \
+    CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat)                          \
+        << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \
   } while (0)
 
 bool g_is_libcudnn_init = false;
 int g_cudnn_lib_version = 0;
 
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t*  cudnn_desc) {
-    CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
+void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
+  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
 }
 
-void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) {
-    size_t cudnn_dso_ver = dynload::cudnnGetVersion();
-    size_t cudnn_dso_major = cudnn_dso_ver / 1000;
-    size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
-
-    // Compare cudnn header version with that of cudnn.so.
-    CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
-          (cudnn_cuh_major == cudnn_dso_major))
-        << "[cudnn init] libcudnn v" << cudnn_dso_major <<
-        " with header v" << cudnn_cuh_major << " unmatched!\n"
-        << "PaddlePaddle Requirement: "
-        << "(header v[2-3] with libcudnn v[2-3]) Or "
-        << "(header v4 with libcudnn v4) Or "
-        << "(header v5 with libcudnn v5).";
-
-    CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
-        << "cudnn v5 requires cuda version >= 7.5";
-
-    CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
-    CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
-
-    g_is_libcudnn_init = true;
-    g_cudnn_lib_version = cudnn_dso_ver;
+void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
+  size_t cudnn_dso_ver = dynload::cudnnGetVersion();
+  size_t cudnn_dso_major = cudnn_dso_ver / 1000;
+  size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
+
+  // Compare cudnn header version with that of cudnn.so.
+  CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
+        (cudnn_cuh_major == cudnn_dso_major))
+      << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v"
+      << cudnn_cuh_major << " mismatched!\n"
+      << "PaddlePaddle Requirement: "
+      << "(header v[2-3] with libcudnn v[2-3]) Or "
+      << "(header v4 with libcudnn v4) Or "
+      << "(header v5 with libcudnn v5).";
+
+  CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
+      << "cudnn v5 requires cuda version >= 7.5";
+
+  CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
+  CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
+
+  g_is_libcudnn_init = true;
+  g_cudnn_lib_version = cudnn_dso_ver;
 }
 
-int hl_get_cudnn_lib_version() {
-  return g_cudnn_lib_version;
-}
+int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; }
 
 void hl_conv_workspace(hl_tensor_descriptor input,
                        hl_tensor_descriptor output,
@@ -204,99 +201,91 @@ void hl_conv_workspace(hl_tensor_descriptor input,
                        size_t* bwdFilterLimitBytes) {
 #if CUDNN_VERSION >= 4000
 
-    CHECK_NOTNULL(input);
-    CHECK_NOTNULL(output);
-    CHECK_NOTNULL(filter);
-    CHECK_NOTNULL(conv);
-
-    // Specify workspace limit directly
-    size_t memoryLimitBytes =
-        (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
-
-    // cudnn convolution forward configuration
-    cudnnTensorDescriptor_t       fwd_src_desc =
-                                        GET_TENSOR_DESCRIPTOR(input);
-    cudnnTensorDescriptor_t       fwd_dest_desc =
-                                        GET_TENSOR_DESCRIPTOR(output);
-    cudnnFilterDescriptor_t       fwd_filter_desc =
-                                        GET_FILTER_DESCRIPTOR(filter);
-    cudnnConvolutionDescriptor_t  fwd_conv_desc =
-                                        GET_CONVOLUTION_DESCRIPTOR(conv);
-
-    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
-             t_resource.cudnn_handle,
-             fwd_src_desc,
-             fwd_filter_desc,
-             fwd_conv_desc,
-             fwd_dest_desc,
-             CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-             memoryLimitBytes,
-             reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
-
-    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
-             t_resource.cudnn_handle,
-             fwd_src_desc,
-             fwd_filter_desc,
-             fwd_conv_desc,
-             fwd_dest_desc,
-             static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
-             fwdLimitBytes));
-
-    // cudnn convolution backward data configuration
-    cudnnFilterDescriptor_t       bwd_data_filter_desc =
-                                          GET_FILTER_DESCRIPTOR(filter);
-    cudnnTensorDescriptor_t       bwd_data_diff_desc =
-                                          GET_TENSOR_DESCRIPTOR(output);
-    cudnnTensorDescriptor_t       bwd_data_grad_desc =
-                                          GET_TENSOR_DESCRIPTOR(input);
-    cudnnConvolutionDescriptor_t  bwd_data_conv_desc =
-                                          GET_CONVOLUTION_DESCRIPTOR(conv);
-
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-        t_resource.cudnn_handle,
-        bwd_data_filter_desc,
-        bwd_data_diff_desc,
-        bwd_data_conv_desc,
-        bwd_data_grad_desc,
-        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-        t_resource.cudnn_handle,
-        bwd_data_filter_desc,
-        bwd_data_diff_desc,
-        bwd_data_conv_desc,
-        bwd_data_grad_desc,
-        static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
-        bwdDataLimitBytes));
-
-    // cudnn convolution backward filter configuration
-    cudnnTensorDescriptor_t       bwd_filter_src_desc =
-                                      GET_TENSOR_DESCRIPTOR(input);
-    cudnnTensorDescriptor_t       bwd_filter_diff_desc =
-                                      GET_TENSOR_DESCRIPTOR(output);
-    cudnnConvolutionDescriptor_t  bwd_filter_conv_desc =
-                                      GET_CONVOLUTION_DESCRIPTOR(conv);
-    cudnnFilterDescriptor_t       bwd_filter_grad_desc =
-                                      GET_FILTER_DESCRIPTOR(filter);
-
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-        t_resource.cudnn_handle,
-        bwd_filter_src_desc,
-        bwd_filter_diff_desc,
-        bwd_filter_conv_desc,
-        bwd_filter_grad_desc,
-        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-        t_resource.cudnn_handle, bwd_filter_src_desc,
-        bwd_filter_diff_desc, bwd_filter_conv_desc,
-        bwd_filter_grad_desc,
-        static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
-        bwdFilterLimitBytes));
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(filter);
+  CHECK_NOTNULL(conv);
+
+  // Specify workspace limit directly
+  size_t memoryLimitBytes =
+      (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+
+  // cudnn convolution forward configuration
+  cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+      t_resource.cudnn_handle,
+      fwd_src_desc,
+      fwd_filter_desc,
+      fwd_conv_desc,
+      fwd_dest_desc,
+      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+      memoryLimitBytes,
+      reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+
+  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
+      t_resource.cudnn_handle,
+      fwd_src_desc,
+      fwd_filter_desc,
+      fwd_conv_desc,
+      fwd_dest_desc,
+      static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
+      fwdLimitBytes));
+
+  // cudnn convolution backward data configuration
+  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+
+  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+      t_resource.cudnn_handle,
+      bwd_data_filter_desc,
+      bwd_data_diff_desc,
+      bwd_data_conv_desc,
+      bwd_data_grad_desc,
+      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+      memoryLimitBytes,
+      reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+
+  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+      t_resource.cudnn_handle,
+      bwd_data_filter_desc,
+      bwd_data_diff_desc,
+      bwd_data_conv_desc,
+      bwd_data_grad_desc,
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
+      bwdDataLimitBytes));
+
+  // cudnn convolution backward filter configuration
+  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+      t_resource.cudnn_handle,
+      bwd_filter_src_desc,
+      bwd_filter_diff_desc,
+      bwd_filter_conv_desc,
+      bwd_filter_grad_desc,
+      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+      memoryLimitBytes,
+      reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+
+  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      t_resource.cudnn_handle,
+      bwd_filter_src_desc,
+      bwd_filter_diff_desc,
+      bwd_filter_conv_desc,
+      bwd_filter_grad_desc,
+      static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
+      bwdFilterLimitBytes));
 
 #endif
 }
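
One detail worth spelling out in hl_conv_workspace above: the flag is given in
megabytes and (1LL << 20) is the number of bytes in one MiB, so the default of
4096 yields 4096 * 2^20 bytes = 4 GiB as the limit handed to cuDNN. A minimal
illustration of the conversion:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t limit_in_mb = 4096;                       // the flag's default
      int64_t limit_bytes = (1LL << 20) * limit_in_mb;  // MiB -> bytes
      std::printf("%lld bytes = %lld GiB\n", (long long)limit_bytes,
                  (long long)(limit_bytes >> 30));
      return 0;
    }
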
@@ -306,55 +295,54 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
                                  int feature_maps,
                                  int height,
                                  int width) {
-    CHECK_NOTNULL(image_desc);
+  CHECK_NOTNULL(image_desc);
 
-    cudnn_tensor_descriptor hl_desc =
-        (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
-    CHECK_NOTNULL(hl_desc);
+  cudnn_tensor_descriptor hl_desc =
+      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+  CHECK_NOTNULL(hl_desc);
 
 #ifndef PADDLE_TYPE_DOUBLE
-    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
-    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
 #endif
-    CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
-    CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
-                hl_desc->desc,
-                CUDNN_TENSOR_NCHW,
-                data_type,
-                batch_size,
-                feature_maps,
-                height,
-                width));
-
-    hl_desc->format = CUDNN_TENSOR_NCHW;
-    hl_desc->data_type = data_type;
-    hl_desc->batch_size = batch_size;
-    hl_desc->feature_maps = feature_maps;
-    hl_desc->height = height;
-    hl_desc->width = width;
-
-    *image_desc = (hl_tensor_descriptor)hl_desc;
+  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+
+  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc,
+                                                  CUDNN_TENSOR_NCHW,
+                                                  data_type,
+                                                  batch_size,
+                                                  feature_maps,
+                                                  height,
+                                                  width));
+
+  hl_desc->format = CUDNN_TENSOR_NCHW;
+  hl_desc->data_type = data_type;
+  hl_desc->batch_size = batch_size;
+  hl_desc->feature_maps = feature_maps;
+  hl_desc->height = height;
+  hl_desc->width = width;
+
+  *image_desc = (hl_tensor_descriptor)hl_desc;
 }
 
 void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
-    CHECK_NOTNULL(image_desc);
+  CHECK_NOTNULL(image_desc);
 
-    cudnn_tensor_descriptor hl_desc =
-        (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
-    CHECK_NOTNULL(hl_desc);
+  cudnn_tensor_descriptor hl_desc =
+      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+  CHECK_NOTNULL(hl_desc);
 
 #ifndef PADDLE_TYPE_DOUBLE
-    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
-    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
 #endif
-    CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
 
-    hl_desc->data_type = data_type;
+  hl_desc->data_type = data_type;
 
-    *image_desc = (hl_tensor_descriptor)hl_desc;
+  *image_desc = (hl_tensor_descriptor)hl_desc;
 }
 
 void hl_tensor_reshape(hl_tensor_descriptor image_desc,
@@ -362,19 +350,19 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
                        int feature_maps,
                        int height,
                        int width) {
-    const int stride_w = 1;
-    const int stride_h = width * stride_w;
-    const int stride_c = height * stride_h;
-    const int stride_n = feature_maps * stride_c;
-    return hl_tensor_reshape(image_desc,
-                             batch_size,
-                             feature_maps,
-                             height,
-                             width,
-                             stride_n,
-                             stride_c,
-                             stride_h,
-                             stride_w);
+  const int stride_w = 1;
+  const int stride_h = width * stride_w;
+  const int stride_c = height * stride_h;
+  const int stride_n = feature_maps * stride_c;
+  return hl_tensor_reshape(image_desc,
+                           batch_size,
+                           feature_maps,
+                           height,
+                           width,
+                           stride_n,
+                           stride_c,
+                           stride_h,
+                           stride_w);
 }
 
 void hl_tensor_reshape(hl_tensor_descriptor image_desc,
@@ -386,42 +374,41 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc,
                        int cStride,
                        int hStride,
                        int wStride) {
-    CHECK_NOTNULL(image_desc);
-
-    cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
-    CHECK_NOTNULL(hl_desc->desc);
-
-    CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
-                hl_desc->data_type,
-                batch_size,
-                feature_maps,
-                height,
-                width,
-                nStride,
-                cStride,
-                hStride,
-                wStride));
-
-    hl_desc->batch_size = batch_size;
-    hl_desc->feature_maps = feature_maps;
-    hl_desc->height = height;
-    hl_desc->width = width;
+  CHECK_NOTNULL(image_desc);
+
+  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+  CHECK_NOTNULL(hl_desc->desc);
+
+  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
+                                                    hl_desc->data_type,
+                                                    batch_size,
+                                                    feature_maps,
+                                                    height,
+                                                    width,
+                                                    nStride,
+                                                    cStride,
+                                                    hStride,
+                                                    wStride));
+
+  hl_desc->batch_size = batch_size;
+  hl_desc->feature_maps = feature_maps;
+  hl_desc->height = height;
+  hl_desc->width = width;
 }
 
 void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
-    CHECK_NOTNULL(image_desc);
+  CHECK_NOTNULL(image_desc);
 
-    cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
-    CHECK_NOTNULL(hl_desc->desc);
+  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+  CHECK_NOTNULL(hl_desc->desc);
 
-    CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
+  CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
 
-    hl_desc->desc = NULL;
+  hl_desc->desc = NULL;
 
-    free(image_desc);
+  free(image_desc);
 }
 
-
 void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
                                   hl_pooling_mode_t mode,
                                   int height,
@@ -430,63 +417,61 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
                                   int width_padding,
                                   int stride_height,
                                   int stride_width) {
-    cudnnPoolingMode_t cudnn_mode;
-    switch (mode) {
-        case HL_POOLING_MAX:
-            cudnn_mode = CUDNN_POOLING_MAX;
-            break;
-        case HL_POOLING_AVERAGE:
-            cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-            break;
-        case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
-            cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-            break;
-        default:
-            LOG(FATAL) << "parameter mode error";
-    }
-
-    CHECK_NOTNULL(pooling_desc);
-
-    cudnn_pooling_descriptor hl_pooling_desc =
-        (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
-    CHECK_NOTNULL(hl_pooling_desc);
-
-    CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
-
-    CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(
-                hl_pooling_desc->desc,
-                cudnn_mode,
+  cudnnPoolingMode_t cudnn_mode;
+  switch (mode) {
+    case HL_POOLING_MAX:
+      cudnn_mode = CUDNN_POOLING_MAX;
+      break;
+    case HL_POOLING_AVERAGE:
+      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+      break;
+    case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
+      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+      break;
+    default:
+      LOG(FATAL) << "parameter mode error";
+  }
+
+  CHECK_NOTNULL(pooling_desc);
+
+  cudnn_pooling_descriptor hl_pooling_desc =
+      (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
+  CHECK_NOTNULL(hl_pooling_desc);
+
+  CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
+
+  CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc,
+                                                   cudnn_mode,
 #if CUDNN_VERSION >= 5000
-                CUDNN_PROPAGATE_NAN,
+                                                   CUDNN_PROPAGATE_NAN,
 #endif
-                height,
-                width,
-                height_padding,
-                width_padding,
-                stride_height,
-                stride_width));
-
-    hl_pooling_desc->mode = cudnn_mode;
-    hl_pooling_desc->window_height = height;
-    hl_pooling_desc->window_width = width;
-    hl_pooling_desc->stride_height = stride_height;
-    hl_pooling_desc->stride_width = stride_width;
-
-    *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
+                                                   height,
+                                                   width,
+                                                   height_padding,
+                                                   width_padding,
+                                                   stride_height,
+                                                   stride_width));
+
+  hl_pooling_desc->mode = cudnn_mode;
+  hl_pooling_desc->window_height = height;
+  hl_pooling_desc->window_width = width;
+  hl_pooling_desc->stride_height = stride_height;
+  hl_pooling_desc->stride_width = stride_width;
+
+  *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
 }
 
 void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
-    CHECK_NOTNULL(pooling_desc);
+  CHECK_NOTNULL(pooling_desc);
 
-    cudnn_pooling_descriptor hl_pooling =
-        (cudnn_pooling_descriptor)pooling_desc;
+  cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
 
-    CHECK_NOTNULL(hl_pooling->desc);
-    CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
+  CHECK_NOTNULL(hl_pooling->desc);
+  CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
 
-    hl_pooling->desc = NULL;
+  hl_pooling->desc = NULL;
 
-    free(pooling_desc);
+  free(pooling_desc);
 }
 
 void hl_pooling_forward(hl_tensor_descriptor input,
@@ -494,31 +479,30 @@ void hl_pooling_forward(hl_tensor_descriptor input,
                         hl_tensor_descriptor output,
                         real* output_image,
                         hl_pooling_descriptor pooling) {
-    cudnnPoolingDescriptor_t    pooling_desc;
-    cudnnTensorDescriptor_t     input_desc;
-    cudnnTensorDescriptor_t     output_desc;
-
-    CHECK_NOTNULL(input);
-    CHECK_NOTNULL(output);
-    CHECK_NOTNULL(pooling);
-    CHECK_NOTNULL(input_image);
-    CHECK_NOTNULL(output_image);
-
-    real alpha = 1.0f;
-    real beta = 1.0f;
-    input_desc = ((cudnn_tensor_descriptor)input)->desc;
-    output_desc = ((cudnn_tensor_descriptor)output)->desc;
-    pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
-    CHECK_CUDNN(dynload::cudnnPoolingForward(
-                t_resource.cudnn_handle,
-                pooling_desc,
-                &alpha,
-                input_desc,
-                input_image,
-                &beta,
-                output_desc,
-                output_image));
-    CHECK_SYNC("hl_pooling_forward failed");
+  cudnnPoolingDescriptor_t pooling_desc;
+  cudnnTensorDescriptor_t input_desc;
+  cudnnTensorDescriptor_t output_desc;
+
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(pooling);
+  CHECK_NOTNULL(input_image);
+  CHECK_NOTNULL(output_image);
+
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  input_desc = ((cudnn_tensor_descriptor)input)->desc;
+  output_desc = ((cudnn_tensor_descriptor)output)->desc;
+  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+  CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle,
+                                           pooling_desc,
+                                           &alpha,
+                                           input_desc,
+                                           input_image,
+                                           &beta,
+                                           output_desc,
+                                           output_image));
+  CHECK_SYNC("hl_pooling_forward failed");
 }
 
 void hl_pooling_backward(hl_tensor_descriptor input,
@@ -528,90 +512,86 @@ void hl_pooling_backward(hl_tensor_descriptor input,
                          real* output_image,
                          real* output_image_grad,
                          hl_pooling_descriptor pooling) {
-    cudnnPoolingDescriptor_t    pooling_desc;
-    cudnnTensorDescriptor_t     input_desc;
-    cudnnTensorDescriptor_t     output_desc;
-
-    CHECK_NOTNULL(input);
-    CHECK_NOTNULL(output);
-    CHECK_NOTNULL(pooling);
-    CHECK_NOTNULL(input_image);
-    CHECK_NOTNULL(input_image_grad);
-    CHECK_NOTNULL(output_image);
-    CHECK_NOTNULL(output_image_grad);
-
-    real alpha = 1.0f;
-    real beta = 1.0f;
-    input_desc = ((cudnn_tensor_descriptor)input)->desc;
-    output_desc = ((cudnn_tensor_descriptor)output)->desc;
-    pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
-    CHECK_CUDNN(dynload::cudnnPoolingBackward(
-                t_resource.cudnn_handle,
-                pooling_desc,
-                &alpha,
-                output_desc,
-                output_image,
-                output_desc,
-                output_image_grad,
-                input_desc,
-                input_image,
-                &beta,
-                input_desc,
-                input_image_grad));
+  cudnnPoolingDescriptor_t pooling_desc;
+  cudnnTensorDescriptor_t input_desc;
+  cudnnTensorDescriptor_t output_desc;
+
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(pooling);
+  CHECK_NOTNULL(input_image);
+  CHECK_NOTNULL(input_image_grad);
+  CHECK_NOTNULL(output_image);
+  CHECK_NOTNULL(output_image_grad);
+
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  input_desc = ((cudnn_tensor_descriptor)input)->desc;
+  output_desc = ((cudnn_tensor_descriptor)output)->desc;
+  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+  CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle,
+                                            pooling_desc,
+                                            &alpha,
+                                            output_desc,
+                                            output_image,
+                                            output_desc,
+                                            output_image_grad,
+                                            input_desc,
+                                            input_image,
+                                            &beta,
+                                            input_desc,
+                                            input_image_grad));
   CHECK_SYNC("hl_pooling_backward failed");
 }
 
-
 void hl_create_filter_descriptor(hl_filter_descriptor* filter,
                                  int input_feature_maps,
                                  int output_feature_maps,
                                  int height,
                                  int width) {
-    CHECK_NOTNULL(filter);
+  CHECK_NOTNULL(filter);
 
-    cudnn_filter_descriptor hl_filter =
-        (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
-    CHECK_NOTNULL(hl_filter);
+  cudnn_filter_descriptor hl_filter =
+      (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
+  CHECK_NOTNULL(hl_filter);
 
-    CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
+  CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
 
 #ifndef PADDLE_TYPE_DOUBLE
-    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
-    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
 #endif
-    CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(
-             hl_filter->desc,
-             data_type,
+  CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc,
+                                                  data_type,
 #if CUDNN_VERSION >= 5000
-             CUDNN_TENSOR_NCHW,
+                                                  CUDNN_TENSOR_NCHW,
 #endif
-             output_feature_maps,
-             input_feature_maps,
-             height,
-             width));
-
-    hl_filter->data_type = data_type;
-    hl_filter->output_feature_maps = output_feature_maps;
-    hl_filter->input_feature_maps = input_feature_maps;
-    hl_filter->filter_height = height;
-    hl_filter->filter_width = width;
-
-    *filter = (hl_filter_descriptor)hl_filter;
+                                                  output_feature_maps,
+                                                  input_feature_maps,
+                                                  height,
+                                                  width));
+
+  hl_filter->data_type = data_type;
+  hl_filter->output_feature_maps = output_feature_maps;
+  hl_filter->input_feature_maps = input_feature_maps;
+  hl_filter->filter_height = height;
+  hl_filter->filter_width = width;
+
+  *filter = (hl_filter_descriptor)hl_filter;
 }
 
-
 void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
-    CHECK_NOTNULL(filter);
+  CHECK_NOTNULL(filter);
 
-    cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
-    CHECK_NOTNULL(hl_filter->desc);
+  cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
+  CHECK_NOTNULL(hl_filter->desc);
 
-    CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
+  CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
 
-    hl_filter->desc = NULL;
+  hl_filter->desc = NULL;
 
-    free(filter);
+  free(filter);
 }
 
 void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
@@ -621,36 +601,35 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                       int padding_width,
                                       int stride_height,
                                       int stride_width) {
-    CHECK_NOTNULL(conv);
-
-    cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)
-        malloc(sizeof(_cudnn_convolution_descriptor));
-
-    CHECK_NOTNULL(hl_conv);
-    CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
-
-    cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
-    CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
-                hl_conv->desc,
-                padding_height,
-                padding_width,
-                stride_height,
-                stride_width,
-                1,
-                1,
-                mode));
-
-    hl_conv->input_image = image;
-    hl_conv->filter = filter;
-    hl_conv->padding_height = padding_height;
-    hl_conv->padding_width = padding_width;
-    hl_conv->stride_height = stride_height;
-    hl_conv->stride_width = stride_width;
-    hl_conv->upscalex = 1;
-    hl_conv->upscaley = 1;
-    hl_conv->mode = mode;
-
-    *conv = (hl_convolution_descriptor)hl_conv;
+  CHECK_NOTNULL(conv);
+
+  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
+      sizeof(_cudnn_convolution_descriptor));
+
+  CHECK_NOTNULL(hl_conv);
+  CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
+
+  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       1,
+                                                       1,
+                                                       mode));
+
+  hl_conv->input_image = image;
+  hl_conv->filter = filter;
+  hl_conv->padding_height = padding_height;
+  hl_conv->padding_width = padding_width;
+  hl_conv->stride_height = stride_height;
+  hl_conv->stride_width = stride_width;
+  hl_conv->upscalex = 1;
+  hl_conv->upscaley = 1;
+  hl_conv->mode = mode;
+
+  *conv = (hl_convolution_descriptor)hl_conv;
 }
 
 void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
@@ -660,44 +639,43 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                      int padding_width,
                                      int stride_height,
                                      int stride_width) {
-    CHECK_NOTNULL(conv);
-    CHECK_NOTNULL(image);
-    CHECK_NOTNULL(filter);
-
-    cudnnConvolutionDescriptor_t  conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-    cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
-    CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(
-                conv_desc,
-                padding_height,
-                padding_width,
-                stride_height,
-                stride_width,
-                1,
-                1,
-                mode));
-
-    cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
-    hl_conv->input_image = image;
-    hl_conv->filter = filter;
-    hl_conv->padding_height = padding_height;
-    hl_conv->padding_width = padding_width;
-    hl_conv->stride_height = stride_height;
-    hl_conv->stride_width = stride_width;
-    hl_conv->upscalex = 1;
-    hl_conv->upscaley = 1;
-    hl_conv->mode = mode;
+  CHECK_NOTNULL(conv);
+  CHECK_NOTNULL(image);
+  CHECK_NOTNULL(filter);
+
+  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       1,
+                                                       1,
+                                                       mode));
+
+  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+  hl_conv->input_image = image;
+  hl_conv->filter = filter;
+  hl_conv->padding_height = padding_height;
+  hl_conv->padding_width = padding_width;
+  hl_conv->stride_height = stride_height;
+  hl_conv->stride_width = stride_width;
+  hl_conv->upscalex = 1;
+  hl_conv->upscaley = 1;
+  hl_conv->mode = mode;
 }
 
 void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
-    CHECK_NOTNULL(conv);
+  CHECK_NOTNULL(conv);
 
-    cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
-    CHECK_NOTNULL(hl_conv->desc);
+  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+  CHECK_NOTNULL(hl_conv->desc);
 
-    CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
-    hl_conv->desc = NULL;
+  CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
+  hl_conv->desc = NULL;
 
-    free(conv);
+  free(conv);
 }
 
 void hl_convolution_forward(hl_tensor_descriptor input,
@@ -710,33 +688,33 @@ void hl_convolution_forward(hl_tensor_descriptor input,
                             void* gpuWorkSpace,
                             size_t sizeInBytes,
                             int convFwdAlgo) {
-    CHECK_NOTNULL(input);
-    CHECK_NOTNULL(output);
-    CHECK_NOTNULL(filter);
-    CHECK_NOTNULL(conv);
-    CHECK_NOTNULL(input_data);
-    CHECK_NOTNULL(output_data);
-    CHECK_NOTNULL(filter_data);
-    cudnnTensorDescriptor_t       src_desc = GET_TENSOR_DESCRIPTOR(input);
-    cudnnTensorDescriptor_t       dest_desc = GET_TENSOR_DESCRIPTOR(output);
-    cudnnFilterDescriptor_t       filter_desc = GET_FILTER_DESCRIPTOR(filter);
-    cudnnConvolutionDescriptor_t  conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-    real alpha = 1.0f;
-    real beta = 1.0f;
-    CHECK_CUDNN(dynload::cudnnConvolutionForward(
-                t_resource.cudnn_handle,
-                &alpha,
-                src_desc,
-                input_data,
-                filter_desc,
-                filter_data,
-                conv_desc,
-                static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
-                gpuWorkSpace,
-                sizeInBytes,
-                &beta,
-                dest_desc,
-                output_data));
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(filter);
+  CHECK_NOTNULL(conv);
+  CHECK_NOTNULL(input_data);
+  CHECK_NOTNULL(output_data);
+  CHECK_NOTNULL(filter_data);
+  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  CHECK_CUDNN(dynload::cudnnConvolutionForward(
+      t_resource.cudnn_handle,
+      &alpha,
+      src_desc,
+      input_data,
+      filter_desc,
+      filter_data,
+      conv_desc,
+      static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
+      gpuWorkSpace,
+      sizeInBytes,
+      &beta,
+      dest_desc,
+      output_data));
   CHECK_SYNC("hl_convolution_forward failed");
 }
 
@@ -744,27 +722,26 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
                                      real* bias_data,
                                      hl_tensor_descriptor output,
                                      real* output_data) {
-    CHECK_NOTNULL(bias);
-    CHECK_NOTNULL(output);
-    CHECK_NOTNULL(bias_data);
-    CHECK_NOTNULL(output_data);
-
-    cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
-    cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
-    real alpha = 1.0f;
-    real beta = 1.0f;
-
-    CHECK_CUDNN(dynload::cudnnAddTensor(
-                t_resource.cudnn_handle,
+  CHECK_NOTNULL(bias);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(bias_data);
+  CHECK_NOTNULL(output_data);
+
+  cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+  real alpha = 1.0f;
+  real beta = 1.0f;
+
+  CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle,
 #if CUDNN_VERSION < 4000
-                CUDNN_ADD_SAME_C,
+                                      CUDNN_ADD_SAME_C,
 #endif
-                &alpha,
-                bias_desc,
-                bias_data,
-                &beta,
-                output_desc,
-                output_data));
+                                      &alpha,
+                                      bias_desc,
+                                      bias_data,
+                                      &beta,
+                                      output_desc,
+                                      output_data));
   CHECK_SYNC("hl_convolution_forward_add_bias failed");
 }
 
@@ -772,23 +749,22 @@ void hl_convolution_backward_bias(hl_tensor_descriptor bias,
                                   real* bias_grad_data,
                                   hl_tensor_descriptor output,
                                   real* output_grad_data) {
-    CHECK_NOTNULL(bias);
-    CHECK_NOTNULL(output);
-    CHECK_NOTNULL(bias_grad_data);
-    CHECK_NOTNULL(output_grad_data);
-
-    real alpha = 1.0f;
-    real beta = 1.0f;
-    cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-    cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
-    CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(
-                t_resource.cudnn_handle,
-                &alpha,
-                diff_desc,
-                output_grad_data,
-                &beta,
-                bias_desc,
-                bias_grad_data));
+  CHECK_NOTNULL(bias);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(bias_grad_data);
+  CHECK_NOTNULL(output_grad_data);
+
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+  CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle,
+                                                    &alpha,
+                                                    diff_desc,
+                                                    output_grad_data,
+                                                    &beta,
+                                                    bias_desc,
+                                                    bias_grad_data));
   CHECK_SYNC("hl_convolution_backward_bias failed");
 }
 
@@ -802,37 +778,37 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input,
                                     void* gpuWorkSpace,
                                     size_t sizeInBytes,
                                     int convBwdFilterAlgo) {
-    CHECK_NOTNULL(input);
-    CHECK_NOTNULL(output);
-    CHECK_NOTNULL(filter);
-    CHECK_NOTNULL(conv);
-    CHECK_NOTNULL(input_data);
-    CHECK_NOTNULL(output_grad_data);
-    CHECK_NOTNULL(filter_grad_data);
-
-    real alpha = 1.0f;
-    real beta = 1.0f;
-    cudnnTensorDescriptor_t       src_desc = GET_TENSOR_DESCRIPTOR(input);
-    cudnnTensorDescriptor_t       diff_desc = GET_TENSOR_DESCRIPTOR(output);
-    cudnnConvolutionDescriptor_t  conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-    cudnnFilterDescriptor_t       grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-    CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
-                t_resource.cudnn_handle,
-                &alpha,
-                src_desc,
-                input_data,
-                diff_desc,
-                output_grad_data,
-                conv_desc,
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(filter);
+  CHECK_NOTNULL(conv);
+  CHECK_NOTNULL(input_data);
+  CHECK_NOTNULL(output_grad_data);
+  CHECK_NOTNULL(filter_grad_data);
+
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+  CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
+      t_resource.cudnn_handle,
+      &alpha,
+      src_desc,
+      input_data,
+      diff_desc,
+      output_grad_data,
+      conv_desc,
 #if CUDNN_VERSION >= 4000
-                static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
-                gpuWorkSpace,
-                sizeInBytes,
+      static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
+      gpuWorkSpace,
+      sizeInBytes,
 #endif
-                &beta,
-                grad_desc,
-                filter_grad_data));
+      &beta,
+      grad_desc,
+      filter_grad_data));
   CHECK_SYNC("hl_convolution_backward_filter failed");
 }
 
@@ -846,119 +822,111 @@ void hl_convolution_backward_data(hl_tensor_descriptor input,
                                   void* gpuWorkSpace,
                                   size_t sizeInBytes,
                                   int convBwdDataAlgo) {
-    real alpha = 1.0f;
-    real beta = 1.0f;
-    cudnnFilterDescriptor_t       filter_desc = GET_FILTER_DESCRIPTOR(filter);
-    cudnnTensorDescriptor_t       diff_desc = GET_TENSOR_DESCRIPTOR(output);
-    cudnnTensorDescriptor_t       grad_desc = GET_TENSOR_DESCRIPTOR(input);
-    cudnnConvolutionDescriptor_t  conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
-    CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
-                t_resource.cudnn_handle,
-                &alpha,
-                filter_desc,
-                filter_data,
-                diff_desc,
-                output_grad_data,
-                conv_desc,
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+  CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
+      t_resource.cudnn_handle,
+      &alpha,
+      filter_desc,
+      filter_data,
+      diff_desc,
+      output_grad_data,
+      conv_desc,
 #if CUDNN_VERSION >= 4000
-                static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
-                gpuWorkSpace,
-                sizeInBytes,
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
+      gpuWorkSpace,
+      sizeInBytes,
 #endif
-                &beta,
-                grad_desc,
-                input_data_grad));
+      &beta,
+      grad_desc,
+      input_data_grad));
   CHECK_SYNC("hl_convolution_backward_data failed");
 }
 
-
-void hl_softmax_forward(real *input,
-                        real *output,
-                        int height,
-                        int width) {
+void hl_softmax_forward(real* input, real* output, int height, int width) {
 #ifndef PADDLE_TYPE_DOUBLE
-    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
-    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
 #endif
-    CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
-                t_resource.cudnn_desc,
-                CUDNN_TENSOR_NCHW,
-                data_type,
-                height,
-                width,
-                1,
-                1));
-
-    real alpha = 1.0f;
-    real beta = 0.0f;
-    CHECK_CUDNN(dynload::cudnnSoftmaxForward(
-                t_resource.cudnn_handle,
-                CUDNN_SOFTMAX_ACCURATE,
-                CUDNN_SOFTMAX_MODE_CHANNEL,
-                &alpha,
-                t_resource.cudnn_desc,
-                input,
-                &beta,
-                t_resource.cudnn_desc,
-                output));
+  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+                                                  CUDNN_TENSOR_NCHW,
+                                                  data_type,
+                                                  height,
+                                                  width,
+                                                  1,
+                                                  1));
+
+  real alpha = 1.0f;
+  real beta = 0.0f;
+  CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle,
+                                           CUDNN_SOFTMAX_ACCURATE,
+                                           CUDNN_SOFTMAX_MODE_CHANNEL,
+                                           &alpha,
+                                           t_resource.cudnn_desc,
+                                           input,
+                                           &beta,
+                                           t_resource.cudnn_desc,
+                                           output));
   CHECK_SYNC("hl_softmax_forward failed");
 }
 
-void hl_softmax_backward(real *output_value,
-                         real *output_grad,
+void hl_softmax_backward(real* output_value,
+                         real* output_grad,
                          int height,
                          int width) {
 #ifndef PADDLE_TYPE_DOUBLE
-    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
-    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
 #endif
-    CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(
-                t_resource.cudnn_desc,
-                CUDNN_TENSOR_NCHW,
-                data_type,
-                height,
-                width,
-                1,
-                1));
-
-    real alpha = 1.0f;
-    real beta = 0.0f;
-    CHECK_CUDNN(dynload::cudnnSoftmaxBackward(
-                t_resource.cudnn_handle,
-                CUDNN_SOFTMAX_ACCURATE,
-                CUDNN_SOFTMAX_MODE_CHANNEL,
-                &alpha,
-                t_resource.cudnn_desc,
-                output_value,
-                t_resource.cudnn_desc,
-                output_grad,
-                &beta,
-                t_resource.cudnn_desc,
-                output_grad));
+  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+                                                  CUDNN_TENSOR_NCHW,
+                                                  data_type,
+                                                  height,
+                                                  width,
+                                                  1,
+                                                  1));
+
+  real alpha = 1.0f;
+  real beta = 0.0f;
+  CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle,
+                                            CUDNN_SOFTMAX_ACCURATE,
+                                            CUDNN_SOFTMAX_MODE_CHANNEL,
+                                            &alpha,
+                                            t_resource.cudnn_desc,
+                                            output_value,
+                                            t_resource.cudnn_desc,
+                                            output_grad,
+                                            &beta,
+                                            t_resource.cudnn_desc,
+                                            output_grad));
   CHECK_SYNC("hl_softmax_backward failed");
 }
 
 void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                    real *input,
+                                    real* input,
                                     hl_tensor_descriptor outputDesc,
-                                    real *output,
+                                    real* output,
                                     hl_tensor_descriptor bnParamDesc,
-                                    real *scale,
-                                    real *bias,
+                                    real* scale,
+                                    real* bias,
                                     double factor,
-                                    real *runningMean,
-                                    real *runningInvVar,
+                                    real* runningMean,
+                                    real* runningInvVar,
                                     double epsilon,
-                                    real *savedMean,
-                                    real *savedVar) {
+                                    real* savedMean,
+                                    real* savedVar) {
 #if CUDNN_VERSION >= 4007
   if ((NULL != runningMean && NULL == runningInvVar) ||
       (NULL == runningMean && NULL != runningInvVar)) {
     LOG(FATAL) << "runningMean and runningInvVar can be NULL "
-              << "but only at the same time.";
+               << "but only at the same time.";
   }
   if ((NULL != savedMean && NULL == savedVar) ||
       (NULL == savedMean && NULL != savedVar)) {
@@ -972,10 +940,24 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
   real alpha = 1.0f;
   real beta = 1.0f;
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardTraining(
-              t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
-              input, yDesc, output, bnDesc, scale, bias, factor,
-              runningMean, runningInvVar, epsilon, savedMean, savedVar));
+  CHECK_CUDNN(
+      dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle,
+                                                      mode,
+                                                      &alpha,
+                                                      &beta,
+                                                      xDesc,
+                                                      input,
+                                                      yDesc,
+                                                      output,
+                                                      bnDesc,
+                                                      scale,
+                                                      bias,
+                                                      factor,
+                                                      runningMean,
+                                                      runningInvVar,
+                                                      epsilon,
+                                                      savedMean,
+                                                      savedVar));
 
   CHECK_SYNC("hl_batch_norm_forward_training failed");
 #else
@@ -985,15 +967,15 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
 }
 
 void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                    real *input,
-                                    hl_tensor_descriptor outputDesc,
-                                    real *output,
-                                    hl_tensor_descriptor bnParamDesc,
-                                    real *scale,
-                                    real *bias,
-                                    real *estimatedMean,
-                                    real *estimatedInvVar,
-                                    double epsilon) {
+                                     real* input,
+                                     hl_tensor_descriptor outputDesc,
+                                     real* output,
+                                     hl_tensor_descriptor bnParamDesc,
+                                     real* scale,
+                                     real* bias,
+                                     real* estimatedMean,
+                                     real* estimatedInvVar,
+                                     double epsilon) {
 #if CUDNN_VERSION >= 4007
   cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
   cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
@@ -1001,10 +983,21 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
   real alpha = 1.0f;
   real beta = 1.0f;
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardInference(
-              t_resource.cudnn_handle, mode, &alpha, &beta, xDesc,
-              input, yDesc, output, bnDesc, scale, bias,
-              estimatedMean, estimatedInvVar, epsilon));
+  CHECK_CUDNN(
+      dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
+                                                       mode,
+                                                       &alpha,
+                                                       &beta,
+                                                       xDesc,
+                                                       input,
+                                                       yDesc,
+                                                       output,
+                                                       bnDesc,
+                                                       scale,
+                                                       bias,
+                                                       estimatedMean,
+                                                       estimatedInvVar,
+                                                       epsilon));
 
   CHECK_SYNC("hl_batch_norm_forward_inference failed");
 #else
@@ -1014,18 +1007,18 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
 }
 
 void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                            real *input,
+                            real* input,
                             hl_tensor_descriptor outGradDesc,
-                            real *outGrad,
+                            real* outGrad,
                             hl_tensor_descriptor inGradDesc,
-                            real *inGrad,
+                            real* inGrad,
                             hl_tensor_descriptor dBnParamDesc,
-                            real *scale,
-                            real *scaleGrad,
-                            real *biasGrad,
+                            real* scale,
+                            real* scaleGrad,
+                            real* biasGrad,
                             double epsilon,
-                            real *savedMean,
-                            real *savedInvVar) {
+                            real* savedMean,
+                            real* savedInvVar) {
 #if CUDNN_VERSION >= 4007
   if ((NULL != savedMean && NULL == savedInvVar) ||
       (NULL == savedMean && NULL != savedInvVar)) {
@@ -1040,12 +1033,25 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
   real alpha = 1.0f;
   real beta = 1.0f;
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(
-              t_resource.cudnn_handle, mode, &alpha, &beta,
-              &alpha, &beta,
-              xDesc, input, dyDesc, outGrad, dxDesc, inGrad,
-              bnDesc, scale, scaleGrad, biasGrad, epsilon,
-              savedMean, savedInvVar));
+  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
+                                                       mode,
+                                                       &alpha,
+                                                       &beta,
+                                                       &alpha,
+                                                       &beta,
+                                                       xDesc,
+                                                       input,
+                                                       dyDesc,
+                                                       outGrad,
+                                                       dxDesc,
+                                                       inGrad,
+                                                       bnDesc,
+                                                       scale,
+                                                       scaleGrad,
+                                                       biasGrad,
+                                                       epsilon,
+                                                       savedMean,
+                                                       savedInvVar));
 
   CHECK_SYNC("hl_batch_norm_backward failed");
 #else
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index ca19f210c5c9d5151b01ce81a4f44663e2df97cc..745be35b56278ed2e0033d5fd2806320d3164d7c 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include 
 #include 
 #include 
@@ -27,7 +26,7 @@ limitations under the License. */
 namespace dynload {
 
 std::once_flag curand_dso_flag;
-void* curand_dso_handle = nullptr;
+void *curand_dso_handle = nullptr;
 
 /**
  * The following macro definition can generate structs
@@ -37,34 +36,35 @@ void* curand_dso_handle = nullptr;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                           \
-  struct DynLoad__##__name {                                       \
-    template <typename... Args>                                    \
-    curandStatus_t operator()(Args... args) {                      \
-       typedef curandStatus_t (*curandFunc)(Args...);              \
-       std::call_once(curand_dso_flag, GetCurandDsoHandle,         \
-                      &curand_dso_handle);                         \
-       void* p_##__name = dlsym(curand_dso_handle, #__name);       \
-       return reinterpret_cast<curandFunc>(p_##__name)(args...);   \
-    }                                                              \
-  } __name;  /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    curandStatus_t operator()(Args... args) {                                  \
+      typedef curandStatus_t (*curandFunc)(Args...);                           \
+      std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
+      void *p_##__name = dlsym(curand_dso_handle, #__name);                    \
+      return reinterpret_cast<curandFunc>(p_##__name)(args...);                \
+    }                                                                          \
+  } __name; /* struct DynLoad__##__name */
 #else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                           \
-  struct DynLoad__##__name {                                       \
-    template <typename... Args>                                    \
-    curandStatus_t operator()(Args... args) {                      \
-       return __name(args...);                                     \
-    }                                                              \
-  } __name;  /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name)      \
+  struct DynLoad__##__name {                  \
+    template <typename... Args>               \
+    curandStatus_t operator()(Args... args) { \
+      return __name(args...);                 \
+    }                                         \
+  } __name; /* struct DynLoad__##__name */
 #endif
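+// Both branches expose the same call syntax (e.g. dynload::curandSetStream):
+// with PADDLE_USE_DSO the wrapper resolves the symbol from the curand shared
+// library via dlsym on first use; without it, the call forwards directly to
+// the statically linked function.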
 
 /* include all needed curand functions in HPPL */
+// clang-format off
 #define CURAND_RAND_ROUTINE_EACH(__macro)    \
   __macro(curandCreateGenerator)             \
   __macro(curandSetStream)                   \
   __macro(curandSetPseudoRandomGeneratorSeed)\
   __macro(curandGenerateUniform)             \
   __macro(curandGenerateUniformDouble)
+// clang-format on
 
 CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
 
@@ -72,7 +72,7 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
 #undef DYNAMIC_LOAD_CURAND_WRAP
 
 std::once_flag cudart_dso_flag;
-void* cudart_dso_handle = nullptr;
+void *cudart_dso_handle = nullptr;
 
 /**
  * The following macro definition can generate structs
@@ -82,28 +82,28 @@ void* cudart_dso_handle = nullptr;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                            \
-  struct DynLoad__##__name {                                        \
-    template <typename... Args>                                     \
-    auto operator()(Args... args) -> decltype(__name(args...)) {    \
-      using cudart_func = decltype(__name(args...))(*)(Args...);    \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle,           \
-                     &cudart_dso_handle);                           \
-      void* p_##__name = dlsym(cudart_dso_handle, #__name);         \
-      return reinterpret_cast<cudart_func>(p_##__name)(args...);    \
-    }                                                               \
-  } __name;  /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      using cudart_func = decltype(__name(args...)) (*)(Args...);              \
+      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
+      return reinterpret_cast<cudart_func>(p_##__name)(args...);               \
+    }                                                                          \
+  } __name; /* struct DynLoad__##__name */
 #else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                            \
-  struct DynLoad__##__name {                                        \
-    template <typename... Args>                                     \
-    auto operator()(Args... args) -> decltype(__name(args...)) {    \
-      return __name(args...);                                       \
-    }                                                               \
-  } __name;  /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name)                         \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      return __name(args...);                                    \
+    }                                                            \
+  } __name; /* struct DynLoad__##__name */
 #endif
 
 /* include all needed cuda functions in HPPL */
+// clang-format off
 #define CUDA_ROUTINE_EACH(__macro)        \
   __macro(cudaMalloc)                     \
   __macro(cudaHostAlloc)                  \
@@ -134,57 +134,57 @@ void* cudart_dso_handle = nullptr;
   __macro(cudaFuncSetCacheConfig)         \
   __macro(cudaRuntimeGetVersion)          \
   __macro(cudaGetErrorString)
+// clang-format on
 
 CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
 
 #undef CUDA_ROUTINE_EACH
 #undef DYNAMIC_LOAD_CUDART_WRAP
 
-}  /* namespace dynload */
+} /* namespace dynload */
 
 /**
  * @brief   global resource.
  */
-int                     g_system_device_num = 0;    /* system device number */
-int                     device_num = 0;             /* use    device number */
-hl_device_prop          *g_device;                  /* device info table */
-__thread thread_device_resources *t_device;         /* device resources table */
+int g_system_device_num = 0;                /* system device number */
+int device_num = 0;                         /* use    device number */
+hl_device_prop *g_device;                   /* device info table */
+__thread thread_device_resources *t_device; /* device resources table */
 int g_cuda_lib_version = 0;
 
 /* number of global stream */
-#define  NUMBER_OF_GLOBAL_STREAM    (HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
 /* number of thread stream */
-#define  NUMBER_OF_THREAD_STREAM    (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
 /* sizeof of device memory */
-#define  HPPL_GPU_MEMORY_SIZE                (256*4)
+#define HPPL_GPU_MEMORY_SIZE (256 * 4)
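+/* 1KB (256 * 4 bytes); sizes the gpu_mem/cpu_mem buffers allocated in
+   hl_create_thread_resources below. */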
 
 /**
  * Check built-in cuda functions with glog; note that it **does not**
  * support the << operator for appending more detailed error info.
  */
-#define CHECK_CUDA(cudaFunc)                               \
-  do {                                                     \
-    cudaError_t cudaStat = cudaFunc;                       \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "      \
-        << dynload::cudaGetErrorString(cudaStat);          \
+#define CHECK_CUDA(cudaFunc)                                                  \
+  do {                                                                        \
+    cudaError_t cudaStat = cudaFunc;                                          \
+    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                         \
+                                    << dynload::cudaGetErrorString(cudaStat); \
   } while (0)
 
 /**
  * @brief   thread resource.
  */
-__thread _hl_thread_resource t_resource = {
-                                           {0},     /* stream */
-                                           0,       /* handle */
-                                           0,       /* gen */
-                                           0,       /* cudnn_handle */
-                                           0,       /* cudnn_desc */
-                                           NULL,    /* gen_mutex */
-                                           NULL,    /* gpu_mem */
-                                           NULL,    /* cpu_mem */
-                                           0,       /* event */
-                                           -1,      /* device */
-                                           0,       /* major */
-                                           false};  /* is_init */
+__thread _hl_thread_resource t_resource = {{0},    /* stream */
+                                           0,      /* handle */
+                                           0,      /* gen */
+                                           0,      /* cudnn_handle */
+                                           0,      /* cudnn_desc */
+                                           NULL,   /* gen_mutex */
+                                           NULL,   /* gpu_mem */
+                                           NULL,   /* cpu_mem */
+                                           0,      /* event */
+                                           -1,     /* device */
+                                           0,      /* major */
+                                           false}; /* is_init */
 
 __thread cudaStream_t default_stream = 0;
 __thread bool g_sync_flag = true;
@@ -198,9 +198,9 @@ inline pid_t gettid() {
   uint64_t tid;
   pthread_threadid_np(NULL, &tid);
 #else
-  #ifndef __NR_gettid
-  #define __NR_gettid 224
-  #endif
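+// Fallback for toolchains whose headers lack __NR_gettid; 224 is the gettid
+// syscall number on 32-bit x86.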
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
   pid_t tid = syscall(__NR_gettid);
 #endif
   CHECK_NE((int)tid, -1);
@@ -208,8 +208,7 @@ inline pid_t gettid() {
 }
 
 void hl_init(int device) {
-  CHECK(hl_start_flag)
-    << "[Init failed] hl_start() did not succeed.";
+  CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
 
   /* thread has been initialized */
   if (true == t_resource.is_init) {
@@ -220,16 +219,16 @@ void hl_init(int device) {
   /* create thread device resources */
   char *tmp;
   thread_device_resources device_res;
-  tmp = (char *)malloc(g_system_device_num*sizeof(thread_device_resources*) +
-                       device_num*sizeof(_thread_device_resources));
+  tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
+                       device_num * sizeof(_thread_device_resources));
   CHECK_NOTNULL(tmp);
-  t_device = (thread_device_resources*)tmp;
-  device_res = (thread_device_resources)((char*)tmp +
-               g_system_device_num*sizeof(thread_device_resources*));
-  memset(t_device, 0, g_system_device_num*sizeof(thread_device_resources*));
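+  /* One allocation serves both: a per-device pointer table at the head of
+     the block, followed by the _thread_device_resources structs the pointers
+     will reference. */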
+  t_device = (thread_device_resources *)tmp;
+  device_res = (thread_device_resources)(
+      (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
+  memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
 
-  char *tmp_stream = (char *)
-      malloc(device_num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
+                                    sizeof(cudaStream_t));
   CHECK_NOTNULL(tmp_stream);
 
   int num = 0;
@@ -239,8 +238,9 @@ void hl_init(int device) {
     }
 
     t_device[dev] = &device_res[num];
-    t_device[dev]->stream = (cudaStream_t*)(tmp_stream +
-        num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+    t_device[dev]->stream =
+        (cudaStream_t *)(tmp_stream +
+                         num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
 
     hl_create_thread_resources(dev, t_device[dev]);
     num++;
@@ -266,14 +266,14 @@ void hl_fini() {
     t_resource.stream[i] = 0;
   }
 
-  char* tmp = (char*)t_device;
-  char* tmp_stream = NULL;
+  char *tmp = (char *)t_device;
+  char *tmp_stream = NULL;
   for (int dev = 0; dev < g_system_device_num; dev++) {
     if (!t_device[dev]) {
       continue;
     }
     if (!tmp_stream) {
-        tmp_stream = (char*)t_device[dev]->stream;
+      tmp_stream = (char *)t_device[dev]->stream;
     }
     for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
       CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
@@ -290,9 +290,7 @@ void hl_fini() {
   t_resource.is_init = false;
 }
 
-int hl_get_device_count() {
-  return device_num;
-}
+int hl_get_device_count() { return device_num; }
 
 void hl_set_device(int device) {
   if (device == t_resource.device) {
@@ -300,7 +298,7 @@ void hl_set_device(int device) {
   }
 
   CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-    << "Device: " << device << " is not specified in startup.";
+      << "Device: " << device << " is not specified in startup.";
 
   CHECK_CUDA(dynload::cudaSetDevice(device));
 
@@ -312,11 +310,11 @@ void hl_set_device(int device) {
   if (true == t_resource.is_init) {
     for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
       t_resource.stream[i] =
-        t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
+          t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
     }
     t_resource.gpu_mem = t_device[device]->gpu_mem;
     t_resource.cpu_mem = t_device[device]->cpu_mem;
-    t_resource.event   = t_device[device]->mem_event;
+    t_resource.event = t_device[device]->mem_event;
   }
 
   t_resource.handle = g_device[device]->device_resources->handle;
@@ -334,11 +332,11 @@ int hl_get_device() {
   return device;
 }
 
-void* hl_malloc_device(size_t size) {
+void *hl_malloc_device(size_t size) {
   void *dest_d;
 
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaMalloc((void**)&dest_d, size));
+  CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
 
   return dest_d;
 }
@@ -348,15 +346,15 @@ void hl_free_mem_device(void *dest_d) {
 
   cudaError_t err = dynload::cudaFree(dest_d);
   CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
-    << hl_get_device_error_string();
+      << hl_get_device_error_string();
 }
 
-void* hl_malloc_host(size_t size) {
+void *hl_malloc_host(size_t size) {
   void *dest_h;
 
   CHECK(size) << __func__ << ": the size for host memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaHostAlloc(
-    (void**)&dest_h, size, cudaHostAllocDefault));
+  CHECK_CUDA(
+      dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
 
   return dest_h;
 }
@@ -366,7 +364,7 @@ void hl_free_mem_host(void *dest_h) {
 
   cudaError_t err = dynload::cudaFreeHost(dest_h);
   CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
-    << hl_get_device_error_string();
+      << hl_get_device_error_string();
 }
 
 void hl_memcpy(void *dst, void *src, size_t size) {
@@ -388,8 +386,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
   }
   CHECK_NOTNULL(src_h);
   CHECK_NOTNULL(dest_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size,
-             cudaMemcpyHostToDevice));
+  CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
 }
 
 void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -398,8 +395,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
   }
   CHECK_NOTNULL(dest_h);
   CHECK_NOTNULL(src_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size,
-             cudaMemcpyDeviceToHost));
+  CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
 }
 
 void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -408,8 +404,8 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
   }
   CHECK_NOTNULL(dest_d);
   CHECK_NOTNULL(src_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_d, size,
-             cudaMemcpyDeviceToDevice));
+  CHECK_CUDA(
+      dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
 }
 
 void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -423,8 +419,8 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
   CHECK_LT(stream, HPPL_STREAM_END);
   cu_stream = t_resource.stream[stream];
 
-  CHECK_CUDA(dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault,
-             cu_stream));
+  CHECK_CUDA(
+      dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
 }
 
 void hl_start() {
@@ -435,8 +431,8 @@ void hl_start() {
 
 bool hl_device_can_access_peer(int device, int peerDevice) {
   int canAccessPeer;
-  CHECK_CUDA(dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device,
-             peerDevice));
+  CHECK_CUDA(
+      dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
 
   if (canAccessPeer == 1) {
     return true;
@@ -478,33 +474,32 @@ void hl_create_global_resources(hl_device_prop device_prop) {
 
   /* create curand gen */
   CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen,
-           CURAND_RNG_PSEUDO_DEFAULT), CURAND_STATUS_SUCCESS)
-           << "[Start failed] Curand init failed.";
+                                          CURAND_RNG_PSEUDO_DEFAULT),
+           CURAND_STATUS_SUCCESS)
+      << "[Start failed] Curand init failed.";
 
-  CHECK_EQ(dynload::curandSetStream(device_res->gen,
-           device_res->stream[0]), CURAND_STATUS_SUCCESS)
-           << "[Start failed] Curand set stream failed!";
+  CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]),
+           CURAND_STATUS_SUCCESS)
+      << "[Start failed] Curand set stream failed!";
 
   /* create cudnn handle */
   hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]);
 
   int seed = gettid();
-  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
-           device_res->gen, seed+device), CURAND_STATUS_SUCCESS);
+  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen,
+                                                       seed + device),
+           CURAND_STATUS_SUCCESS);
 
-  device_res->gen_mutex =
-    (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t)));
+  device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
   pthread_mutex_init(device_res->gen_mutex, NULL);
 
   CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
 }
 
-int hl_get_cuda_version() {
-  return g_cuda_lib_version;
-}
+int hl_get_cuda_version() { return g_cuda_lib_version; }
 
 void hl_create_thread_resources(int device,
-  thread_device_resources device_res) {
+                                thread_device_resources device_res) {
   CHECK_CUDA(dynload::cudaSetDevice(device));
 
   /* create thread stream */
@@ -513,15 +508,15 @@ void hl_create_thread_resources(int device,
   }
 
   /* allocation device memory */
-  device_res->gpu_mem = (real*)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
+  device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
 
   /* allocation host memory */
-  device_res->cpu_mem = (real*)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
+  device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
 
   CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
 }
 
-void hl_specify_devices_start(int* device, int number) {
+void hl_specify_devices_start(int *device, int number) {
   if (hl_start_flag) return;
 
   /* 1. get the number of devices */
@@ -533,20 +528,19 @@ void hl_specify_devices_start(int* device, int number) {
 
   /* 2. check device & create device property table */
   CHECK_LE(number, g_system_device_num)
-    << "[Start failed] System does not have enough device. "
-    << "Device number: " << g_system_device_num
-    << "Input number: " << number;
+      << "[Start failed] System does not have enough devices. "
+      << "Device number: " << g_system_device_num << " Input number: " << number;
 
   char *tmp;
   hl_device_prop device_prop;
-  tmp = (char *)malloc(g_system_device_num*sizeof(hl_device_prop*) +
-                       number*sizeof(_hl_device_prop));
+  tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) +
+                       number * sizeof(_hl_device_prop));
   CHECK(tmp) << "[Start failed] System memory is not enough.";
 
-  g_device = (hl_device_prop*)tmp;
-  device_prop = (hl_device_prop)((char*)tmp +
-                g_system_device_num*sizeof(hl_device_prop*));
-  memset(g_device, 0, g_system_device_num*sizeof(hl_device_prop*));
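+  /* Same layout trick as in hl_init: the hl_device_prop pointer table sits
+     at the head of the block, with the _hl_device_prop entries behind it. */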
+  g_device = (hl_device_prop *)tmp;
+  device_prop = (hl_device_prop)(
+      (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
+  memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
   int num = 0;
   for (int i = 0; i < number; i++) {
     int dev;
@@ -557,13 +551,13 @@ void hl_specify_devices_start(int* device, int number) {
     }
 
     CHECK_LT(dev, g_system_device_num)
-      << "[Start failed] The specified device number is "
-      << "out of range. Max device number: " << g_system_device_num - 1
-      << " Specified devcie number: "<< dev;
+        << "[Start failed] The specified device number is "
+        << "out of range. Max device number: " << g_system_device_num - 1
+        << " Specified devcie number: " << dev;
 
     if (g_device[dev]) {
       /* Warning */
-      LOG(WARNING) <<"[Warning] Repeat specify device: " << dev;
+      LOG(WARNING) << "[Warning] Repeat specify device: " << dev;
       continue;
     }
 
@@ -574,11 +568,11 @@ void hl_specify_devices_start(int* device, int number) {
   device_num = num;
 
   /* 3.  create global device resources */
-  char *tmp_res = (char *)malloc(device_num*sizeof(_global_device_resources));
+  char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources));
   CHECK_NOTNULL(tmp_res);
 
-  char *tmp_stream =
-    (char *)malloc(device_num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM *
+                                    sizeof(cudaStream_t));
   CHECK_NOTNULL(tmp_stream);
 
   num = 0;
@@ -587,10 +581,11 @@ void hl_specify_devices_start(int* device, int number) {
       continue;
     }
 
-    g_device[i]->device_resources = (global_device_resources)(tmp_res +
-      num*sizeof(_global_device_resources));
-    g_device[i]->device_resources->stream = (cudaStream_t*)(tmp_stream +
-      num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t));
+    g_device[i]->device_resources = (global_device_resources)(
+        tmp_res + num * sizeof(_global_device_resources));
+    g_device[i]->device_resources->stream =
+        (cudaStream_t *)(tmp_stream +
+                         num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t));
 
     hl_create_global_resources(g_device[i]);
     num++;
@@ -600,9 +595,9 @@ void hl_specify_devices_start(int* device, int number) {
   hl_start_flag = true;
   /* set default device */
   if (device == NULL) {
-      hl_set_device(0);
+    hl_set_device(0);
   } else {
-      hl_set_device(device[0]);
+    hl_set_device(device[0]);
   }
 }
 
@@ -610,35 +605,31 @@ void hl_rand(real *dest_d, size_t num) {
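+  /* The curand generator is shared by all threads using this device, so
+     number generation is serialized with gen_mutex. */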
   pthread_mutex_lock(t_resource.gen_mutex);
   CHECK_EQ(
 #ifndef PADDLE_TYPE_DOUBLE
-  dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
+      dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
 #else
-  dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
+      dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
 #endif
-  CURAND_STATUS_SUCCESS);
+      CURAND_STATUS_SUCCESS);
   pthread_mutex_unlock(t_resource.gen_mutex);
   CHECK_SYNC("hl_rand failed");
 }
 
 void hl_srand(unsigned int seed) {
   pthread_mutex_lock(t_resource.gen_mutex);
-  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(
-           t_resource.gen, seed), CURAND_STATUS_SUCCESS);
+  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed),
+           CURAND_STATUS_SUCCESS);
   pthread_mutex_unlock(t_resource.gen_mutex);
 }
 
-void hl_set_sync_flag(bool flag) {
-  g_sync_flag = flag;
-}
+void hl_set_sync_flag(bool flag) { g_sync_flag = flag; }
 
-bool hl_get_sync_flag() {
-  return g_sync_flag;
-}
+bool hl_get_sync_flag() { return g_sync_flag; }
 
 void hl_stream_synchronize(hl_stream_t stream) {
   cudaStream_t cu_stream;
 
-  CHECK_LT(stream, HPPL_STREAM_END)
-    << __func__ <<": the parameter stream is error.";
+  CHECK_LT(stream, HPPL_STREAM_END) << __func__
+                                    << ": the parameter stream is invalid.";
 
   cu_stream = t_resource.stream[stream];
   CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
@@ -647,8 +638,8 @@ void hl_stream_synchronize(hl_stream_t stream) {
 void hl_create_event(hl_event_t *event) {
   CHECK_NOTNULL(event);
 
-  struct _hl_event_st* st_event =
-    (struct _hl_event_st*)malloc(sizeof(struct _hl_event_st));
+  struct _hl_event_st *st_event =
+      (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
 
   CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
 
@@ -660,8 +651,8 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
   CHECK_NOTNULL(start);
   CHECK_NOTNULL(end);
 
-  CHECK_CUDA(dynload::cudaEventElapsedTime(&time,
-             start->cu_event, end->cu_event));
+  CHECK_CUDA(
+      dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
   return time;
 }
 
@@ -669,24 +660,22 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
   cudaStream_t cu_stream;
 
   CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END)
-    << __func__ <<": the parameter stream is error.";
+  CHECK_LT(stream, HPPL_STREAM_END) << __func__
+                                    << ": the parameter stream is invalid.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaEventRecord(
-             event->cu_event, cu_stream));
+  CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
 }
 
 void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
   cudaStream_t cu_stream;
 
   CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END)
-    << __func__ <<": the parameter stream is error.";
+  CHECK_LT(stream, HPPL_STREAM_END) << __func__
+                                    << ": the parameter stream is invalid.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamWaitEvent(
-             cu_stream, event->cu_event, 0));
+  CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
 }
 
 void hl_destroy_event(hl_event_t event) {
@@ -705,15 +694,15 @@ void hl_event_synchronize(hl_event_t event) {
 void hl_get_device_name(char *name, int len, int device) {
   CHECK_NOTNULL(name);
   CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-    << "Device("<< device <<") is not specified in startup.";
+      << "Device(" << device << ") is not specified in startup.";
 
-  strncpy(name, g_device[device]->device_name , len);
+  strncpy(name, g_device[device]->device_name, len);
 }
 
 void hl_get_device_memory(size_t *mem_size, int device) {
   CHECK_NOTNULL(mem_size);
   CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-    << "Device("<< device <<") is not specified in startup.";
+      << "Device(" << device << ") is not specified in startup.";
 
   *mem_size = g_device[device]->device_mem;
 }
@@ -722,31 +711,26 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
   CHECK_NOTNULL(major);
   CHECK_NOTNULL(minor);
   CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-    << "Device("<< device << ") is not specified in startup.";
+      << "Device(" << device << ") is not specified in startup.";
 
   *major = g_device[device]->major;
   *minor = g_device[device]->minor;
 }
 
-int hl_get_device_last_error() {
-  return (int)dynload::cudaGetLastError();
-}
+int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
 
-const char* hl_get_device_error_string() {
+const char *hl_get_device_error_string() {
   cudaError_t err = dynload::cudaGetLastError();
   return dynload::cudaGetErrorString(err);
 }
 
-const char* hl_get_device_error_string(size_t err) {
+const char *hl_get_device_error_string(size_t err) {
   return dynload::cudaGetErrorString((cudaError_t)err);
 }
 
-void hl_device_synchronize() {
-  CHECK_CUDA(dynload::cudaDeviceSynchronize());
-}
+void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
 void hl_set_device_flags_block() {
-  CHECK_CUDA(dynload::cudaSetDeviceFlags(
-             cudaDeviceScheduleBlockingSync));
+  CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
 }
 
 bool hl_cuda_event_is_ready(hl_event_t event) {
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
index fe755b8c2606dffeeff2ea1549180ca8b134c251..ff6b830b7addc5c87af0d55070260c279a046a75 100644
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #ifdef PADDLE_USE_DSO
 
 #include 
@@ -29,26 +28,26 @@ limitations under the License. */
 namespace dynload {
 
 extern std::once_flag cudart_dso_flag;
-extern void* cudart_dso_handle;
+extern void *cudart_dso_handle;
 
 /**
  * The following macro definition can generate structs
  * (for each function) to dynamic load cuda routine
  * via operator overloading.
  **/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                    \
-  struct DynLoad__##__name {                                        \
-    template <typename... Args>                                     \
-    __type operator()(Args... args) {                               \
-      typedef __type (*cudartFunc)(Args...);                        \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle,           \
-                    &cudart_dso_handle);                            \
-      void* p_##__name = dlsym(cudart_dso_handle, #__name);         \
-      return reinterpret_cast<cudartFunc>(p_##__name)(args...);     \
-    }                                                               \
-  } __name;  /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                               \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                 \
+    __type operator()(Args... args) {                                          \
+      typedef __type (*cudartFunc)(Args...);                                   \
+      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
+      return reinterpret_cast<cudartFunc>(p_##__name)(args...);                 \
+    }                                                                          \
+  } __name; /* struct DynLoad__##__name */
 
 /* include all needed cuda functions in HPPL */
+// clang-format off
 #define CUDA_ROUTINE_EACH(__macro)          \
   __macro(cudaLaunch, cudaError_t)          \
   __macro(cudaSetupArgument, cudaError_t)   \
@@ -61,16 +60,17 @@ extern void* cudart_dso_handle;
   __macro(__cudaInitModule, char)           \
   __macro(__cudaRegisterTexture, void)      \
   __macro(__cudaRegisterSurface, void)
+// clang-format on
 
 CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
 
 #if CUDART_VERSION >= 7000
-  DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
+DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
 #endif
 
 #undef CUDA_ROUTINE_EACH
 
-}  /* namespace dynload */
+} /* namespace dynload */
 
 #if CUDART_VERSION >= 7000
 __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
@@ -79,12 +79,11 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
                                                 void **args,
                                                 size_t sharedMem,
                                                 cudaStream_t stream) {
-  return dynload::cudaLaunchKernel(func, gridDim, blockDim,
-                                   args, sharedMem, stream);
+  return dynload::cudaLaunchKernel(
+      func, gridDim, blockDim, args, sharedMem, stream);
 }
 #endif /* CUDART_VERSION >= 7000 */
 
-
 __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
   return dynload::cudaLaunch(func);
 }
@@ -99,13 +98,12 @@ __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
                                                  dim3 blockDim,
                                                  size_t sharedMem,
                                                  cudaStream_t stream) {
-  return dynload::cudaConfigureCall(gridDim, blockDim,
-                                    sharedMem, stream);
+  return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
 }
 
 extern "C" {
 
-void** CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
+void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
   return dynload::__cudaRegisterFatBinary(fatCubin);
 }
 
@@ -113,86 +111,87 @@ void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
   return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
 }
 
-void CUDARTAPI __cudaRegisterFunction(
-        void   **fatCubinHandle,
-  const char    *hostFun,
-        char    *deviceFun,
-  const char    *deviceName,
-        int      thread_limit,
-        uint3   *tid,
-        uint3   *bid,
-        dim3    *bDim,
-        dim3    *gDim,
-        int     *wSize
-) {
-  return dynload::__cudaRegisterFunction(
-                fatCubinHandle, hostFun, deviceFun, deviceName,
-                thread_limit, tid, bid, bDim, gDim, wSize);
+void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
+                                      const char *hostFun,
+                                      char *deviceFun,
+                                      const char *deviceName,
+                                      int thread_limit,
+                                      uint3 *tid,
+                                      uint3 *bid,
+                                      dim3 *bDim,
+                                      dim3 *gDim,
+                                      int *wSize) {
+  return dynload::__cudaRegisterFunction(fatCubinHandle,
+                                         hostFun,
+                                         deviceFun,
+                                         deviceName,
+                                         thread_limit,
+                                         tid,
+                                         bid,
+                                         bDim,
+                                         gDim,
+                                         wSize);
 }
 
-void CUDARTAPI __cudaRegisterVar(
-        void **fatCubinHandle,
-        char  *hostVar,
-        char  *deviceAddress,
-  const char  *deviceName,
-        int    ext,
-        int    size,
-        int    constant,
-        int    global
-) {
-  return dynload::__cudaRegisterVar(
-                fatCubinHandle, hostVar, deviceAddress,
-                deviceName, ext, size, constant, global);
+void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
+                                 char *hostVar,
+                                 char *deviceAddress,
+                                 const char *deviceName,
+                                 int ext,
+                                 int size,
+                                 int constant,
+                                 int global) {
+  return dynload::__cudaRegisterVar(fatCubinHandle,
+                                    hostVar,
+                                    deviceAddress,
+                                    deviceName,
+                                    ext,
+                                    size,
+                                    constant,
+                                    global);
 }
 
-
-
-extern void CUDARTAPI __cudaRegisterManagedVar(
-        void **fatCubinHandle,
-        void **hostVarPtrAddress,
-        char  *deviceAddress,
-  const char  *deviceName,
-        int    ext,
-        int    size,
-        int    constant,
-        int    global
-) {
-  return dynload::__cudaRegisterManagedVar(
-                fatCubinHandle, hostVarPtrAddress, deviceAddress,
-                deviceName, ext, size, constant, global);
+extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
+                                               void **hostVarPtrAddress,
+                                               char *deviceAddress,
+                                               const char *deviceName,
+                                               int ext,
+                                               int size,
+                                               int constant,
+                                               int global) {
+  return dynload::__cudaRegisterManagedVar(fatCubinHandle,
+                                           hostVarPtrAddress,
+                                           deviceAddress,
+                                           deviceName,
+                                           ext,
+                                           size,
+                                           constant,
+                                           global);
 }
 
-char CUDARTAPI __cudaInitModule(
-        void **fatCubinHandle
-) {
+char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
   return dynload::__cudaInitModule(fatCubinHandle);
 }
 
-void CUDARTAPI __cudaRegisterTexture(
-        void                    **fatCubinHandle,
-  const struct textureReference  *hostVar,
-  const void                    **deviceAddress,
-  const char                     *deviceName,
-        int                       dim,
-        int                       norm,
-        int                       ext
-) {
+void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
+                                     const struct textureReference *hostVar,
+                                     const void **deviceAddress,
+                                     const char *deviceName,
+                                     int dim,
+                                     int norm,
+                                     int ext) {
   return dynload::__cudaRegisterTexture(
-                fatCubinHandle, hostVar, deviceAddress,
-                deviceName, dim, norm, ext);
+      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
 }
 
-void CUDARTAPI __cudaRegisterSurface(
-        void                    **fatCubinHandle,
-  const struct surfaceReference  *hostVar,
-  const void                    **deviceAddress,
-  const char                     *deviceName,
-        int                       dim,
-        int                       ext
-) {
+void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
+                                     const struct surfaceReference *hostVar,
+                                     const void **deviceAddress,
+                                     const char *deviceName,
+                                     int dim,
+                                     int ext) {
   return dynload::__cudaRegisterSurface(
-                fatCubinHandle, hostVar, deviceAddress,
-                deviceName, dim, ext);
+      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
 }
 
 } /* extern "C" */
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index 5cb16cfbb372209a6cac83cdaace9afbf590e0fe..1a3ce08619fc3a5787576b30e9f4c13336990e74 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "hl_dso_loader.h"
-#include "paddle/utils/Logging.h"
 #include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"
 
-P_DEFINE_string(cudnn_dir, "",
+P_DEFINE_string(cudnn_dir,
+                "",
                 "Specify path for loading libcudnn.so. For instance, "
-                "/usr/local/cudnn/lib64. If empty [default], dlopen "
+                "/usr/local/cudnn/lib. If empty [default], dlopen "
                 "will search cudnn from LD_LIBRARY_PATH");
 
-P_DEFINE_string(cuda_dir, "",
+P_DEFINE_string(cuda_dir,
+                "",
                 "Specify path for loading cuda library, such as libcublas, "
                 "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
                 "libcudart can not be specified by cuda_dir, since some "
@@ -33,7 +34,6 @@ static inline std::string join(const std::string& part1,
                                const std::string& part2) {
   // directory separator
   const char sep = '/';
-
   if (!part2.empty() && part2.front() == sep) {
     return part2;
   }
@@ -47,100 +47,115 @@ static inline std::string join(const std::string& part1,
   return ret;
 }
 
-static inline void GetDsoHandleFromDefaultPath(
-        std::string& dso_path, void** dso_handle, int dynload_flags) {
-    VLOG(3) << "Try to find cuda library: " << dso_path
-            << " from default system path.";
-    // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
+                                               void** dso_handle,
+                                               int dynload_flags) {
+  VLOG(3) << "Try to find cuda library: " << dso_path
+          << " from default system path.";
+  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 by
+// System Integrity Protection (SIP); if dso_handle
+// is null, search the default package path on Mac OS.
+#if defined(__APPLE__) || defined(__OSX__)
+  if (nullptr == *dso_handle) {
+    dso_path = join("/usr/local/cuda/lib/", dso_path);
     *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-
-    // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
-    // bring System Integrity Projection (SIP), if dso_handle
-    // is null, search from default package path in Mac OS.
-    #if defined(__APPLE__) || defined(__OSX__)
     if (nullptr == *dso_handle) {
-        dso_path = join("/usr/local/cuda/lib/", dso_path);
-        *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-        if (nullptr == *dso_handle) {
-            if (dso_path == "libcudnn.dylib") {
-                LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT
-                << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "    // NOLINT
-                << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "    // NOLINT
-                << "/usr/local/cuda/lib/libcudnn*";
-            }
-        }
+      if (dso_path == "libcudnn.dylib") {
+        LOG(FATAL)
+            << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"  // NOLINT
+            << "For instance, sudo tar -xzf "
+               "cudnn-7.5-osx-x64-v5.0-ga.tgz -C "  // NOLINT
+            << "/usr/local \n sudo chmod a+r "
+               "/usr/local/cuda/include/cudnn.h "  // NOLINT
+            << "/usr/local/cuda/lib/libcudnn*";
+      }
     }
-    #endif
+  }
+#endif
 }
 
-static inline void GetDsoHandleFromSearchPath(
-        const std::string& search_root,
-        const std::string& dso_name,
-        void** dso_handle) {
-    int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
-    *dso_handle = nullptr;
-
-    std::string dlPath = dso_name;
-    if (search_root.empty()) {
-        GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
-    } else {
-        // search xxx.so from custom path
-        dlPath = join(search_root, dso_name);
-        *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
-        // if not found, search from default path
-        if (nullptr == dso_handle) {
-            LOG(WARNING) << "Failed to find cuda library: " << dlPath;
-            dlPath = dso_name;
-            GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
-        }
+static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
+                                              const std::string& dso_name,
+                                              void** dso_handle) {
+  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+  *dso_handle = nullptr;
+
+  std::string dlPath = dso_name;
+  if (search_root.empty()) {
+    GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+  } else {
+    // search xxx.so from custom path
+    dlPath = join(search_root, dso_name);
+    *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+    // if not found, search from default path
+    if (nullptr == *dso_handle) {
+      LOG(WARNING) << "Failed to find cuda library: " << dlPath;
+      dlPath = dso_name;
+      GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
     }
+  }
 
-    CHECK(nullptr != *dso_handle)
-      << "Failed to find cuda library: " << dlPath << std::endl
-      << "Please specify its path correctly using one of the following ways: \n"    // NOLINT
-
-      << "Method 1. set cuda and cudnn lib path at runtime. "
-      << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n" // NOLINT
-      << "For instance, issue command: paddle train --use_gpu=1 "
-      << "--cuda_dir=/usr/local/cuda/lib64 --cudnn_dir=/usr/local/cudnn/lib ...\n"  // NOLINT
-
-      << "Method 2. set environment variable LD_LIBRARY_PATH on Linux or "
-      << "DYLD_LIBRARY_PATH on Mac OS. \n"
-      << "For instance, issue command: export LD_LIBRARY_PATH=... \n"
-
-      << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
-      << "unless System Integrity Protection (SIP) is disabled. However, method 1 " // NOLINT
-      << "always work well.";
+  CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath
+                                << std::endl
+                                << "Please specify its path correctly using "
+                                   "one of the following ways: \n"  // NOLINT
+
+                                << "Method 1. set cuda and cudnn lib path at "
+                                   "runtime. "
+                                << "http://www.paddlepaddle.org/doc/ui/"
+                                   "cmd_argument/"
+                                   "argument_outline.html \n"  // NOLINT
+                                << "For instance, issue command: paddle train "
+                                   "--use_gpu=1 "
+                                << "--cuda_dir=/usr/local/cuda/lib64 "
+                                   "--cudnn_dir=/usr/local/cudnn/lib "
+                                   "...\n"  // NOLINT
+
+                                << "Method 2. set environment variable "
+                                   "LD_LIBRARY_PATH on Linux or "
+                                << "DYLD_LIBRARY_PATH on Mac OS. \n"
+                                << "For instance, issue command: export "
+                                   "LD_LIBRARY_PATH=... \n"
+
+                                << "Note: After Mac OS 10.11, using "
+                                   "DYLD_LIBRARY_PATH is impossible "
+                                << "unless System Integrity Protection (SIP) "
+                                   "is disabled. However, "
+                                   "method 1 "  // NOLINT
+                                << "always works well.";
 }
 
 void GetCublasDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
-    GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
 #else
-    GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
 #endif
 }
 
 void GetCudnnDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
-    GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
 #else
-    GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
 #endif
 }
 
 void GetCudartDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
-    GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
+  GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
 #else
-    GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
+  GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
 #endif
 }
 
 void GetCurandDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
-    GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
 #else
-    GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
 #endif
 }
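
`GetDsoHandleFromSearchPath` above encodes a simple fallback chain: try the user-supplied directory first, then fall back to the loader's default search (LD_LIBRARY_PATH and system paths). A minimal sketch of that chain using only POSIX `dlopen` (the glog checks and the Mac-specific branch are dropped; the function name is illustrative):

```cpp
#include <dlfcn.h>
#include <string>

// Try `root/name` first; if that fails (or root is empty), let dlopen
// search LD_LIBRARY_PATH and the system default paths on its own.
void* OpenWithFallback(const std::string& root, const std::string& name) {
  const int flags = RTLD_LAZY | RTLD_LOCAL;
  if (!root.empty()) {
    std::string full = root + "/" + name;
    if (void* h = dlopen(full.c_str(), flags)) return h;
  }
  return dlopen(name.c_str(), flags);  // default search path
}
```
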
diff --git a/paddle/cuda/src/hl_math.cc b/paddle/cuda/src/hl_math.cc
index 76d48c4a9b94d402cf84c57bd240e03a1a83b1a0..f4bf888bab4e92dd940714ef1b7aeee9242eb817 100644
--- a/paddle/cuda/src/hl_math.cc
+++ b/paddle/cuda/src/hl_math.cc
@@ -12,24 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "avx_mathfun.h"
 
 namespace hppl {
-__m256 exp(__m256 a) {
-  return exp256_ps(a);
-}
+__m256 exp(__m256 a) { return exp256_ps(a); }
 
-__m256 log(__m256 a) {
-  return log256_ps(a);
-}
+__m256 log(__m256 a) { return log256_ps(a); }
 
-__m256 sin(__m256 a) {
-  return sin256_ps(a);
-}
+__m256 sin(__m256 a) { return sin256_ps(a); }
 
-__m256 cos(__m256 a) {
-  return cos256_ps(a);
-}
+__m256 cos(__m256 a) { return cos256_ps(a); }
 
 }  // namespace hppl
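
The one-line wrappers in hl_math.cc exist so the rest of the code can call transcendental functions on AVX vectors without touching avx_mathfun.h directly. A small usage sketch, assuming an AVX-capable build (`-mavx`); `softplus8` is a hypothetical caller, not part of the patch:

```cpp
#include <immintrin.h>

namespace hppl {
__m256 exp(__m256 a);  // implemented above in terms of exp256_ps
__m256 log(__m256 a);  // implemented above in terms of log256_ps
}

// Elementwise softplus, log(1 + exp(x)), over 8 packed floats,
// built from the hppl wrappers shown above.
__m256 softplus8(__m256 x) {
  const __m256 one = _mm256_set1_ps(1.0f);
  return hppl::log(_mm256_add_ps(one, hppl::exp(x)));
}
```
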
diff --git a/paddle/cuda/src/hl_time.cc b/paddle/cuda/src/hl_time.cc
index adc88d60dd8d547cedcae5fd088b2fa581d8e5be..d52b2a1df07374f632def12eb52e10e10ca86028 100644
--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include 
 #include 
 #include 
@@ -21,8 +20,7 @@ limitations under the License. */
 using std::chrono::high_resolution_clock;
 
 int64_t getCurrentTimeStick() {
-    high_resolution_clock::time_point tp = high_resolution_clock::now();
-    high_resolution_clock::duration dtn = tp.time_since_epoch();
-    return dtn.count();
+  high_resolution_clock::time_point tp = high_resolution_clock::now();
+  high_resolution_clock::duration dtn = tp.time_since_epoch();
+  return dtn.count();
 }
-
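
`getCurrentTimeStick` returns the raw tick count of `high_resolution_clock` since its epoch; converting ticks into wall time needs the clock's period. A small sketch of how a caller might turn two tick readings into milliseconds (the helper name is illustrative):

```cpp
#include <chrono>
#include <cstdint>

using std::chrono::high_resolution_clock;

// Difference of two raw tick counts, converted to milliseconds using the
// clock's compile-time period (ticks are duration::rep units of
// duration::period).
double ticksToMs(int64_t startTicks, int64_t endTicks) {
  high_resolution_clock::duration d(endTicks - startTicks);
  return std::chrono::duration_cast<
             std::chrono::duration<double, std::milli>>(d)
      .count();
}
```
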
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 27eed75d4d76c351e381a3b71dc44a3254fb1a4d..f1bb94216c44b3e915f87a3ae49bdfd3ef812916 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -51,12 +51,14 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
  * @brief Macro for registering a derived activation class
  */
 #define END_DEFINE_ACTIVATION(ACTIVATION_NAME)                     \
-  };                                                               \
+  }                                                                \
+  ;                                                                \
   const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
       #ACTIVATION_NAME;                                            \
   static InitFunction __reg_activation__##ACTIVATION_NAME([] {     \
-    gActivationRegistrar.registerClass<                            \
-        ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(#ACTIVATION_NAME); \
+    gActivationRegistrar                                           \
+        .registerClass<ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(       \
+            #ACTIVATION_NAME);                                     \
   });
 
 /**
@@ -111,14 +113,22 @@ void backward(Argument& act) {
     outputG->softmaxBackward(*outputV);
   } else {
     SetDevice device(act.deviceId);
-    Matrix::resizeOrCreate(sftMaxDot_, outputG->getHeight(),
+    Matrix::resizeOrCreate(sftMaxDot_,
+                           outputG->getHeight(),
                            outputG->getWidth(),
-                           /* trans */ false, useGpu(act.deviceId));
-    Matrix::resizeOrCreate(sftMaxSum_, outputG->getHeight(), 1,
-                           /* trans */ false, useGpu(act.deviceId));
+                           /* trans */ false,
+                           useGpu(act.deviceId));
+    Matrix::resizeOrCreate(sftMaxSum_,
+                           outputG->getHeight(),
+                           1,
+                           /* trans */ false,
+                           useGpu(act.deviceId));
     if (!one_ || one_->getWidth() != outputG->getWidth()) {
-      Matrix::resizeOrCreate(one_, 1, outputG->getWidth(),
-                             /* trans */ false, useGpu(act.deviceId));
+      Matrix::resizeOrCreate(one_,
+                             1,
+                             outputG->getWidth(),
+                             /* trans */ false,
+                             useGpu(act.deviceId));
       one_->one();
     }
 
@@ -130,7 +140,6 @@ void backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(softmax)
 
-
 /**
  * @brief Sequence_softmax Activation
  * @note Softmax on all frames of one sequence.
@@ -146,10 +155,16 @@ void forward(Argument& act) {
   CHECK_EQ(act.value->getWidth(), 1UL);
 
   if (!argument_.value) {
-    argument_.value = Matrix::create(nullptr, /* height= */ 1, 1,
-                                     /* trans= */ false, useGpu(act.deviceId));
-    argument_.grad = Matrix::create(nullptr, /* height= */ 1, 1,
-                                    /* trans= */ false, useGpu(act.deviceId));
+    argument_.value = Matrix::create(nullptr,
+                                     /* height= */ 1,
+                                     1,
+                                     /* trans= */ false,
+                                     useGpu(act.deviceId));
+    argument_.grad = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    1,
+                                    /* trans= */ false,
+                                    useGpu(act.deviceId));
   }
 
   auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
@@ -267,8 +282,11 @@ END_DEFINE_ACTIVATION(softrelu)
 BEGIN_DEFINE_ACTIVATION(abs)
 void forward(Argument& act) {
   SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
-                         /* trans */ false, useGpu(act.deviceId));
+  Matrix::resizeOrCreate(act.in,
+                         act.value->getHeight(),
+                         act.value->getWidth(),
+                         /* trans */ false,
+                         useGpu(act.deviceId));
 
   act.in->copyFrom(*act.value);
   act.value->abs(*act.value);
@@ -286,8 +304,11 @@ END_DEFINE_ACTIVATION(abs)
 BEGIN_DEFINE_ACTIVATION(square)
 void forward(Argument& act) {
   SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
-                         /* trans */ false, useGpu(act.deviceId));
+  Matrix::resizeOrCreate(act.in,
+                         act.value->getHeight(),
+                         act.value->getWidth(),
+                         /* trans */ false,
+                         useGpu(act.deviceId));
 
   act.in->copyFrom(*act.value);
   act.value->square(*act.value);
@@ -317,8 +338,11 @@ END_DEFINE_ACTIVATION(exponential)
 BEGIN_DEFINE_ACTIVATION(log)
 void forward(Argument& act) {
   SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
-                         /* trans */ false, useGpu(act.deviceId));
+  Matrix::resizeOrCreate(act.in,
+                         act.value->getHeight(),
+                         act.value->getWidth(),
+                         /* trans */ false,
+                         useGpu(act.deviceId));
 
   act.in->copyFrom(*act.value);
   act.value->log(*act.value);
@@ -333,11 +357,9 @@ ActivationFunction* ActivationFunction::create(const std::string& type) {
 
 std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
   std::vector<std::string> types;
-  gActivationRegistrar.forEachType([&](const std::string& type) {
-      types.push_back(type);
-    });
+  gActivationRegistrar.forEachType(
+      [&](const std::string& type) { types.push_back(type); });
   return types;
 }
 
-
 }  // namespace paddle
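
The `BEGIN_DEFINE_ACTIVATION`/`END_DEFINE_ACTIVATION` macros reformatted above implement static self-registration: each activation defines a class plus a file-scope initializer whose lambda registers the class with `gActivationRegistrar` before `main` runs. A minimal, generic sketch of that pattern (the `Registrar`, `Activation`, and `Sigmoid` types here are illustrative stand-ins, not Paddle's `ClassRegistrar` or `InitFunction`):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <string>

struct Activation {
  virtual ~Activation() = default;
};

// A tiny registrar: maps a name to a factory for the concrete class.
struct Registrar {
  std::map<std::string, std::function<std::unique_ptr<Activation>()>> table;
  template <class T>
  void registerClass(const std::string& name) {
    table[name] = [] { return std::unique_ptr<Activation>(new T()); };
  }
};
static Registrar gRegistrar;

// Self-registration at static-initialization time, the same trick the
// END_DEFINE_ACTIVATION macro plays with InitFunction.
struct Sigmoid : Activation {};
static bool reg_sigmoid = [] {
  gRegistrar.registerClass<Sigmoid>("sigmoid");
  return true;
}();
```
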
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index c483372256c035e39bfdbcaa4193a1a2e7fd80b8..e9ed5c619ab5e4dd9c52c0dac24478c2a57aa1bf 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 #include 
 #include 
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 2cfb5a3a18c8a63d69bf0598eeee2807376340bc..e6cc4a246a8494d287f8638674f4ae213f38f657 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "DataProvider.h"
 
 #include "paddle/utils/Util.h"
@@ -57,7 +56,7 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
   }
 }
 
-DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
+DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
                            bool useGpu,
                            int64_t batchSize) {
   batchSize_ = batchSize;
@@ -155,7 +154,7 @@ void DoubleBuffer::startAsyncLoad() {
 }
 
 ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
-DataProvider::registrar_;
+    DataProvider::registrar_;
 
 DataProvider* DataProvider::create(const DataConfig& config,
                                    const ModelConfig& modelConfig,
@@ -182,7 +181,8 @@ int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
   for (int i = 0; i < config_.constant_slots_size(); ++i) {
     MemoryHandlePtr handle =
         constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
-    Matrix::resizeOrCreate(constantSlots[i], batchSize,
+    Matrix::resizeOrCreate(constantSlots[i],
+                           batchSize,
                            1,         // = width
                            false,     // = trans
                            useGpu_);  // = useGpu
@@ -216,7 +216,8 @@ void DataProvider::initAsyncLoader() {
 }
 
 SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
-                                               bool useGpu, bool withInfo)
+                                               bool useGpu,
+                                               bool withInfo)
     : DataProvider(config, useGpu) {
   /* initialize the size of a sample, and the buffer */
   sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
@@ -337,7 +338,8 @@ int64_t SimpleDataProviderBase::fillBuffer() {
   sampleNumInBuf_ =
       n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
                         hInputLabelBuf_->getData() + n,
-                        hInputInfoBuf_->getData() + n, bufferCapacity_ - n);
+                        hInputInfoBuf_->getData() + n,
+                        bufferCapacity_ - n);
 
   /* for stochastic gradient training */
   if (!skipShuffle_) {
@@ -357,11 +359,14 @@ SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
 
 SimpleDataProvider::~SimpleDataProvider() {}
 
-int64_t SimpleDataProvider::fillBufferImp(real* data, int* label, int* info,
+int64_t SimpleDataProvider::fillBufferImp(real* data,
+                                          int* label,
+                                          int* info,
                                           int64_t size) {
   (void)info;
   int64_t n = std::min(labels_.size() - currentSampleIndex_, size);
-  memcpy(data, &data_[currentSampleIndex_ * sampleDim_],
+  memcpy(data,
+         &data_[currentSampleIndex_ * sampleDim_],
          n * sampleDim_ * sizeof(real));
   memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
   currentSampleIndex_ += n;
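
The `DoubleBuffer` touched above overlaps batch preparation with training: a loader thread fills recycled batch buffers while the consumer drains finished ones through two queues. A condensed sketch of that two-queue shape (`BlockingQueue` is a stand-in for Paddle's `Queue`, and `Batch` is a placeholder):

```cpp
#include <condition_variable>
#include <mutex>
#include <queue>

// Minimal blocking queue, standing in for Paddle's Queue<T>.
template <class T>
class BlockingQueue {
public:
  void enqueue(T v) {
    {
      std::lock_guard<std::mutex> g(m_);
      q_.push(std::move(v));
    }
    cv_.notify_one();
  }
  T dequeue() {
    std::unique_lock<std::mutex> l(m_);
    cv_.wait(l, [this] { return !q_.empty(); });
    T v = std::move(q_.front());
    q_.pop();
    return v;
  }

private:
  std::mutex m_;
  std::queue<T> q_;
  std::condition_variable cv_;
};

struct Batch { /* batch contents elided */ };

// Double-buffer shape: the loader pops an empty buffer, fills it, and
// pushes it to `fullQueue`; the trainer pops from `fullQueue` and, once
// done with the batch, recycles the buffer back to `emptyQueue`.
BlockingQueue<Batch*> emptyQueue;
BlockingQueue<Batch*> fullQueue;
```
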
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 112e45de1cb232097ed63b120d5ac631b37952e9..8b7fb27f821a47d830413eced79b3352a6969c90 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 
 #include 
@@ -44,15 +43,15 @@ namespace paddle {
  * @brief Macro for registering a data provider. The class type should contain
 *        a constructor with parameter (DataConfig, bool).
  */
-#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\
-  static InitFunction __reg_type_##__type_name([]() {\
-  DataProvider::registrar_.registerClass(\
-  #__type_name, \
-  [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
-    DataProvider* dp = new __class_name (conf, useGpu);\
-    return dp;\
-  });\
-})
+#define REGISTER_DATA_PROVIDER(__type_name, __class_name)                \
+  static InitFunction __reg_type_##__type_name([]() {                    \
+    DataProvider::registrar_.registerClass(                              \
+        #__type_name,                                                    \
+        [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
+          DataProvider* dp = new __class_name(conf, useGpu);             \
+          return dp;                                                     \
+        });                                                              \
+  })
 
 /**
  * @def REGISTER_DATA_PROVIDER_EX
@@ -61,8 +60,8 @@ namespace paddle {
  */
 #define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name)            \
   static InitFunction __reg_type_##__type_name([] {                     \
-  DataProvider::registrar_.registerClass<__class_name>(#__type_name);   \
-})
+    DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
+  })
 
 class DataBatch;
 class BufferBatch;
@@ -181,7 +180,8 @@ public:
    * @param[in]  size    DataBatch.getSize()
    * @param[in]  dataId  sub dataprovider id (in MultiDataProvider)
    */
-  void appendArguments(const std::vector<Argument>& argus, int size,
+  void appendArguments(const std::vector<Argument>& argus,
+                       int size,
                        int dataId) {
     size_ += size;
     for (const auto& argu : argus) {
@@ -259,9 +259,7 @@ typedef Queue<BufferBatch*> BufferBatchQueue;
 
 class DoubleBuffer {
 public:
-  DoubleBuffer(DataProvider* dataPool,
-               bool useGpu,
-               int64_t batchSize = 0);
+  DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
   virtual ~DoubleBuffer();
   void removeOneBatch(DataBatch* dataBatch);
 
@@ -310,7 +308,7 @@ public:
   /**
    * @brief create only used for unittest.
    */
-  inline static DataProvider* create(const DataConfig &config,
+  inline static DataProvider* create(const DataConfig& config,
                                      bool useGpu = FLAGS_use_gpu) {
     return create(config, ModelConfig(), useGpu);
   }
@@ -462,7 +460,9 @@ protected:
    *
    * label[n] is the label for the n-th sample.
    */
-  virtual int64_t fillBufferImp(real* data, int* label, int* info,
+  virtual int64_t fillBufferImp(real* data,
+                                int* label,
+                                int* info,
                                 int64_t size) = 0;
 };
 
@@ -475,7 +475,9 @@ public:
 protected:
   void loadData(const std::string& fileName);
   void loadDataFile(const std::string& fileName);
-  virtual int64_t fillBufferImp(real* data, int* label, int* info,
+  virtual int64_t fillBufferImp(real* data,
+                                int* label,
+                                int* info,
                                 int64_t size);
 
 protected:
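
`REGISTER_DATA_PROVIDER` above needs a lambda because provider constructors take `(DataConfig, bool)` while the registrar stores factories of shape `(DataConfig, ModelConfig, bool)`; the lambda adapts one signature to the other. A sketch of that adaptation in isolation (the `Factory` map and `SimpleProvider` are illustrative stand-ins, not Paddle's registrar):

```cpp
#include <functional>
#include <map>
#include <string>

struct DataConfig {};
struct ModelConfig {};
struct DataProvider {
  DataProvider(const DataConfig&, bool) {}
  virtual ~DataProvider() = default;
};

// Factory signature the registrar stores: (DataConfig, ModelConfig, bool).
using Factory = std::function<DataProvider*(DataConfig, ModelConfig, bool)>;
static std::map<std::string, Factory> gFactories;

struct SimpleProvider : DataProvider {
  SimpleProvider(const DataConfig& c, bool useGpu) : DataProvider(c, useGpu) {}
};

// What REGISTER_DATA_PROVIDER(simple, SimpleProvider) expands to, in
// spirit: the lambda drops the unused ModelConfig and forwards the rest
// to the two-argument constructor.
static bool reg_simple = [] {
  gFactories["simple"] = [](DataConfig conf, ModelConfig, bool useGpu)
      -> DataProvider* { return new SimpleProvider(conf, useGpu); };
  return true;
}();
```
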
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index 0689f90f3e7dd3d3e1df19f3958c821d53e69700..6c178e29ee714a6bd7f58861d7cf15716fee848d 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 
 #include "DataProvider.h"
@@ -65,8 +64,8 @@ void DataProviderGroup<T>::reset() {
   provider_ = nullptr;
 
   // shuffle file list
-  std::shuffle(fileList_.begin(), fileList_.end(),
-      ThreadLocalRandomEngine::get());
+  std::shuffle(
+      fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
 
   startLoader();
   DataProvider::reset();
@@ -113,8 +112,9 @@ void DataProviderGroup<T>::startLoader() {
     size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
     std::vector<std::string> fileVec(fileList_.begin() + startPos,
                                      fileList_.begin() + endPos);
-    loader_->addJob([this, fileVec]()
-                        -> ProviderPtrType { return this->loadFile(fileVec); });
+    loader_->addJob([this, fileVec]() -> ProviderPtrType {
+      return this->loadFile(fileVec);
+    });
   }
   loader_->stopAddJob();
 }
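
The `reset()` above reshuffles the file list with `std::shuffle` and a thread-local engine so concurrent providers neither contend on nor correlate through shared RNG state. A minimal sketch of the same idea with a standard engine (Paddle's `ThreadLocalRandomEngine` is not reproduced here):

```cpp
#include <algorithm>
#include <random>
#include <string>
#include <vector>

// Shuffle the file list with a per-thread Mersenne Twister, seeded once
// per thread from the system entropy source.
void shuffleFiles(std::vector<std::string>& files) {
  thread_local std::mt19937 engine{std::random_device{}()};
  std::shuffle(files.begin(), files.end(), engine);
}
```
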
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp
index 8e4f53978a0451f3bb6cd5da30f017708448f9ac..51fb1f26668c55dc1c2aecd5389f327e2569a52f 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "paddle/utils/Util.h"
 #include "MultiDataProvider.h"
 #include "paddle/utils/Logging.h"
@@ -59,10 +58,8 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config,
                    "MultiDataProvider";
       subConfig.set_async_load_data(false);
     }
-    subDataProviders_[i] =
-        std::unique_ptr<DataProvider>(DataProvider::create(subConfig,
-                                                           modelConfig,
-                                                           useGpu_));
+    subDataProviders_[i] = std::unique_ptr<DataProvider>(
+        DataProvider::create(subConfig, modelConfig, useGpu_));
   }
 }
 
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
index b498ba6516c4320566b1b3cc2bd557ae016d7c39..876467c04f074cf37e48fdfa9b24f236fcfe8ba1 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ b/paddle/gserver/dataproviders/MultiDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 
 #include "DataProvider.h"
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index 344644755f24045443b8cb3ebd08004a4b1cdcb5..0a7ff802461f2ded0e6e842c088bddf218361f79 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "ProtoDataProvider.h"
 #include "paddle/utils/Util.h"
 #include "paddle/utils/StringUtil.h"
@@ -23,7 +22,8 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "DataProviderGroup.h"
 
-P_DEFINE_double(memory_threshold_on_load_data, 1.0,
+P_DEFINE_double(memory_threshold_on_load_data,
+                1.0,
                 "stop loading data when memory is not sufficient");
 
 namespace paddle {
@@ -32,7 +32,8 @@ REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
 REGISTER_DATA_PROVIDER(proto_sequence_group,
                        DataProviderGroup<ProtoSequenceDataProvider>);
 
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config, bool useGpu,
+ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
+                                     bool useGpu,
                                      bool loadDataAll)
     : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
   if (loadDataAll) {
@@ -279,7 +280,8 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
         }
         slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
         const unsigned int* ids = sample.vector_slots(i).ids().data();
-        memcpy(slot.sparseNonValueData.data() + slot.indices.back(), ids,
+        memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
+               ids,
                sizeof(*ids) * slotSize);
         slot.indices.push_back(slot.indices.back() + slotSize);
         if (subSlotSize) {
@@ -318,10 +320,11 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) {
         slot.varDenseData[oldSize].data.resize(varDim);
         const float* values = sample.vector_slots(i).values().data();
 #ifdef PADDLE_TYPE_DOUBLE
-        std::copy(values, values + varDim,
-                  slot.varDenseData[oldSize].data.data());
+        std::copy(
+            values, values + varDim, slot.varDenseData[oldSize].data.data());
 #else
-        memcpy(slot.varDenseData[oldSize].data.data(), values,
+        memcpy(slot.varDenseData[oldSize].data.data(),
+               values,
                sizeof(real) * varDim);
 #endif
         slot.varDenseData[oldSize].dims.resize(
@@ -374,8 +377,9 @@ void ProtoDataProvider::reset() {
 }
 
 void ProtoDataProvider::shuffle() {
-  std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
-      ThreadLocalRandomEngine::get());
+  std::shuffle(shuffledSequenceIds_.begin(),
+               shuffledSequenceIds_.end(),
+               ThreadLocalRandomEngine::get());
 }
 
 /*
@@ -502,7 +506,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
 
   if (!iidData()) {
     ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
-                                   numSequences + 1, /* useGpu= */ false);
+                                  numSequences + 1,
+                                  /* useGpu= */ false);
     int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
     int pos = 0;
     int i = 0;
@@ -530,7 +535,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
 
     switch (slotType) {
       case SlotDef::VECTOR_DENSE: {
-        Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+        Matrix::resizeOrCreate(cpuArguments[slot].value,
+                               size,
+                               dim,
                                false,   // trans = false
                                false);  // useGpu = false
         real* buf = cpuArguments[slot].value->getData();
@@ -543,19 +550,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
       }
       case SlotDef::VECTOR_SPARSE_NON_VALUE: {
         if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value = Matrix::createSparseMatrix(
-              size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE, SPARSE_CSR,
-              false, useGpu_);
+          cpuArguments[slot].value =
+              Matrix::createSparseMatrix(size,
+                                         dim,
+                                         size /*DEFAULT_AVG_WIDTH = 1*/,
+                                         NO_VALUE,
+                                         SPARSE_CSR,
+                                         false,
+                                         useGpu_);
         }
         auto mat = cpuArguments[slot].value;
         mat->resize(size, dim);
         if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
           std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
-                         slots_[slot].sparseNonValueData.data(), HPPL_STREAM_1);
+              ->copyFrom(dataPos.data(),
+                         slots_[slot].indices.data(),
+                         slots_[slot].sparseNonValueData.data(),
+                         HPPL_STREAM_1);
         } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
           std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+              ->copyFrom(dataPos.data(),
+                         slots_[slot].indices.data(),
                          slots_[slot].sparseNonValueData.data());
         } else {
           LOG(FATAL) << "Not Supported";
@@ -571,19 +586,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
       }
       case SlotDef::VECTOR_SPARSE_VALUE: {
         if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value = Matrix::createSparseMatrix(
-              size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, FLOAT_VALUE,
-              SPARSE_CSR, false, useGpu_);
+          cpuArguments[slot].value =
+              Matrix::createSparseMatrix(size,
+                                         dim,
+                                         size /*DEFAULT_AVG_WIDTH = 1*/,
+                                         FLOAT_VALUE,
+                                         SPARSE_CSR,
+                                         false,
+                                         useGpu_);
         }
         auto mat = cpuArguments[slot].value;
         mat->resize(size, dim);
         if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(), slots_[slot].indices.data(),
-              slots_[slot].sparseFloatValueData.data(), HPPL_STREAM_1);
+          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
+              ->copyFrom(dataPos.data(),
+                         slots_[slot].indices.data(),
+                         slots_[slot].sparseFloatValueData.data(),
+                         HPPL_STREAM_1);
         } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
           std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(), slots_[slot].indices.data(),
+              ->copyFrom(dataPos.data(),
+                         slots_[slot].indices.data(),
                          slots_[slot].sparseFloatValueData.data());
         } else {
           LOG(FATAL) << "Not Supported";
@@ -591,7 +614,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
         break;
       }
       case SlotDef::INDEX: {
-        IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+        IVector::resizeOrCreate(cpuArguments[slot].ids,
+                                size,
                                 /*  useGpu= */ false);
         int* buf = cpuArguments[slot].ids->getData();
         for (int i = 0; i < size; ++i) {
@@ -621,7 +645,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
         if (oldWidth < height) {
           totalDim = width * height * depth;
         }
-        Matrix::resizeOrCreate(cpuArguments[slot].value, size, totalDim,
+        Matrix::resizeOrCreate(cpuArguments[slot].value,
+                               size,
+                               totalDim,
                                false,   // trans = false
                                false);  // useGpu = false
         real* buf = cpuArguments[slot].value->getData();
@@ -637,13 +663,13 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
             }
           }
         } else {
-          memcpy(buf, slots_[slot].varDenseData[dataPos[0]].data.data(),
+          memcpy(buf,
+                 slots_[slot].varDenseData[dataPos[0]].data.data(),
                  sizeof(real) * totalDim);
         }
-        ICpuGpuVector::resizeOrCreate(
-            cpuArguments[slot].sequenceStartPositions,
-            size + 1, /* size == 1 currently */
-            /* useGpu= */ false);
+        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+                                      size + 1, /* size == 1 currently */
+                                      /* useGpu= */ false);
         int* bufStarts =
             cpuArguments[slot].sequenceStartPositions->getMutableData(false);
         bufStarts[0] = 0;
@@ -653,16 +679,17 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
       case SlotDef::VAR_MDIM_INDEX: {
         CHECK_EQ(size, 1);
         size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
-        IVector::resizeOrCreate(cpuArguments[slot].ids, totalDim,
+        IVector::resizeOrCreate(cpuArguments[slot].ids,
+                                totalDim,
                                 /*  useGpu= */ false);
         int* buf = cpuArguments[slot].ids->getData();
-        memcpy(buf, slots_[slot].varIndices[dataPos[0]].data(),
+        memcpy(buf,
+               slots_[slot].varIndices[dataPos[0]].data(),
                sizeof(int) * totalDim);
 
-        ICpuGpuVector::resizeOrCreate(
-            cpuArguments[slot].sequenceStartPositions,
-            size + 1, /* size == 1 currently */
-            /* useGpu= */ false);
+        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+                                      size + 1, /* size == 1 currently */
+                                      /* useGpu= */ false);
         int* bufStarts =
             cpuArguments[slot].sequenceStartPositions->getMutableData(false);
         bufStarts[0] = 0;
@@ -700,8 +727,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
         gpuArguments[i].sequenceStartPositions =
             cpuArguments[i].sequenceStartPositions;
       } else {
-        gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
-                                          HPPL_STREAM_1);
+        gpuArguments[i].resizeAndCopyFrom(
+            cpuArguments[i], useGpu_, HPPL_STREAM_1);
       }
     }
     hl_stream_synchronize(HPPL_STREAM_1);
@@ -746,10 +773,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
     sampleLoop(op, size);
 
     // current slot: sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        cpuArguments[slot].sequenceStartPositions,
-        size + 1,
-        /* useGpu= */ false);
+    ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
+                                  size + 1,
+                                  /* useGpu= */ false);
 
     switch (slotType) {
       case SlotDef::VECTOR_SPARSE_VALUE:
@@ -821,10 +847,10 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
           };
           int subSize = subSampleLoop(op, size, slot);
           ICpuGpuVector::resizeOrCreate(
-              cpuArguments[slot].subSequenceStartPositions, subSize + 1,
-              false);
+              cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
           int* currPosOfArgumentSubSeqStart =
-            cpuArguments[slot].subSequenceStartPositions->getMutableData(false);
+              cpuArguments[slot].subSequenceStartPositions->getMutableData(
+                  false);
           int64_t* subSeqs = dataSubPos.data();
           int64_t* subIndexs = slots_[slot].subIndices.data();
           int allSubSequenceLength = 0;
@@ -849,7 +875,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
       }
       case SlotDef::INDEX: {
         // label slot
-        IVector::resizeOrCreate(cpuArguments[slot].ids, size,
+        IVector::resizeOrCreate(cpuArguments[slot].ids,
+                                size,
                                 /* useGpu= */ false);
         // fill labels
         int* buf = cpuArguments[slot].ids->getData();
@@ -863,7 +890,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
       case SlotDef::VECTOR_DENSE: {
         // copy values
         size_t dim = header_.slot_defs(slot).dim();
-        Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim,
+        Matrix::resizeOrCreate(cpuArguments[slot].value,
+                               size,
+                               dim,
                                false,   // trans = false
                                false);  // useGpu = false
         real* buf = cpuArguments[slot].value->getData();
@@ -887,8 +916,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
     gpuArguments.resize(cpuArguments.size());
     gpuBatch.setSize(size);
     for (size_t i = 0; i < cpuArguments.size(); ++i) {
-      gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
-                                        HPPL_STREAM_1);
+      gpuArguments[i].resizeAndCopyFrom(
+          cpuArguments[i], useGpu_, HPPL_STREAM_1);
     }
     hl_stream_synchronize(HPPL_STREAM_1);
     *batch = gpuBatch;
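
The sparse slots handled above are stored CSR-style: `indices` holds one row-start offset per sample plus a final end offset, and `sparseNonValueData` holds the concatenated column ids. A tiny sketch of reading one sample's row out of that layout (the field names mirror the ProtoSlot members shown above, but the struct itself is a stand-in):

```cpp
#include <cstdio>
#include <vector>

struct SparseSlot {
  std::vector<int> indices;                      // row offsets, size = rows + 1
  std::vector<unsigned int> sparseNonValueData;  // concatenated column ids
};

// Print the non-zero column ids of sample `row` in a CSR slot: the row's
// ids occupy sparseNonValueData[indices[row] .. indices[row + 1]).
void printRow(const SparseSlot& slot, int row) {
  for (int k = slot.indices[row]; k < slot.indices[row + 1]; ++k) {
    std::printf("%u ", slot.sparseNonValueData[k]);
  }
  std::printf("\n");
}
```
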
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
index 846dd7673abe8b836be1b728bb690daa0e8acc20..ffdcc8fdc977f53e29dc9f03fa3cf7af56acb92f 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 
 #include 
@@ -48,7 +47,8 @@ namespace paddle {
  */
 class ProtoDataProvider : public DataProvider {
 public:
-  ProtoDataProvider(const DataConfig& config, bool useGpu,
+  ProtoDataProvider(const DataConfig& config,
+                    bool useGpu,
                     bool loadDataAll = true);
   virtual void reset();
 
@@ -161,14 +161,16 @@ protected:
 };
 
 /**
- * @brief Special use for Proto data: instances should contain sparse-non-value slots
+ * @brief Special use for Proto data: instances should contain sparse-non-value
+ * slots
  * and label.
  *
  * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
  */
 class ProtoSequenceDataProvider : public ProtoDataProvider {
 public:
-  ProtoSequenceDataProvider(const DataConfig& config, bool useGpu,
+  ProtoSequenceDataProvider(const DataConfig& config,
+                            bool useGpu,
                             bool loadDataAll = true);
   ~ProtoSequenceDataProvider() {}
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
index 3b1eb7e9ef03c42df31c6efc9f0e0240d64e78df..b8fca3cd7f3c5efaea35dc8e09f7ca0ec250830f 100644
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 
 #include 
@@ -138,7 +137,8 @@ protected:
    *
    * @note this code depends on protobuf 2.4.0. There is nothing like
    * CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many
-   * bytes has the object readed so far. Therefore, we calculated bytes ourselves.
+   * bytes the object has read so far. Therefore, we calculate the bytes
+   * ourselves.
    */
   int approximateReadedBytes_;
 };
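
The comment above notes that protobuf 2.5.0 offers no `CodedInputStream::CurrentPosition()`, so the reader tallies consumed bytes itself. The arithmetic is simple if each record is a length-prefixed message; a sketch under that assumption (this framing is illustrative, not necessarily Paddle's exact wire layout):

```cpp
#include <cstdint>

// Running total of bytes consumed from a length-prefixed record stream.
// Each record costs its 4-byte length prefix plus the payload itself.
struct ByteCounter {
  int64_t approximateReadBytes = 0;

  // Call once per record, after reading the length prefix and
  // `payloadLen` bytes of message payload.
  void onRecord(uint32_t payloadLen) {
    approximateReadBytes += sizeof(uint32_t) + payloadLen;
  }
};
```
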
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index 1332c0ab635b6ebec05f25fd77b9703b39227bc1..bee6ca14a2ec3995a3b432fc5a39419a5dd8a8ce 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "PyDataProvider.h"
 #include "paddle/utils/PythonUtil.h"
 #include 
 #include "paddle/utils/Util.h"
 #include "paddle/utils/Excepts.h"
 
-
 namespace paddle {
 
 #ifndef PADDLE_NO_PYTHON
 REGISTER_DATA_PROVIDER(py, PyDataProvider);
 #endif
 
-PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
+PyDataProvider::PyDataProvider(const DataConfig& config,
+                               bool useGpu,
                                bool loadDataAll)
     : DataProvider(config, useGpu), batchSize_(0) {
   PyGuard guard;
@@ -50,8 +49,8 @@ void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
   classInstance_ =
       createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
   CHECK(classInstance_) << "Create class instance failed.";
-  PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
-                                      const_cast<char*>("getHeader"), NULL));
+  PyObjectPtr obj(PyObject_CallMethod(
+      classInstance_.get(), const_cast<char*>("getHeader"), NULL));
   CHECK_PY(obj) << "Call function getHeader failed.";
   std::string headerInfo =
       std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -90,7 +89,8 @@ void PyDataProvider::resetSlots() {
   }
 }
 
-void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillDenseSlot(ProtoSlot& slot,
+                                   char*& data,
                                    const char* dataEnd) {
   unsigned int dim = slot.dim;
  slot.sampleNum = readT<unsigned int>(data, dataEnd);
@@ -102,14 +102,17 @@ void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data,
  float* dat = reinterpret_cast<float*>(data);
   std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin());
 #else
-  memcpyWithCheck(slot.denseData.data(), data,
-                  sizeof(real) * dim * slot.sampleNum, dataEnd);
+  memcpyWithCheck(slot.denseData.data(),
+                  data,
+                  sizeof(real) * dim * slot.sampleNum,
+                  dataEnd);
 #endif
  // PyDataProvider always provides data in float
   data += sizeof(float) * dim * slot.sampleNum;
 }
 
-void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot,
+                                            char*& data,
                                             const char* dataEnd) {
  slot.sampleNum = readT<unsigned int>(data, dataEnd);
   unsigned int* indexPtr = (unsigned int*)data;
@@ -121,12 +124,15 @@ void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
  length = readT<unsigned int>(data, dataEnd);
   slot.indices.push_back(length);
   slot.sparseNonValueData.resize(length);
-  memcpyWithCheck(slot.sparseNonValueData.data(), data,
-                  sizeof(unsigned int) * length, dataEnd);
+  memcpyWithCheck(slot.sparseNonValueData.data(),
+                  data,
+                  sizeof(unsigned int) * length,
+                  dataEnd);
   data += sizeof(unsigned int) * length;
 }
 
-void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot,
+                                         char*& data,
                                          const char* dataEnd) {
  slot.sampleNum = readT<unsigned int>(data, dataEnd);
   unsigned int* indexPtr = (unsigned int*)data;
@@ -153,7 +159,8 @@ void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data,
   }
 }
 
-void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillIndexSlot(ProtoSlot& slot,
+                                   char*& data,
                                    const char* dataEnd) {
  slot.sampleNum = readT<unsigned int>(data, dataEnd);
   CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
@@ -163,7 +170,8 @@ void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data,
   data += sizeof(unsigned int) * slot.sampleNum;
 }
 
-void PyDataProvider::fillStringSlot(ProtoSlot& slot, char*& data,
+void PyDataProvider::fillStringSlot(ProtoSlot& slot,
+                                    char*& data,
                                     const char* dataEnd) {
  slot.sampleNum = readT<unsigned int>(data, dataEnd);
   for (unsigned int i = 0; i < slot.sampleNum; ++i) {
@@ -225,9 +233,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
       }
       for (size_t i = 0; i < sequenceNum; ++i) {
         size_t begin = slot.sequenceStartPositions[i];
-        size_t end = (i < sequenceNum - 1)
-                         ? slot.sequenceStartPositions[i + 1]
-                         : slot.sampleNum;
+        size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1]
+                                           : slot.sampleNum;
         for (size_t ii = begin; ii < end; ++ii) {
           slot.sampleSequenceIdVec.push_back(ii);
         }
@@ -255,8 +262,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) {
 void PyDataProvider::reset() {
   {  // Invoke PyDataProvider Reset
     PyGuard guard;
-    PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
-                                        const_cast<char*>("reset"), NULL));
+    PyObjectPtr obj(PyObject_CallMethod(
+        classInstance_.get(), const_cast<char*>("reset"), NULL));
     CHECK_PY(obj) << "Call function reset failed.";
   }
 
@@ -270,15 +277,18 @@ void PyDataProvider::reset() {
 void PyDataProvider::shuffle() {
   // py shuffle
   PyGuard guard;
-  PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
-                                      const_cast<char*>("shuffle"), NULL));
+  PyObjectPtr obj(PyObject_CallMethod(
+      classInstance_.get(), const_cast<char*>("shuffle"), NULL));
   CHECK_PY(obj) << "Call function shuffle failed.";
 }
 
-void PyDataProvider::handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleDenseSlot(ProtoSlot& slot,
+                                     size_t slotIndex,
                                      std::vector& cpuArguments) {
   unsigned int dim = slot.dim;
-  Matrix::resizeOrCreate(cpuArguments[slotIndex].value, slot.sampleNum, dim,
+  Matrix::resizeOrCreate(cpuArguments[slotIndex].value,
+                         slot.sampleNum,
+                         dim,
                          false,   // trans = false
                          false);  // useGpu = false
   real* buf = cpuArguments[slotIndex].value->getData();
@@ -294,19 +304,27 @@ void PyDataProvider::handleSparseNonValueSlot(
     ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
   unsigned int dim = slot.dim;
   if (!(cpuArguments[slotIndex].value)) {
-    cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
-        slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE,
-        SPARSE_CSR, false, useGpu_);
+    cpuArguments[slotIndex].value =
+        Matrix::createSparseMatrix(slot.sampleNum,
+                                   dim,
+                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+                                   NO_VALUE,
+                                   SPARSE_CSR,
+                                   false,
+                                   useGpu_);
   }
   auto mat = cpuArguments[slotIndex].value;
   mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
   if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
     std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
-                   slot.sparseNonValueData.data(), HPPL_STREAM_1);
+        ->copyFrom(slot.sampleSequenceIdVec.data(),
+                   slot.indices.data(),
+                   slot.sparseNonValueData.data(),
+                   HPPL_STREAM_1);
   } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
     std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+        ->copyFrom(slot.sampleSequenceIdVec.data(),
+                   slot.indices.data(),
                    slot.sparseNonValueData.data());
   } else {
     LOG(FATAL) << "Not Supported";
@@ -317,28 +335,38 @@ void PyDataProvider::handleSparseValueSlot(
     ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
   unsigned int dim = slot.dim;
   if (!(cpuArguments[slotIndex].value)) {
-    cpuArguments[slotIndex].value = Matrix::createSparseMatrix(
-        slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
-        FLOAT_VALUE, SPARSE_CSR, false, useGpu_);
+    cpuArguments[slotIndex].value =
+        Matrix::createSparseMatrix(slot.sampleNum,
+                                   dim,
+                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+                                   FLOAT_VALUE,
+                                   SPARSE_CSR,
+                                   false,
+                                   useGpu_);
   }
   auto mat = cpuArguments[slotIndex].value;
   mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
   if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
     std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
-                   slot.sparseFloatValueData.data(), HPPL_STREAM_DEFAULT);
+        ->copyFrom(slot.sampleSequenceIdVec.data(),
+                   slot.indices.data(),
+                   slot.sparseFloatValueData.data(),
+                   HPPL_STREAM_DEFAULT);
   } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
     std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(),
+        ->copyFrom(slot.sampleSequenceIdVec.data(),
+                   slot.indices.data(),
                    slot.sparseFloatValueData.data());
   } else {
     LOG(FATAL) << "Not Supported";
   }
 }
 
-void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleIndexSlot(ProtoSlot& slot,
+                                     size_t slotIndex,
                                      std::vector& cpuArguments) {
-  IVector::resizeOrCreate(cpuArguments[slotIndex].ids, slot.sampleNum,
+  IVector::resizeOrCreate(cpuArguments[slotIndex].ids,
+                          slot.sampleNum,
                           /*useGpu_*/ false);
   int* buf = cpuArguments[slotIndex].ids->getData();
   for (size_t i = 0; i < slot.sampleNum; ++i) {
@@ -346,7 +374,8 @@ void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
   }
 }
 
-void PyDataProvider::handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+void PyDataProvider::handleStringSlot(ProtoSlot& slot,
+                                      size_t slotIndex,
                                       std::vector& cpuArguments) {
   if (cpuArguments[slotIndex].strs) {
     cpuArguments[slotIndex].strs->resize(slot.sampleNum);
@@ -364,7 +393,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
   PyGuard guard;
   PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
                                       const_cast("getNextBatch"),
-                                      const_cast("i"), size));
+                                      const_cast("i"),
+                                      size));
   CHECK_PY(obj) << "Call function getNextBatch failed.";
   const std::string& samples =
       std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
@@ -381,23 +411,24 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
   if (!iidData()) {
     for (size_t j = 0; j < slotNum_; ++j) {
       auto& slot = slots_[j];
-      ICpuGpuVector::resizeOrCreate(
-          cpuArguments[j].sequenceStartPositions,
-          slot.sequenceNum + 1, /* useGpu= */ false);
+      ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions,
+                                    slot.sequenceNum + 1,
+                                    /* useGpu= */ false);
       int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false);
       std::copy(slot.sequenceStartPositions.begin(),
-                slot.sequenceStartPositions.end(), buf);
+                slot.sequenceStartPositions.end(),
+                buf);
       buf[slot.sequenceStartPositions.size()] = slot.sampleNum;
 
       if (slot.subSequenceStartPositions.size()) {
-        ICpuGpuVector::resizeOrCreate(
-            cpuArguments[j].subSequenceStartPositions,
-            slot.subSequenceNum + 1,
-            /*  useGpu= */ false);
+        ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions,
+                                      slot.subSequenceNum + 1,
+                                      /*  useGpu= */ false);
         int* buf =
-           cpuArguments[j].subSequenceStartPositions->getMutableData(false);
+            cpuArguments[j].subSequenceStartPositions->getMutableData(false);
         std::copy(slot.subSequenceStartPositions.begin(),
-                  slot.subSequenceStartPositions.end(), buf);
+                  slot.subSequenceStartPositions.end(),
+                  buf);
         buf[slot.subSequenceNum] = slot.sampleNum;
         // check subSequenceStartPositions and sequenceStartPositions
         cpuArguments[j].checkSubset();
@@ -452,8 +483,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
               cpuArguments[i].subSequenceStartPositions;
         }
       } else {
-        gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
-                                          HPPL_STREAM_1);
+        gpuArguments[i].resizeAndCopyFrom(
+            cpuArguments[i], useGpu_, HPPL_STREAM_1);
       }
     }
     hl_stream_synchronize(HPPL_STREAM_1);
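
`fillDenseSlot` and its siblings parse a length-prefixed byte stream, so every raw copy is bounds-checked against `dataEnd` via `memcpyWithCheck`. That helper is Paddle's own; a plausible stand-in, under the assumption that it simply checks before copying, would be:

```cpp
#include <cstring>
#include <glog/logging.h>

// Illustrative stand-in for Paddle's memcpyWithCheck (assumed behavior):
// refuse to read past the end of the serialized buffer, then plain memcpy.
inline void memcpyWithCheckSketch(void* dst,
                                  const char* src,
                                  size_t len,
                                  const char* dataEnd) {
  CHECK_LE(src + len, dataEnd) << "Serialized data is truncated.";
  std::memcpy(dst, src, len);
}
```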
diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h
index 939d9cf725c2fe6e4989c17e1e768c9f8aedfc95..6bb7c831fdd451abc5241199d6a4d1b1ad814517 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.h
+++ b/paddle/gserver/dataproviders/PyDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #pragma once
 
 #include 
@@ -25,7 +24,8 @@ namespace paddle {
 
 class PyDataProvider : public DataProvider {
 public:
-  PyDataProvider(const DataConfig& config, bool useGpu,
+  PyDataProvider(const DataConfig& config,
+                 bool useGpu,
                  bool loadDataAll = true);
 
   virtual void reset();
@@ -48,21 +48,27 @@ protected:
 
   void parseHeaderData(const std::string& headerData);
   void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillSparseNonValueSlot(ProtoSlot& slot, char*& data,
+  void fillSparseNonValueSlot(ProtoSlot& slot,
+                              char*& data,
                               const char* dataEnd);
   void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
   void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
   void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
   void fillSlotsByStr(const std::string& samples);
-  void handleDenseSlot(ProtoSlot& slot, size_t slotIndex,
+  void handleDenseSlot(ProtoSlot& slot,
+                       size_t slotIndex,
                        std::vector<Argument>& cpuArguments);
-  void handleSparseNonValueSlot(ProtoSlot& slot, size_t slotIndex,
+  void handleSparseNonValueSlot(ProtoSlot& slot,
+                                size_t slotIndex,
                                 std::vector<Argument>& cpuArguments);
-  void handleSparseValueSlot(ProtoSlot& slot, size_t slotIndex,
+  void handleSparseValueSlot(ProtoSlot& slot,
+                             size_t slotIndex,
                              std::vector<Argument>& cpuArguments);
-  void handleIndexSlot(ProtoSlot& slot, size_t slotIndex,
+  void handleIndexSlot(ProtoSlot& slot,
+                       size_t slotIndex,
                        std::vector<Argument>& cpuArguments);
-  void handleStringSlot(ProtoSlot& slot, size_t slotIndex,
+  void handleStringSlot(ProtoSlot& slot,
+                        size_t slotIndex,
                         std::vector<Argument>& cpuArguments);
   void resetSlots();
   void loadData(const std::vector<std::string>& fileList);
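
Throughout these providers the C++ side drives a Python object through the CPython API; `PyObject_CallMethod` takes `char*` rather than `const char*`, which is why every call site const_casts its string literals. A self-contained sketch of the pattern, matching the Python 2 string API used here, with an assumed `instance` argument standing in for `classInstance_.get()`:

```cpp
#include <Python.h>  // Python 2.x headers, matching this codebase
#include <stdexcept>
#include <string>

// Calls instance.getHeader() and returns the result as a std::string.
std::string callGetHeader(PyObject* instance) {
  PyObject* obj =
      PyObject_CallMethod(instance, const_cast<char*>("getHeader"), NULL);
  if (obj == NULL) {
    PyErr_Print();
    throw std::runtime_error("Call function getHeader failed.");
  }
  std::string header(PyString_AsString(obj), PyString_Size(obj));
  Py_DECREF(obj);  // PyObject_CallMethod returns a new reference
  return header;
}
```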
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index 90391a7c307d8dff7e289d445cafd27dc5008547..967fc9026a39967477d606862e060b680512901a 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -34,7 +34,7 @@ namespace paddle {
 namespace unittest {
 
 static std::unique_ptr<std::function<void(size_t)>>
-         OnPoolFilled;
+    OnPoolFilled;
 
 namespace pydp2 {
 
@@ -43,15 +43,11 @@ void setOnPoolFilledHook(const std::function<void(size_t)>& callback) {
   *OnPoolFilled = callback;
 }
 
-void clearOnPoolFilledHook() {
-  OnPoolFilled.reset();
-}
+void clearOnPoolFilledHook() { OnPoolFilled.reset(); }
 
 }  // namespace pydp2
 }  // namespace unittest
 
-
-
 /**
  * Slot type
  */
@@ -65,17 +61,13 @@ enum SlotType {
 /**
  * Sequence type
  */
-enum SeqType {
-  SQT_NONE = 0,
-  SQT_SEQ,
-  SQT_SUBSEQ
-};
+enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ };
 
 /**
  * Cache Type.
  */
 enum CacheType {
-  NO_CACHE = 0,  // Each pass will load data from PyDataProvider2.
+  NO_CACHE = 0,           // Each pass will load data from PyDataProvider2.
   CACHE_PASS_IN_MEM = 1,  // First pass will load data from PyDataProvider2,
                           // then cache all data in memory. Load data from
                           // memory in rest passes.
@@ -87,8 +79,8 @@ struct SlotHeader {  // Slot Header will parse from python object's slots field.
   SeqType seqType;
 };
 
-inline std::ostream& operator << (std::ostream& os, const SlotHeader& header) {
-  os <<"Dim = " << header.dim << " Type = " << header.slotType
+inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
+  os << "Dim = " << header.dim << " Type = " << header.slotType
      << " SeqType = " << header.seqType;
   return os;
 }
@@ -158,7 +150,6 @@ protected:
   SlotHeader* headerPtr_;
 };
 
-
 /**
  * Py Data Provider Cache Interface.
  */
@@ -209,17 +200,13 @@ public:
   PyDataProvider2(const DataConfig& config,
                   const ModelConfig& modelConfig,
                   bool useGpu)
-    :DataProvider(config, useGpu),
-      callingContextCreated_(2) {
-    if (PyArray_API == NULL)
-      import_array();
+      : DataProvider(config, useGpu), callingContextCreated_(2) {
+    if (PyArray_API == NULL) import_array();
     auto& args = config.load_data_args();
     PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
     if (!args.empty()) {
       kwargs = callPythonFuncRetPyObj(
-            "paddle.trainer.PyDataProvider2",
-            "deserialize_args",
-            {args});
+          "paddle.trainer.PyDataProvider2", "deserialize_args", {args});
     }
 
     py::DictHelper kwargsDict(kwargs);
@@ -245,40 +232,38 @@ public:
    * Dtor
    * @note will stop loading thread when destructing
    */
-  virtual ~PyDataProvider2() {
-    resetImpl(false);
-  }
+  virtual ~PyDataProvider2() { resetImpl(false); }
 
 private:
   void createPyDataObj(const std::string& model,
                        const std::string& className,
                        const std::string& fileListName,
-                       PyObjectPtr && kwargs) {
-    LOG(INFO) << "loading dataprovider " << model <<"::" << className;
+                       PyObjectPtr&& kwargs  // NOLINT
+                       ) {
+    LOG(INFO) << "loading dataprovider " << model << "::" << className;
 
     PyObjectPtr module = py::import(model);
     PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
     CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
-    PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
-                                         className.c_str()));
+    PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str()));
     CHECK_PY(cls) << "load class " << className.c_str() << "error";
 
     // If multiple python instances share the same module, a PyObjectPtr held
     // only for the instance will cause python reference-count errors.
     //
     // So here, we increase reference count manually.
-    if (gModuleClsPtrs_.find((uintptr_t) module.get())
-        != gModuleClsPtrs_.end()) {
+    if (gModuleClsPtrs_.find((uintptr_t)module.get()) !=
+        gModuleClsPtrs_.end()) {
       // Multi instance use same module
       Py_XINCREF(module.get());
       Py_XINCREF(moduleDict.get());
     } else {
-      gModuleClsPtrs_.insert((uintptr_t) module.get());
+      gModuleClsPtrs_.insert((uintptr_t)module.get());
     }
-    if (gModuleClsPtrs_.find((uintptr_t) cls.get()) != gModuleClsPtrs_.end()) {
+    if (gModuleClsPtrs_.find((uintptr_t)cls.get()) != gModuleClsPtrs_.end()) {
       Py_XINCREF(cls.get());
     } else {
-      gModuleClsPtrs_.insert((uintptr_t) cls.get());
+      gModuleClsPtrs_.insert((uintptr_t)cls.get());
     }
 
     PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
@@ -294,8 +279,8 @@ private:
     py::ObjectHelper self(this->instance_);
     bool ok;
 
-    this->skipShuffle_ = !self.getBoolAttr("should_shuffle",
-                                           &ok /*isBoolType*/);
+    this->skipShuffle_ =
+        !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/);
     if (!ok) {
       this->skipShuffle_ = testing;  // shuffle when is training, skip shuffle
                                      // when is testing.
@@ -335,12 +320,12 @@ private:
       PyObjectPtr headerPtrWrap(hdPtr);
       py::ObjectHelper hd(headerPtrWrap);
       header.dim = hd.getIntAttrWithError("dim");
-      header.seqType = (SeqType) hd.getIntAttrWithError("seq_type");
-      header.slotType = (SlotType) hd.getIntAttrWithError("type");
+      header.seqType = (SeqType)hd.getIntAttrWithError("seq_type");
+      header.slotType = (SlotType)hd.getIntAttrWithError("type");
     }
 
     DBG << "Data header size " << headers_.size();
-    for (auto & header : headers_) {
+    for (auto& header : headers_) {
       DBG << header;
     }
     cache_.reset(IPyDataProviderCache::create(
@@ -351,8 +336,7 @@ private:
     loadFileList(fileListName, fileLists_);
     PyObject* lst = PyList_New(fileLists_.size());
     for (size_t i = 0; i < fileLists_.size(); ++i) {
-      PyList_SET_ITEM(lst, i,
-                      PyString_FromString(fileLists_[i].c_str()));
+      PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str()));
     }
     return PyObjectPtr(lst);
   }
@@ -414,11 +398,12 @@ private:
         CHECK(ok) << "CalcBatchSize must return int or long";
       }
 
-      if (this->loadThread_){  // wait poolActualSize < poolSize;
+      if (this->loadThread_) {  // wait poolActualSize < poolSize;
        std::unique_lock<std::mutex> l(mtx_);
-        pushCV_.wait(l, [this, additionalBatchSize] {
-          return this->poolActualSize_ < poolSize_;
-        });
+        pushCV_.wait(l,
+                     [this, additionalBatchSize] {
+                       return this->poolActualSize_ < poolSize_;
+                     });
       }
 
       {
@@ -487,14 +472,14 @@ private:
   std::vector<std::string> fileLists_;
   std::vector<SlotHeader> headers_;
   static PyObjectPtr zeroTuple_;
-  static std::unordered_set<uintptr_t > gModuleClsPtrs_;
+  static std::unordered_set<uintptr_t> gModuleClsPtrs_;
 
   class PositionRandom {
   public:
-    inline explicit PositionRandom(bool skipRand):
-        eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
+    inline explicit PositionRandom(bool skipRand)
+        : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
 
-    inline size_t operator() (size_t len) {
+    inline size_t operator()(size_t len) {
       if (!skipRand_) {
         if (!dist_ || dist_->b() != len - 1) {
          dist_.reset(new std::uniform_int_distribution<size_t>(0, len - 1));
@@ -525,32 +510,31 @@ public:
    * Shuffle. Does nothing here, because PyDataProvider shuffles implicitly
    * by randomly selecting data from the data pool.
    */
-  void shuffle() {
-  }
+  void shuffle() {}
 
   /**
    * Not limited size.
    */
-  int64_t getSize() {
-    return -1;
-  }
+  int64_t getSize() { return -1; }
 
   /**
    * Loading a batch of data.
    */
-  int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
+  int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) {
    std::lock_guard<std::mutex> guard(mutexForReset_);
     REGISTER_TIMER("PyDP2.getNextBatchInternal")
     CHECK_GE(size_, 0);
-    size_t size = (size_t) size_;
+    size_t size = (size_t)size_;
     if (loadThread_) {  // loading from thread should wait for data pool ready.
                         // but, loading from cache, cache object should ensure
                         // data pool ready.
      std::unique_lock<std::mutex> l(mtx_);
-      pullCV_.wait(l, [this, &size] {
-        return this->poolActualSize_ >= std::max(size, this->minPoolSize_)
-            || callingContexts_.empty();
-      });
+      pullCV_.wait(l,
+                   [this, &size] {
+                     return this->poolActualSize_ >=
+                                std::max(size, this->minPoolSize_) ||
+                            callingContexts_.empty();
+                   });
 
       if (unittest::OnPoolFilled) {
         (*unittest::OnPoolFilled)(this->poolActualSize_);
@@ -633,35 +617,35 @@ public:
     cpuBatch.setSize(bsize);
     auto& inArgs = cpuBatch.getStreams();
     inArgs.resize(headers_.size());
-    std::vector<std::unique_ptr<IFieldScanner> > scanners;
+    std::vector<std::unique_ptr<IFieldScanner>> scanners;
     scanners.reserve(headers_.size());
     for (auto& header : headers_) {
       scanners.emplace_back(IFieldScanner::create(&header));
     }
     DBG << "Scanner created.";
-    for (size_t i=0; i < headers_.size(); ++i) {
+    for (size_t i = 0; i < headers_.size(); ++i) {
       scanners[i]->startPrepare(inArgs[i]);
     }
-    for (auto & d : data) {
+    for (auto& d : data) {
       py::SequenceHelper s(d);
-      for (size_t i=0; i < headers_.size(); ++i) {
+      for (size_t i = 0; i < headers_.size(); ++i) {
         scanners[i]->prepare(inArgs[i], s[i]);
       }
     }
-    for (size_t i=0; i < headers_.size(); ++i) {
+    for (size_t i = 0; i < headers_.size(); ++i) {
       scanners[i]->finishPrepare(inArgs[i]);
     }
-    for (size_t i=0; i < headers_.size(); ++i) {
+    for (size_t i = 0; i < headers_.size(); ++i) {
       scanners[i]->startFill(inArgs[i]);
     }
-    for (auto & d : data) {
+    for (auto& d : data) {
       py::SequenceHelper s(d);
       for (size_t i = 0; i < headers_.size(); ++i) {
         scanners[i]->fill(inArgs[i], s[i]);
       }
     }
 
-    for (size_t i=0; i < headers_.size(); ++i) {
+    for (size_t i = 0; i < headers_.size(); ++i) {
       scanners[i]->finishFill(inArgs[i]);
     }
 
@@ -679,8 +663,8 @@ public:
       gpuArguments.resize(cpuArguments.size());
       gpuBatch.setSize(size);
       for (size_t i = 0; i < headers_.size(); ++i) {
-        gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_,
-                                          HPPL_STREAM_1);
+        gpuArguments[i].resizeAndCopyFrom(
+            cpuArguments[i], useGpu_, HPPL_STREAM_1);
       }
       hl_stream_synchronize(HPPL_STREAM_1);
     } else {
@@ -690,31 +674,28 @@ public:
   }
 };
 
-std::unordered_set<uintptr_t > PyDataProvider2::gModuleClsPtrs_;
+std::unordered_set<uintptr_t> PyDataProvider2::gModuleClsPtrs_;
 PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
 
 REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
 
-
 /**
  * Scanner for dense slot.
  */
-class DenseScanner: public IFieldScanner {
+class DenseScanner : public IFieldScanner {
 public:
-  explicit DenseScanner(SlotHeader* ptr):IFieldScanner(ptr), height_(0) {}
+  explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
 
   /**
    * Prepare.
    * @param argument target argument
    * @param obj each timestep of a sample.
    */
-  virtual void prepare(Argument &argument, PyObject *obj) {
-    ++height_;
-  }
+  virtual void prepare(Argument& argument, PyObject* obj) { ++height_; }
 
-  virtual void finishPrepare(Argument &argument) {
-    Matrix::resizeOrCreate(argument.value, height_, headerPtr_->dim,
-                           false, false);
+  virtual void finishPrepare(Argument& argument) {
+    Matrix::resizeOrCreate(
+        argument.value, height_, headerPtr_->dim, false, false);
     height_ = 0;
   }
 
@@ -723,24 +704,23 @@ public:
    * @param argument
    * @param obj
    */
-  virtual void fill(Argument &argument, PyObject *obj) {
+  virtual void fill(Argument& argument, PyObject* obj) {
     real* dat = argument.value->getData() + height_ * headerPtr_->dim;
     if (PyArray_Check(obj)) {
-        auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
-        if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
-            real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
-            auto sz = PyArray_SIZE((PyArrayObject*)obj);
-            std::copy(data, data + sz, dat);
-        } else {
-            LOG(FATAL) << "You should yield float" << sizeof(real) * 8
-                       << " array";
-        }
-     } else {
-        py::SequenceHelper s(obj);
-        // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
-        for (size_t i=0; i < headerPtr_->dim; ++i) {
-          dat[i] = (real) s.getDouble(i);
-        }
+      auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
+      if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
+        real* data = (real*)PyArray_DATA((PyArrayObject*)obj);
+        auto sz = PyArray_SIZE((PyArrayObject*)obj);
+        std::copy(data, data + sz, dat);
+      } else {
+        LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array";
+      }
+    } else {
+      py::SequenceHelper s(obj);
+      // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+      for (size_t i = 0; i < headerPtr_->dim; ++i) {
+        dat[i] = (real)s.getDouble(i);
+      }
     }
     ++height_;
   }
@@ -752,20 +732,18 @@ private:
 /**
  * Scanner for index slot
  */
-class IndexScanner: public IFieldScanner {
+class IndexScanner : public IFieldScanner {
 public:
-  explicit IndexScanner(SlotHeader* ptr):IFieldScanner(ptr), cnt_(0) {}
+  explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
 
   /**
    * Prepare memory space.
    *
    * @note obj is a single timestep of sample
    */
-  virtual void prepare(Argument &argument, PyObject *obj) {
-    ++cnt_;
-  }
+  virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; }
 
-  virtual void finishPrepare(Argument &argument) {
+  virtual void finishPrepare(Argument& argument) {
     IVector::resizeOrCreate(argument.ids, cnt_, false);
     cnt_ = 0;
   }
@@ -773,9 +751,9 @@ public:
   /**
    * Fill one index to argument.
    */
-  virtual void fill(Argument &argument, PyObject *obj) {
+  virtual void fill(Argument& argument, PyObject* obj) {
     bool ok;
-    argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
+    argument.ids->getData()[cnt_++] = py::castInt(obj, &ok);
     CHECK(ok) << "Cannot cast int " << py::repr(obj);
   }
 
@@ -785,27 +763,25 @@ private:
 
 class SparseNonValueScanner : public IFieldScanner {
 public:
-  explicit SparseNonValueScanner(SlotHeader* ptr): IFieldScanner(ptr),
-                                                   nnz_(0),
-                                                   height_(0) {}
+  explicit SparseNonValueScanner(SlotHeader* ptr)
+      : IFieldScanner(ptr), nnz_(0), height_(0) {}
 
   /**
    * Prepare memory space
    * @note obj is a timestep of one sample.
    */
-  virtual void prepare(Argument &argument, PyObject *obj) {
+  virtual void prepare(Argument& argument, PyObject* obj) {
     ++height_;
     nnz_ += py::SequenceHelper(obj).size();
   }
 
-  virtual void finishPrepare(Argument &argument) {
-    Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
-                                       headerPtr_->dim,
-                                       nnz_, NO_VALUE);
+  virtual void finishPrepare(Argument& argument) {
+    Matrix::resizeOrCreateSparseMatrix(
+        argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE);
   }
 
-  virtual void startFill(Argument & argument) {
-    auto smat = (CpuSparseMatrix*) (argument.value.get());
+  virtual void startFill(Argument& argument) {
+    auto smat = (CpuSparseMatrix*)(argument.value.get());
     smat->getRows()[0] = 0;
     nnz_ = 0;
     height_ = 1;
@@ -818,14 +794,14 @@ public:
   virtual void fill(Argument& argument, PyObject* obj) {
     py::SequenceHelper s(obj);
     auto sz = s.size();
-    auto smat = (CpuSparseMatrix*) (argument.value.get());
+    auto smat = (CpuSparseMatrix*)(argument.value.get());
     int* row = smat->getRows();
     int* col = smat->getCols();
     real* dat = smat->getData();
-    row[height_] = row[height_-1] + (int)sz;
+    row[height_] = row[height_ - 1] + (int)sz;
 
     for (decltype(sz) i = 0; i < sz; ++i) {
-      setData(col+nnz_, dat+nnz_, s[i]);
+      setData(col + nnz_, dat + nnz_, s[i]);
       ++nnz_;
     }
     ++height_;
@@ -839,7 +815,7 @@ protected:
    * @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong.
    *                 For sparse_value is a Tuple (int, float).
    */
-  virtual void setData(int* col, real * dat, PyObject* obj) {
+  virtual void setData(int* col, real* dat, PyObject* obj) {
     bool ok;
     *col = py::castInt(obj, &ok);
     CHECK(ok);
@@ -851,26 +827,25 @@ protected:
 
 class SparseValueScanner : public SparseNonValueScanner {
 public:
-  explicit SparseValueScanner(SlotHeader *ptr) : SparseNonValueScanner(ptr) {}
+  explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
 
-  virtual void finishPrepare(Argument &argument) {
-    Matrix::resizeOrCreateSparseMatrix(argument.value, height_,
-                                       headerPtr_->dim,
-                                       nnz_, FLOAT_VALUE);
+  virtual void finishPrepare(Argument& argument) {
+    Matrix::resizeOrCreateSparseMatrix(
+        argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
   }
 
 protected:
-  virtual void setData(int *col, real *dat, PyObject *obj) {
+  virtual void setData(int* col, real* dat, PyObject* obj) {
     py::SequenceHelper s(obj);
     SparseNonValueScanner::setData(col, dat, s[0]);
-    *dat = (real) s.getDouble(1);
+    *dat = (real)s.getDouble(1);
   }
 };
 
 /**
  * Sequence Scanner. Scanner for sequence or sub-sequence.
  */
-class SequenceScanner: public IFieldScanner {
+class SequenceScanner : public IFieldScanner {
 public:
   /**
    * Ctor
@@ -879,15 +854,18 @@ public:
    *                       return a sequence start position or a sub-sequence
    *                       start position.
    */
-  SequenceScanner(std::unique_ptr<IFieldScanner>&& innerScanner,
-    const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
-      : IFieldScanner(nullptr), inner_(std::move(innerScanner)),
-        cnt_(0), getSeqStartPos_(getSeqStartPos) {}
+  SequenceScanner(
+      std::unique_ptr<IFieldScanner>&& innerScanner,
+      const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
+      : IFieldScanner(nullptr),
+        inner_(std::move(innerScanner)),
+        cnt_(0),
+        getSeqStartPos_(getSeqStartPos) {}
 
   /**
    * Start prepare. Invoke inner->startPrepare too.
    */
-  virtual void startPrepare(Argument &argument) {
+  virtual void startPrepare(Argument& argument) {
     inner_->startPrepare(argument);
   }
 
@@ -895,10 +873,10 @@ public:
    * Prepare. obj is a list or tuple. it will invoke inner_->prepare for each
    * element of sequence obj.
    */
-  virtual void prepare(Argument &argument, PyObject *obj) {
+  virtual void prepare(Argument& argument, PyObject* obj) {
     py::SequenceHelper s(obj);
     ++cnt_;
-    for (size_t i=0; i < s.size(); ++i) {
+    for (size_t i = 0; i < s.size(); ++i) {
       inner_->prepare(argument, s[i]);
     }
   }
@@ -906,7 +884,7 @@ public:
   /**
    * Finish prepare. invoke inner_->finishPrepare too.
    */
-  virtual void finishPrepare(Argument &argument) {
+  virtual void finishPrepare(Argument& argument) {
     ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false);
     inner_->finishPrepare(argument);
   }
@@ -914,7 +892,7 @@ public:
   /**
    * Start fill. invoke inner->startFill too.
    */
-  virtual void startFill(Argument &argument) {
+  virtual void startFill(Argument& argument) {
     getSeqStartPos_(argument)->getMutableData(false)[0] = 0;
     cnt_ = 1;
     inner_->startFill(argument);
@@ -925,13 +903,13 @@ public:
    * sequence obj. And set seqStartPos at same time. The seqStartPos will be
    * calculated by getSeqStartPos callback passed in ctor.
    */
-  virtual void fill(Argument &argument, PyObject *obj) {
+  virtual void fill(Argument& argument, PyObject* obj) {
     getSeqStartPos_(argument)->getMutableData(false)[cnt_] =
-      getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
-          (int)getSize(obj);
+        getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
+        (int)getSize(obj);
     py::SequenceHelper s(obj);
     ++cnt_;
-    for (size_t i=0; i < s.size(); ++i) {
+    for (size_t i = 0; i < s.size(); ++i) {
       inner_->fill(argument, s[i]);
     }
   }
@@ -939,9 +917,7 @@ public:
   /**
    * Finish fill. will invoke inner->finishFill too.
    */
-  virtual void finishFill(Argument &argument) {
-    inner_->finishFill(argument);
-  }
+  virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
 
 protected:
   size_t getSize(PyObject* obj) {
@@ -949,7 +925,7 @@ protected:
     auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
     if (sc) {
       size_t sum = 0;
-      for (size_t i=0; i < s.size(); ++i) {
+      for (size_t i = 0; i < s.size(); ++i) {
         sum += sc->getSize(s[i]);
       }
       return sum;
@@ -964,8 +940,7 @@ private:
   std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
 };
 
-
-IFieldScanner* IFieldScanner::create(SlotHeader *header) {
+IFieldScanner* IFieldScanner::create(SlotHeader* header) {
   IFieldScanner* retv = nullptr;
   switch (header->slotType) {
     case ST_DENSE:
@@ -989,15 +964,15 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
       break;
     case SQT_SUBSEQ:
       retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
-            [](Argument& arg) -> ICpuGpuVectorPtr& {
-              return arg.subSequenceStartPositions;
-            });
-      // fall through, not break;
+                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
+                                   return arg.subSequenceStartPositions;
+                                 });
+    // fall through, not break;
     case SQT_SEQ:
       retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
-          [](Argument& arg) -> ICpuGpuVectorPtr& {
-            return arg.sequenceStartPositions;
-          });
+                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
+                                   return arg.sequenceStartPositions;
+                                 });
       break;
     default:
       LOG(FATAL) << "Not implemented";
@@ -1010,19 +985,13 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) {
  * No Cache Strategy. Will destruct old data immediately and load data from
  * python every pass.
  */
-class NoCacheStrategy: public IPyDataProviderCache {
+class NoCacheStrategy : public IPyDataProviderCache {
 public:
-  virtual bool reset() {
-    return true;
-  }
+  virtual bool reset() { return true; }
 
-  virtual void drop(std::deque<PyObjectPtr> *data) {
-    data->clear();
-  }
+  virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
 
-  virtual std::deque<PyObjectPtr>* load() {
-    return nullptr;
-  }
+  virtual std::deque<PyObjectPtr>* load() { return nullptr; }
 };
 
 /**
@@ -1033,9 +1002,9 @@ public:
  */
 class CacheOnePassInMemory : public IPyDataProviderCache {
 public:
-  CacheOnePassInMemory() : objPool_(new std::deque<PyObjectPtr>()),
-                           droppedPool_(new std::deque<PyObjectPtr>())
-  {}
+  CacheOnePassInMemory()
+      : objPool_(new std::deque<PyObjectPtr>()),
+        droppedPool_(new std::deque<PyObjectPtr>()) {}
 
   virtual bool reset() {
     if (objPool_->empty() && droppedPool_->empty()) {
@@ -1048,25 +1017,22 @@ public:
     }
   }
 
-  virtual void drop(std::deque *data) {
+  virtual void drop(std::deque* data) {
     size_t orgSize = droppedPool_->size();
     droppedPool_->resize(orgSize + data->size());
-    for (size_t i=0; i < data->size(); ++i) {
+    for (size_t i = 0; i < data->size(); ++i) {
       std::swap((*droppedPool_)[orgSize + i], (*data)[i]);
     }
     data->clear();
   }
 
-  virtual std::deque<PyObjectPtr>* load() {
-    return objPool_.get();
-  }
+  virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
 
 private:
-  std::unique_ptr<std::deque<PyObjectPtr> > objPool_;
-  std::unique_ptr<std::deque<PyObjectPtr> > droppedPool_;
+  std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
+  std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
 };
 
-
 IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) {
   switch (ct) {
     case NO_CACHE:
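
PyDataProvider2's load thread and `getNextBatchInternal` coordinate through the `pushCV_`/`pullCV_` pair seen in the hunks above: the producer waits until `poolActualSize_ < poolSize_`, and the consumer waits until enough samples are pooled. A reduced, self-contained sketch of that handshake (hypothetical `BoundedPool`, not Paddle code):

```cpp
#include <condition_variable>
#include <deque>
#include <mutex>

class BoundedPool {
public:
  explicit BoundedPool(size_t capacity) : capacity_(capacity) {}

  void push(int sample) {  // producer side: the load thread
    std::unique_lock<std::mutex> l(mtx_);
    pushCV_.wait(l, [this] { return pool_.size() < capacity_; });
    pool_.push_back(sample);
    pullCV_.notify_all();  // wake consumers waiting for data
  }

  int pop() {  // consumer side: getNextBatchInternal
    std::unique_lock<std::mutex> l(mtx_);
    pullCV_.wait(l, [this] { return !pool_.empty(); });
    int v = pool_.front();
    pool_.pop_front();
    pushCV_.notify_all();  // wake the producer waiting for room
    return v;
  }

private:
  std::mutex mtx_;
  std::condition_variable pushCV_, pullCV_;
  std::deque<int> pool_;
  size_t capacity_;
};
```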
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index c2625bce9ab0cac7c42a20379c42debea0510c57..8f7d2fb80e9b6f2b4c83d90a04dab5219435d344 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "Evaluator.h"
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 
@@ -33,7 +32,8 @@ private:
     str.clear();
     int prevLabel = -1;
     for (std::vector<int>::const_iterator label = path.begin();
-         label != path.end(); label++) {
+         label != path.end();
+         label++) {
       if (*label != blank_ &&
           (str.empty() || *label != str.back() || prevLabel == blank_)) {
         str.push_back(*label);
@@ -58,8 +58,11 @@ private:
   /* "sp, dp, ip" is the weighting parameter of "substitution, deletion,
    * insertion"
    * in edit-distance error */
-  real stringAlignment(std::vector<int>& gtStr, std::vector<int>& recogStr,
-                       bool backtrace = true, real sp = 1.0, real dp = 1.0,
+  real stringAlignment(std::vector<int>& gtStr,
+                       std::vector<int>& recogStr,
+                       bool backtrace = true,
+                       real sp = 1.0,
+                       real dp = 1.0,
                        real ip = 1.0) {
     std::vector<std::vector<int>> matrix;
     int substitutions, deletions, insertions;
@@ -165,8 +168,8 @@ private:
     return distance / maxLen;
   }
 
-  real editDistance(real* output, int numTimes, int numClasses, int* labels,
-                    int labelsLen) {
+  real editDistance(
+      real* output, int numTimes, int numClasses, int* labels, int labelsLen) {
     numTimes_ = numTimes;
     numClasses_ = numClasses;
     blank_ = numClasses_ - 1;
@@ -207,7 +210,8 @@ public:
       real err = 0;
       err = editDistance(
           output.value->getData() + output.value->getWidth() * outputStarts[i],
-          outputStarts[i+1] - outputStarts[i], output.value->getWidth(),
+          outputStarts[i + 1] - outputStarts[i],
+          output.value->getWidth(),
           label.ids->getData() + labelStarts[i],
           labelStarts[i + 1] - labelStarts[i]);
 
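`stringAlignment` computes a weighted edit distance in which `sp`, `dp`, and `ip` scale substitutions, deletions, and insertions. A compact sketch of that recurrence (illustrative only; the evaluator's real version also tracks backtraces and the separate operation counters):

```cpp
#include <algorithm>
#include <vector>

// DP over a (|a|+1) x (|b|+1) matrix; m[i][j] is the cheapest way to turn
// the first i symbols of `a` into the first j symbols of `b`.
float weightedEditDistance(const std::vector<int>& a,
                           const std::vector<int>& b,
                           float sp = 1.0f, float dp = 1.0f, float ip = 1.0f) {
  std::vector<std::vector<float>> m(a.size() + 1,
                                    std::vector<float>(b.size() + 1, 0.0f));
  for (size_t i = 1; i <= a.size(); ++i) m[i][0] = i * dp;  // all deletions
  for (size_t j = 1; j <= b.size(); ++j) m[0][j] = j * ip;  // all insertions
  for (size_t i = 1; i <= a.size(); ++i) {
    for (size_t j = 1; j <= b.size(); ++j) {
      float sub = m[i - 1][j - 1] + (a[i - 1] == b[j - 1] ? 0.0f : sp);
      m[i][j] = std::min({sub, m[i - 1][j] + dp, m[i][j - 1] + ip});
    }
  }
  return m[a.size()][b.size()];
}
```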
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 6f5d2b47c3a97d0c95fefd346add2f121ac51764..923e77fc9df919794902daed6113792e7f89a552 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -144,7 +144,8 @@ public:
     size_t numSequences = sequenceStartPositions->getSize() - 1;
     const int* starts = sequenceStartPositions->getData();
     for (size_t i = 0; i < numSequences; ++i) {
-      eval1(output->getData() + starts[i], label->getData() + starts[i],
+      eval1(output->getData() + starts[i],
+            label->getData() + starts[i],
             starts[i + 1] - starts[i]);
     }
     return 0;
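
The loop above shows the sequence-layout convention used across these evaluators: a batch of N sequences is stored flat, and `starts` holds N+1 offsets, so sequence i occupies [starts[i], starts[i+1]). A small sketch of iterating that layout (an assumed helper, not part of this patch):

```cpp
#include <cstddef>

// Visits each sequence of a flat batch; starts[] must have numSequences + 1
// entries, the last one being the total sample count.
template <typename F>
void forEachSequence(const int* starts, size_t numSequences, F&& eval1) {
  for (size_t i = 0; i < numSequences; ++i) {
    eval1(starts[i], starts[i + 1] - starts[i]);  // offset, length
  }
}
```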
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index d43dceea7452724c1e45a1b7c5f5f1858d528df7..f5df2b18dedde9022d04b034912e59be00f15413 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "paddle/utils/Stat.h"
 #include "paddle/gserver/evaluators/Evaluator.h"
 
@@ -74,17 +73,19 @@ public:
     }
 
     const MatrixPtr errorMat = Matrix::create(output->getHeight(),
-      1, /* trans= */ false, useGpu(arguments[0].deviceId));
+                                              1,
+                                              /* trans= */ false,
+                                              useGpu(arguments[0].deviceId));
     errorMat->zeroMem();
     if (label != nullptr) {
       errorMat->classificationError(output, label);
    } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
               dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
-      errorMat->classificationErrorMulti(*output, *multiBinaryLabel,
-                                         config_.classification_threshold());
+      errorMat->classificationErrorMulti(
+          *output, *multiBinaryLabel, config_.classification_threshold());
     } else {
-      errorMat->binaryClassificationError(0, *output, *multiBinaryLabel,
-                                          config_.classification_threshold());
+      errorMat->binaryClassificationError(
+          0, *output, *multiBinaryLabel, config_.classification_threshold());
     }
 
     if (supportWeight) {
@@ -126,8 +127,8 @@ public:
     int errCounter = 0;
     CpuVector errorVec(0, nullptr);
     for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) {
-      errorVec.subVecFrom(errorMat->getData(), starts[i],
-                          starts[i + 1] - starts[i]);
+      errorVec.subVecFrom(
+          errorMat->getData(), starts[i], starts[i + 1] - starts[i]);
       if (errorVec.getSum() > 0) {
         errCounter += 1;
       }
@@ -330,8 +331,8 @@ public:
   }
 
   void distributeEval(ParameterClient2* client) {
-    client->reduce(sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id,
-                   0);
+    client->reduce(
+        sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0);
     client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0);
   }
 
@@ -379,8 +380,11 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) {