diff --git a/.dockerignore b/.dockerignore
new file mode 120000
index 0000000000000000000000000000000000000000..3e4e48b0b5fe6b468434d6767749b399319f2da2
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+.gitignore
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index ee8489c1d71bd050b9a1d9358a664d2294165292..35bed0accdaa274f5966ca5b4b7180106325449b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ build/
.cproject
.pydevproject
Makefile
+.test_env/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..f635e65784af47a21df80cc92073ef14eba9a731
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "warp-ctc"]
+ path = warp-ctc
+ url = https://github.com/baidu-research/warp-ctc.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9385943da92bc8c44ca75b267a768ba8ea22bd8b..942669c41ff154c91e88c937739b0f604f21d545 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,23 +2,21 @@
sha: c25201a00e6b0514370501050cf2a8538ac12270
hooks:
- id: remove-crlf
+ files: (?!.*warp-ctc)^.*$
- repo: https://github.com/reyoung/mirrors-yapf.git
sha: v0.13.2
hooks:
- id: yapf
- repo: https://github.com/pre-commit/pre-commit-hooks
- sha: 4ef03c4223ad322c7adaa6c6c0efb26b57df3b71
+ sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
hooks:
- id: check-added-large-files
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
+ files: (?!.*warp-ctc)^.*$
- id: end-of-file-fixer
-# TODO(yuyang): trailing whitespace has some bugs on markdown
-# files now, please not add it to pre-commit hook now
-# - id: trailing-whitespace
-#
-# TODO(yuyang): debug-statements not fit for Paddle, because
-# not all of our python code is runnable. Some are used for
-# documenation
-# - id: debug-statements
+- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
+ sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+ hooks:
+ - id: clang-formater
diff --git a/.travis.yml b/.travis.yml
index ffe3bc193b49eb3b3318cbbc7f1c3d86dc205c14..6215060e336c7cff9689951c918dc7ec02b2a2fb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ addons:
before_install:
- |
if [ ${JOB} == "BUILD_AND_TEST" ]; then
- if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
+ if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
then
echo "Only markdown docs were updated, stopping build process."
exit
@@ -50,7 +50,7 @@ before_install:
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
+ - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
script:
- paddle/scripts/travis/main.sh
notifications:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 090ac9e188422099cc4270b87064b5590e7b620c..dfb5159ea12179b127d3780c8affdcfe5978f6db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,6 @@
cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
-set(PADDLE_MAJOR_VERSION 0)
-set(PADDLE_MINOR_VERSION 9)
-set(PADDLE_PATCH_VERSION 0a0)
-set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
@@ -12,6 +8,17 @@ include(package)
find_package(SWIG 2.0)
find_package(CUDA QUIET)
find_package(Protobuf REQUIRED)
+
+# Check protobuf library version.
+execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
+ OUTPUT_VARIABLE PROTOBUF_VERSION)
+string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
+
+set(PROTOBUF_3 OFF)
+if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
+ set(PROTOBUF_3 ON)
+endif()
+
find_package(PythonLibs 2.7 REQUIRED)
find_package(PythonInterp 2.7 REQUIRED)
find_package(ZLIB REQUIRED)
@@ -36,6 +43,7 @@ option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
+option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
@@ -44,7 +52,7 @@ option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
if(NOT CMAKE_BUILD_TYPE)
- set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+ set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
endif()
@@ -63,36 +71,16 @@ include(check_packages)
include(swig)
include(coveralls)
-# add PaddlePaddle version
-if(DEFINED ENV{PADDLE_VERSION})
- add_definitions(-DPADDLE_VERSION=\"$ENV{PADDLE_VERSION}\")
-else()
- if(EXISTS ${PROJ_ROOT}/.svn/)
- find_package(Subversion REQUIRED)
- if(SUBVERSION_FOUND)
- Subversion_WC_INFO(${PROJ_ROOT} Project)
- add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION})
- endif()
- elseif(EXISTS ${PROJ_ROOT}/.git/)
- find_package(Git REQUIRED)
- execute_process(
- COMMAND ${GIT_EXECUTABLE} log -1 --format=%H
- WORKING_DIRECTORY ${PROJ_ROOT}
- OUTPUT_VARIABLE GIT_SHA1
- RESULT_VARIABLE GIT_RESULT
- ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
- if(NOT ${GIT_RESULT})
- add_definitions(-DPADDLE_VERSION=\"${GIT_SHA1}\")
- else()
- message(WARNING "Cannot add paddle version from git tag")
- endif()
- endif()
-endif()
-
+# Set PaddlePaddle version to Git tag name or Git commit ID.
+find_package(Git REQUIRED)
+# version.cmake will get the current PADDLE_VERSION
+include(version)
+add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
if(NOT WITH_GPU)
add_definitions(-DPADDLE_ONLY_CPU)
add_definitions(-DHPPL_STUB_FUNC)
+
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
if(${CUDA_VERSION_MAJOR} GREATER 6)
@@ -114,16 +102,15 @@ else()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
endif(WITH_AVX)
- if(WITH_DSO)
- set(CUDA_LIBRARIES "")
- add_definitions(-DPADDLE_USE_DSO)
- endif(WITH_DSO)
-
# Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
+if(WITH_DSO)
+ add_definitions(-DPADDLE_USE_DSO)
+endif(WITH_DSO)
+
if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
set(ACCURACY double)
@@ -135,6 +122,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)
+if(NOT WITH_PROFILER)
+ add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif(NOT WITH_PROFILER)
+
if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
diff --git a/README.md b/README.md
index e8679fb55fc22559d933a416e8706b7baf536ead..8a8e15841586ae6a01bb93e94f6074189f556f5a 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
# PaddlePaddle
-[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
-[![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
-[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
+[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html)
+[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
+[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+
Welcome to the PaddlePaddle GitHub.
@@ -14,7 +17,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep
learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
-Please refer to our [release announcement](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle.
+Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
## Features
@@ -89,7 +92,7 @@ Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://padd
## Ask Questions
-You are welcome to submit questions and bug reports as [Github Issues](https://github.com/baidu/paddle/issues).
+You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
diff --git a/RELEASE.md b/RELEASE.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8a245ab442ba0fc63d1f1fda932e7590a6fe4ca
--- /dev/null
+++ b/RELEASE.md
@@ -0,0 +1,69 @@
+# Release v0.9.0
+
+## New Features:
+
+* New Layers
+ * bilinear interpolation layer.
+ * spatial pyramid-pool layer.
+ * de-convolution layer.
+ * maxout layer.
+* Support rectangle padding, stride, window and input for Pooling Operation.
+* Add --job=time in trainer, which can be used to print time info without compiler option -WITH_TIMER=ON.
+* Expose cost_weight/nce_layer in `trainer_config_helpers`
+* Add FAQ, concepts, h-rnn docs.
+* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations
+* Add usage track scripts.
+
+## Improvements
+
+* Add Travis-CI for Mac OS X. Enable swig unittest in Travis-CI. Skip Travis-CI when only docs are changed.
+* Add code coverage tools.
+* Refine convolution layer to speedup and reduce GPU memory.
+* Speed up PyDataProvider2
+* Add ubuntu deb package build scripts.
+* Make Paddle use git-flow branching model.
+* PServer support no parameter blocks.
+
+## Bug Fixes
+
+* add zlib link to py_paddle
+* add input sparse data check for sparse layer at runtime
+* Bug fix for sparse matrix multiplication
+* Fix floating-point overflow problem of tanh
+* Fix some nvcc compile options
+* Fix a bug in yield dictionary in DataProvider
+* Fix SRL hang when exit.
+
+# Release v0.8.0beta.1
+New features:
+
+* Mac OSX is supported by source code. #138
+ * Both GPU and CPU versions of PaddlePaddle are supported.
+
+* Support CUDA 8.0
+
+* Enhance `PyDataProvider2`
+ * Add dictionary yield format. `PyDataProvider2` can yield a dictionary with key is data_layer's name, value is features.
+ * Add `min_pool_size` to control memory pool in provider.
+
+* Add `deb` install package & docker image for no_avx machines.
+ * Especially for cloud computing and virtual machines
+
+* Automatically disable `avx` instructions in cmake when the machine's CPU doesn't support `avx` instructions.
+
+* Add Parallel NN api in trainer_config_helpers.
+
+* Add `travis ci` for Github
+
+Bug fixes:
+
+* Several bugs in trainer_config_helpers. Also complete the unittest for trainer_config_helpers
+* Check if PaddlePaddle is installed when unittest.
+* Fix bugs in GTX series GPU
+* Fix bug in MultinomialSampler
+
+Also more documentation was written since last release.
+
+# Release v0.8.0beta.0
+
+PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet and it's a pre-release version.
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..7b66e8a5b5020fd847982db401665d24ba3a069c
--- /dev/null
+++ b/benchmark/.gitignore
@@ -0,0 +1,9 @@
+paddle/image/logs
+paddle/image/*.pyc
+paddle/image/train.list
+paddle/rnn/logs
+paddle/rnn/*.pyc
+paddle/rnn/imdb.pkl
+caffe/image/logs
+tensorflow/image/logs
+tensorflow/rnn/logs
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..367013f0457f9bbb9ae1335ea63dce181316d444
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,168 @@
+# Benchmark
+
+Machine:
+
+- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
+- GPU: Tesla K40m
+- cuDNN: v5.1
+- system: Docker 1.12.1, all platforms are tested in docker environment.
+
+Platforms:
+
+- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0
+- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu
+- Caffe: kaixhin/cuda-caffe
+
+Several convolutional neural networks and recurrent neural networks are used to test.
+
+## Image
+
+### Benchmark Model
+
+AlexNet, GoogleNet and a small network used in Caffe.
+
+- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
+
+- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
+
+- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
+
+
+### Single-GPU
+
+- AlexNet: input - 3 * 227 * 227, Time: ms/batch
+
+| BatchSize | 64 | 128 | 256 | 512 |
+|--------------|-----| -----| ------| -----|
+| PaddlePaddle | 195 | 334 | 602 | 1629 |
+| TensorFlow | 223 | 364 | 645 | 1235 |
+| Caffe | 324 | 627 | 1232 | 2513 |
+
+**Notation**
+
+All platforms use cuDNN-v5.1. We see that Caffe is slower in this experiment because the workspace limit size of its cuDNN-conv interface is 8 * 1024 * 1024, which is smaller than that used in PaddlePaddle and TensorFlow. Note that Caffe will be faster if the workspace limit size is increased.
+
+- GoogleNet: input - 3 * 224 * 224, Time: ms/batch
+
+
+| BatchSize | 64 | 128 | 256 |
+|--------------|-------| -------| --------|
+| PaddlePaddle | 613 | 1149 | 2348 |
+| TensorFlow | 644 | 1176 | 2219 |
+| Caffe | 694 | 1364 | out of memory |
+
+- SmallNet: input - 3 * 32 * 32, Time ms/batch
+
+| BatchSize | 64 | 128 | 256 | 512 |
+|--------------|--------| -------- | --------|---------|
+| PaddlePaddle | 10.463 | 18.184 | 33.113 | 63.039 |
+| TensorFlow | 9 | 15 | 28 | 59 |
+| Caffe | 9.373 | 16.6606 | 31.4797 | 59.719 |
+
+**Notation**
+
+All the single-GPU experiments in Caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. Compared with the total time, this part is relatively small on a single machine, so we can ignore it.
+
+In TensorFlow, an algorithm-searching method is implemented instead of using the algorithm-searching interface in cuDNN.
+
+### Multi-GPU: 4 GPUs
+
+- AlexNet, ms / batch
+
+| total-BatchSize | 128 * 4 | 256 * 4 |
+|------------------|----------| -----------|
+| PaddlePaddle | 347 | 622 |
+| TensorFlow | 377 | 675 |
+| Caffe | 1229 | 2435 |
+
+For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by
+
+```
+ time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512
+= (334 * 4)/347
+= 3.85
+```
+
+
+
+
+- GoogleNet, ms / batch
+
+| total-BatchSize | 128 * 4 | 256 * 4 |
+|-------------------|--------------| ----------- |
+| PaddlePaddle | 1178 | 2367 |
+| TensorFlow | 1210 | 2292 |
+| Caffe | 2007 | out of memory |
+
+
+
+
+## RNN
+We use an LSTM network for text classification to test benchmark.
+
+### Dataset
+- [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
+- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
+- Dictionary size=30000
+- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
+
+### Single-GPU
+
+#### LSTM in Text Classification
+
+Testing `2 lstm layer + fc` network with different hidden size and batch size.
+
+- Batch size = 64, ms / batch
+
+| hidden_size | 256 | 512 | 1280 |
+|--------------|-------| -------| --------|
+| PaddlePaddle | 83 | 184 | 641 |
+| TensorFlow | 175 | 280 | 818 |
+
+- Batch size = 128, ms / batch
+
+| hidden_size | 256 | 512 | 1280 |
+|--------------|------- | -------| --------|
+| PaddlePaddle | 110 | 261 | 1007 |
+| TensorFlow | 181 | 361 | 1237 |
+
+
+- Batch size = 256, ms / batch
+
+| hidden_size | 256 | 512 | 1280 |
+|--------------|-------| -------| --------|
+| PaddlePaddle | 170 | 414 | 1655 |
+| TensorFlow | 238 | 536 | 1905 |
+
+
+
+#### Seq2Seq
+
+The benchmark of sequence-to-sequence network will be added later.
+
+
+### Multi GPU: 4 GPUs
+
+#### LSTM in Text Classification
+
+- hidden_size = 256, ms / batch
+
+| batch_size | 256 | 512 |
+|--------------| -------| --------|
+| PaddlePaddle | 90 | 118 |
+| TensorFlow | 226 | 118 |
+
+
+- hidden_size = 512, ms / batch
+
+| batch_size | 256 | 512 |
+|--------------| -------| --------|
+| PaddlePaddle | 189 | 268 |
+| TensorFlow | 297 | 383 |
+
+
+
+
+#### Seq2Seq
+
+The benchmark of sequence-to-sequence network will be added later.
diff --git a/benchmark/caffe/image/alexnet.prototxt b/benchmark/caffe/image/alexnet.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..aca184ddaf2ca2b5e2bea17d131055e0621b8271
--- /dev/null
+++ b/benchmark/caffe/image/alexnet.prototxt
@@ -0,0 +1,347 @@
+name: "alexnet"
+input: "data"
+input_dim: 64
+input_dim: 3
+input_dim: 227
+input_dim: 227
+input: "label"
+input_dim: 64
+input_dim: 1
+input_dim: 1
+input_dim: 1
+force_backward: true
+layer {
+ name: "conv1"
+ type: "Convolution"
+ bottom: "data"
+ top: "conv1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 96
+ kernel_size: 11
+ stride: 4
+ weight_filler {
+ type: "gaussian"
+ std: 0.01
+ }
+ bias_filler {
+ type: "constant"
+ value: 0
+ }
+ }
+}
+layer {
+ name: "relu1"
+ type: "ReLU"
+ bottom: "conv1"
+ top: "conv1"
+}
+layer {
+ name: "norm1"
+ type: "LRN"
+ bottom: "conv1"
+ top: "norm1"
+ lrn_param {
+ local_size: 5
+ alpha: 0.0001
+ beta: 0.75
+ }
+}
+layer {
+ name: "pool1"
+ type: "Pooling"
+ bottom: "norm1"
+ top: "pool1"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "conv2"
+ type: "Convolution"
+ bottom: "pool1"
+ top: "conv2"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 256
+ pad: 2
+ kernel_size: 5
+ group: 1
+ weight_filler {
+ type: "gaussian"
+ std: 0.01
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.1
+ }
+ }
+}
+layer {
+ name: "relu2"
+ type: "ReLU"
+ bottom: "conv2"
+ top: "conv2"
+}
+layer {
+ name: "norm2"
+ type: "LRN"
+ bottom: "conv2"
+ top: "norm2"
+ lrn_param {
+ local_size: 5
+ alpha: 0.0001
+ beta: 0.75
+ }
+}
+layer {
+ name: "pool2"
+ type: "Pooling"
+ bottom: "norm2"
+ top: "pool2"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "conv3"
+ type: "Convolution"
+ bottom: "pool2"
+ top: "conv3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 384
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "gaussian"
+ std: 0.01
+ }
+ bias_filler {
+ type: "constant"
+ value: 0
+ }
+ }
+}
+layer {
+ name: "relu3"
+ type: "ReLU"
+ bottom: "conv3"
+ top: "conv3"
+}
+layer {
+ name: "conv4"
+ type: "Convolution"
+ bottom: "conv3"
+ top: "conv4"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 384
+ pad: 1
+ kernel_size: 3
+ group: 1
+ weight_filler {
+ type: "gaussian"
+ std: 0.01
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.1
+ }
+ }
+}
+layer {
+ name: "relu4"
+ type: "ReLU"
+ bottom: "conv4"
+ top: "conv4"
+}
+layer {
+ name: "conv5"
+ type: "Convolution"
+ bottom: "conv4"
+ top: "conv5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 256
+ pad: 1
+ kernel_size: 3
+ group: 1
+ weight_filler {
+ type: "gaussian"
+ std: 0.01
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.1
+ }
+ }
+}
+layer {
+ name: "relu5"
+ type: "ReLU"
+ bottom: "conv5"
+ top: "conv5"
+}
+layer {
+ name: "pool5"
+ type: "Pooling"
+ bottom: "conv5"
+ top: "pool5"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "fc6"
+ type: "InnerProduct"
+ bottom: "pool5"
+ top: "fc6"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ inner_product_param {
+ num_output: 4096
+ weight_filler {
+ type: "gaussian"
+ std: 0.005
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.1
+ }
+ }
+}
+layer {
+ name: "relu6"
+ type: "ReLU"
+ bottom: "fc6"
+ top: "fc6"
+}
+layer {
+ name: "drop6"
+ type: "Dropout"
+ bottom: "fc6"
+ top: "fc6"
+ dropout_param {
+ dropout_ratio: 0.5
+ }
+}
+layer {
+ name: "fc7"
+ type: "InnerProduct"
+ bottom: "fc6"
+ top: "fc7"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ inner_product_param {
+ num_output: 4096
+ weight_filler {
+ type: "gaussian"
+ std: 0.005
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.1
+ }
+ }
+}
+layer {
+ name: "relu7"
+ type: "ReLU"
+ bottom: "fc7"
+ top: "fc7"
+}
+layer {
+ name: "drop7"
+ type: "Dropout"
+ bottom: "fc7"
+ top: "fc7"
+ dropout_param {
+ dropout_ratio: 0.5
+ }
+}
+layer {
+ name: "fc8"
+ type: "InnerProduct"
+ bottom: "fc7"
+ top: "fc8"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ inner_product_param {
+ num_output: 1000
+ weight_filler {
+ type: "gaussian"
+ std: 0.01
+ }
+ bias_filler {
+ type: "constant"
+ value: 0
+ }
+ }
+}
+layer {
+ name: "loss"
+ type: "SoftmaxWithLoss"
+ bottom: "fc8"
+ bottom: "label"
+ top: "loss"
+}
diff --git a/benchmark/caffe/image/googlenet.prototxt b/benchmark/caffe/image/googlenet.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..c5f3b4fe3efcb6f7397031c086997fa914c67b7f
--- /dev/null
+++ b/benchmark/caffe/image/googlenet.prototxt
@@ -0,0 +1,2334 @@
+name: "googlenet"
+input: "data"
+input_dim: 128
+input_dim: 3
+input_dim: 224
+input_dim: 224
+input: "label"
+input_dim: 128
+input_dim: 1
+input_dim: 1
+input_dim: 1
+layer {
+ name: "conv1/7x7_s2"
+ type: "Convolution"
+ bottom: "data"
+ top: "conv1/7x7_s2"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ pad: 3
+ kernel_size: 7
+ stride: 2
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "conv1/relu_7x7"
+ type: "ReLU"
+ bottom: "conv1/7x7_s2"
+ top: "conv1/7x7_s2"
+}
+layer {
+ name: "pool1/3x3_s2"
+ type: "Pooling"
+ bottom: "conv1/7x7_s2"
+ top: "pool1/3x3_s2"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+#layer {
+# name: "pool1/norm1"
+# type: "LRN"
+# bottom: "pool1/3x3_s2"
+# top: "pool1/norm1"
+# lrn_param {
+# local_size: 5
+# alpha: 0.0001
+# beta: 0.75
+# }
+#}
+layer {
+ name: "conv2/3x3_reduce"
+ type: "Convolution"
+# bottom: "pool1/norm1"
+ bottom: "pool1/3x3_s2"
+ top: "conv2/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "conv2/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "conv2/3x3_reduce"
+ top: "conv2/3x3_reduce"
+}
+layer {
+ name: "conv2/3x3"
+ type: "Convolution"
+ bottom: "conv2/3x3_reduce"
+ top: "conv2/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 192
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "conv2/relu_3x3"
+ type: "ReLU"
+ bottom: "conv2/3x3"
+ top: "conv2/3x3"
+}
+#layer {
+# name: "conv2/norm2"
+# type: "LRN"
+# bottom: "conv2/3x3"
+# top: "conv2/norm2"
+# lrn_param {
+# local_size: 5
+# alpha: 0.0001
+# beta: 0.75
+# }
+#}
+layer {
+ name: "pool2/3x3_s2"
+ type: "Pooling"
+# bottom: "conv2/norm2"
+ bottom: "conv2/3x3"
+ top: "pool2/3x3_s2"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "inception_3a/1x1"
+ type: "Convolution"
+ bottom: "pool2/3x3_s2"
+ top: "inception_3a/1x1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3a/relu_1x1"
+ type: "ReLU"
+ bottom: "inception_3a/1x1"
+ top: "inception_3a/1x1"
+}
+layer {
+ name: "inception_3a/3x3_reduce"
+ type: "Convolution"
+ bottom: "pool2/3x3_s2"
+ top: "inception_3a/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 96
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3a/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "inception_3a/3x3_reduce"
+ top: "inception_3a/3x3_reduce"
+}
+layer {
+ name: "inception_3a/3x3"
+ type: "Convolution"
+ bottom: "inception_3a/3x3_reduce"
+ top: "inception_3a/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3a/relu_3x3"
+ type: "ReLU"
+ bottom: "inception_3a/3x3"
+ top: "inception_3a/3x3"
+}
+layer {
+ name: "inception_3a/5x5_reduce"
+ type: "Convolution"
+ bottom: "pool2/3x3_s2"
+ top: "inception_3a/5x5_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 16
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3a/relu_5x5_reduce"
+ type: "ReLU"
+ bottom: "inception_3a/5x5_reduce"
+ top: "inception_3a/5x5_reduce"
+}
+layer {
+ name: "inception_3a/5x5"
+ type: "Convolution"
+ bottom: "inception_3a/5x5_reduce"
+ top: "inception_3a/5x5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 32
+ pad: 2
+ kernel_size: 5
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3a/relu_5x5"
+ type: "ReLU"
+ bottom: "inception_3a/5x5"
+ top: "inception_3a/5x5"
+}
+layer {
+ name: "inception_3a/pool"
+ type: "Pooling"
+ bottom: "pool2/3x3_s2"
+ top: "inception_3a/pool"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 1
+ pad: 1
+ }
+}
+layer {
+ name: "inception_3a/pool_proj"
+ type: "Convolution"
+ bottom: "inception_3a/pool"
+ top: "inception_3a/pool_proj"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 32
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3a/relu_pool_proj"
+ type: "ReLU"
+ bottom: "inception_3a/pool_proj"
+ top: "inception_3a/pool_proj"
+}
+layer {
+ name: "inception_3a/output"
+ type: "Concat"
+ bottom: "inception_3a/1x1"
+ bottom: "inception_3a/3x3"
+ bottom: "inception_3a/5x5"
+ bottom: "inception_3a/pool_proj"
+ top: "inception_3a/output"
+}
+layer {
+ name: "inception_3b/1x1"
+ type: "Convolution"
+ bottom: "inception_3a/output"
+ top: "inception_3b/1x1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3b/relu_1x1"
+ type: "ReLU"
+ bottom: "inception_3b/1x1"
+ top: "inception_3b/1x1"
+}
+layer {
+ name: "inception_3b/3x3_reduce"
+ type: "Convolution"
+ bottom: "inception_3a/output"
+ top: "inception_3b/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3b/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "inception_3b/3x3_reduce"
+ top: "inception_3b/3x3_reduce"
+}
+layer {
+ name: "inception_3b/3x3"
+ type: "Convolution"
+ bottom: "inception_3b/3x3_reduce"
+ top: "inception_3b/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 192
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3b/relu_3x3"
+ type: "ReLU"
+ bottom: "inception_3b/3x3"
+ top: "inception_3b/3x3"
+}
+layer {
+ name: "inception_3b/5x5_reduce"
+ type: "Convolution"
+ bottom: "inception_3a/output"
+ top: "inception_3b/5x5_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 32
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3b/relu_5x5_reduce"
+ type: "ReLU"
+ bottom: "inception_3b/5x5_reduce"
+ top: "inception_3b/5x5_reduce"
+}
+layer {
+ name: "inception_3b/5x5"
+ type: "Convolution"
+ bottom: "inception_3b/5x5_reduce"
+ top: "inception_3b/5x5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 96
+ pad: 2
+ kernel_size: 5
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3b/relu_5x5"
+ type: "ReLU"
+ bottom: "inception_3b/5x5"
+ top: "inception_3b/5x5"
+}
+layer {
+ name: "inception_3b/pool"
+ type: "Pooling"
+ bottom: "inception_3a/output"
+ top: "inception_3b/pool"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 1
+ pad: 1
+ }
+}
+layer {
+ name: "inception_3b/pool_proj"
+ type: "Convolution"
+ bottom: "inception_3b/pool"
+ top: "inception_3b/pool_proj"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_3b/relu_pool_proj"
+ type: "ReLU"
+ bottom: "inception_3b/pool_proj"
+ top: "inception_3b/pool_proj"
+}
+layer {
+ name: "inception_3b/output"
+ type: "Concat"
+ bottom: "inception_3b/1x1"
+ bottom: "inception_3b/3x3"
+ bottom: "inception_3b/5x5"
+ bottom: "inception_3b/pool_proj"
+ top: "inception_3b/output"
+}
+layer {
+ name: "pool3/3x3_s2"
+ type: "Pooling"
+ bottom: "inception_3b/output"
+ top: "pool3/3x3_s2"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "inception_4a/1x1"
+ type: "Convolution"
+ bottom: "pool3/3x3_s2"
+ top: "inception_4a/1x1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 192
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4a/relu_1x1"
+ type: "ReLU"
+ bottom: "inception_4a/1x1"
+ top: "inception_4a/1x1"
+}
+layer {
+ name: "inception_4a/3x3_reduce"
+ type: "Convolution"
+ bottom: "pool3/3x3_s2"
+ top: "inception_4a/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 96
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4a/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "inception_4a/3x3_reduce"
+ top: "inception_4a/3x3_reduce"
+}
+layer {
+ name: "inception_4a/3x3"
+ type: "Convolution"
+ bottom: "inception_4a/3x3_reduce"
+ top: "inception_4a/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 208
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4a/relu_3x3"
+ type: "ReLU"
+ bottom: "inception_4a/3x3"
+ top: "inception_4a/3x3"
+}
+layer {
+ name: "inception_4a/5x5_reduce"
+ type: "Convolution"
+ bottom: "pool3/3x3_s2"
+ top: "inception_4a/5x5_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 16
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4a/relu_5x5_reduce"
+ type: "ReLU"
+ bottom: "inception_4a/5x5_reduce"
+ top: "inception_4a/5x5_reduce"
+}
+layer {
+ name: "inception_4a/5x5"
+ type: "Convolution"
+ bottom: "inception_4a/5x5_reduce"
+ top: "inception_4a/5x5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 48
+ pad: 2
+ kernel_size: 5
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4a/relu_5x5"
+ type: "ReLU"
+ bottom: "inception_4a/5x5"
+ top: "inception_4a/5x5"
+}
+layer {
+ name: "inception_4a/pool"
+ type: "Pooling"
+ bottom: "pool3/3x3_s2"
+ top: "inception_4a/pool"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 1
+ pad: 1
+ }
+}
+layer {
+ name: "inception_4a/pool_proj"
+ type: "Convolution"
+ bottom: "inception_4a/pool"
+ top: "inception_4a/pool_proj"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4a/relu_pool_proj"
+ type: "ReLU"
+ bottom: "inception_4a/pool_proj"
+ top: "inception_4a/pool_proj"
+}
+layer {
+ name: "inception_4a/output"
+ type: "Concat"
+ bottom: "inception_4a/1x1"
+ bottom: "inception_4a/3x3"
+ bottom: "inception_4a/5x5"
+ bottom: "inception_4a/pool_proj"
+ top: "inception_4a/output"
+}
+#layer {
+# name: "loss1/ave_pool"
+# type: "Pooling"
+# bottom: "inception_4a/output"
+# top: "loss1/ave_pool"
+# pooling_param {
+# pool: AVE
+# kernel_size: 5
+# stride: 3
+# }
+#}
+#layer {
+# name: "loss1/conv"
+# type: "Convolution"
+# bottom: "loss1/ave_pool"
+# top: "loss1/conv"
+# param {
+# lr_mult: 1
+# decay_mult: 1
+# }
+# param {
+# lr_mult: 2
+# decay_mult: 0
+# }
+# convolution_param {
+# num_output: 128
+# kernel_size: 1
+# weight_filler {
+# type: "xavier"
+# }
+# bias_filler {
+# type: "constant"
+# value: 0.2
+# }
+# }
+#}
+#layer {
+# name: "loss1/relu_conv"
+# type: "ReLU"
+# bottom: "loss1/conv"
+# top: "loss1/conv"
+#}
+#layer {
+# name: "loss1/fc"
+# type: "InnerProduct"
+# bottom: "loss1/conv"
+# top: "loss1/fc"
+# param {
+# lr_mult: 1
+# decay_mult: 1
+# }
+# param {
+# lr_mult: 2
+# decay_mult: 0
+# }
+# inner_product_param {
+# num_output: 1024
+# weight_filler {
+# type: "xavier"
+# }
+# bias_filler {
+# type: "constant"
+# value: 0.2
+# }
+# }
+#}
+#layer {
+# name: "loss1/relu_fc"
+# type: "ReLU"
+# bottom: "loss1/fc"
+# top: "loss1/fc"
+#}
+#layer {
+# name: "loss1/drop_fc"
+# type: "Dropout"
+# bottom: "loss1/fc"
+# top: "loss1/fc"
+# dropout_param {
+# dropout_ratio: 0.7
+# }
+#}
+#layer {
+# name: "loss1/classifier"
+# type: "InnerProduct"
+# bottom: "loss1/fc"
+# top: "loss1/classifier"
+# param {
+# lr_mult: 1
+# decay_mult: 1
+# }
+# param {
+# lr_mult: 2
+# decay_mult: 0
+# }
+# inner_product_param {
+# num_output: 1000
+# weight_filler {
+# type: "xavier"
+# }
+# bias_filler {
+# type: "constant"
+# value: 0
+# }
+# }
+#}
+#layer {
+# name: "loss1/loss"
+# type: "SoftmaxWithLoss"
+# bottom: "loss1/classifier"
+# bottom: "label"
+# top: "loss1/loss1"
+# loss_weight: 0.3
+#}
+layer {
+ name: "inception_4b/1x1"
+ type: "Convolution"
+ bottom: "inception_4a/output"
+ top: "inception_4b/1x1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 160
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4b/relu_1x1"
+ type: "ReLU"
+ bottom: "inception_4b/1x1"
+ top: "inception_4b/1x1"
+}
+layer {
+ name: "inception_4b/3x3_reduce"
+ type: "Convolution"
+ bottom: "inception_4a/output"
+ top: "inception_4b/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 112
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4b/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "inception_4b/3x3_reduce"
+ top: "inception_4b/3x3_reduce"
+}
+layer {
+ name: "inception_4b/3x3"
+ type: "Convolution"
+ bottom: "inception_4b/3x3_reduce"
+ top: "inception_4b/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 224
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4b/relu_3x3"
+ type: "ReLU"
+ bottom: "inception_4b/3x3"
+ top: "inception_4b/3x3"
+}
+layer {
+ name: "inception_4b/5x5_reduce"
+ type: "Convolution"
+ bottom: "inception_4a/output"
+ top: "inception_4b/5x5_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 24
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4b/relu_5x5_reduce"
+ type: "ReLU"
+ bottom: "inception_4b/5x5_reduce"
+ top: "inception_4b/5x5_reduce"
+}
+layer {
+ name: "inception_4b/5x5"
+ type: "Convolution"
+ bottom: "inception_4b/5x5_reduce"
+ top: "inception_4b/5x5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ pad: 2
+ kernel_size: 5
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4b/relu_5x5"
+ type: "ReLU"
+ bottom: "inception_4b/5x5"
+ top: "inception_4b/5x5"
+}
+layer {
+ name: "inception_4b/pool"
+ type: "Pooling"
+ bottom: "inception_4a/output"
+ top: "inception_4b/pool"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 1
+ pad: 1
+ }
+}
+layer {
+ name: "inception_4b/pool_proj"
+ type: "Convolution"
+ bottom: "inception_4b/pool"
+ top: "inception_4b/pool_proj"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4b/relu_pool_proj"
+ type: "ReLU"
+ bottom: "inception_4b/pool_proj"
+ top: "inception_4b/pool_proj"
+}
+layer {
+ name: "inception_4b/output"
+ type: "Concat"
+ bottom: "inception_4b/1x1"
+ bottom: "inception_4b/3x3"
+ bottom: "inception_4b/5x5"
+ bottom: "inception_4b/pool_proj"
+ top: "inception_4b/output"
+}
+layer {
+ name: "inception_4c/1x1"
+ type: "Convolution"
+ bottom: "inception_4b/output"
+ top: "inception_4c/1x1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4c/relu_1x1"
+ type: "ReLU"
+ bottom: "inception_4c/1x1"
+ top: "inception_4c/1x1"
+}
+layer {
+ name: "inception_4c/3x3_reduce"
+ type: "Convolution"
+ bottom: "inception_4b/output"
+ top: "inception_4c/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4c/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "inception_4c/3x3_reduce"
+ top: "inception_4c/3x3_reduce"
+}
+layer {
+ name: "inception_4c/3x3"
+ type: "Convolution"
+ bottom: "inception_4c/3x3_reduce"
+ top: "inception_4c/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 256
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4c/relu_3x3"
+ type: "ReLU"
+ bottom: "inception_4c/3x3"
+ top: "inception_4c/3x3"
+}
+layer {
+ name: "inception_4c/5x5_reduce"
+ type: "Convolution"
+ bottom: "inception_4b/output"
+ top: "inception_4c/5x5_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 24
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4c/relu_5x5_reduce"
+ type: "ReLU"
+ bottom: "inception_4c/5x5_reduce"
+ top: "inception_4c/5x5_reduce"
+}
+layer {
+ name: "inception_4c/5x5"
+ type: "Convolution"
+ bottom: "inception_4c/5x5_reduce"
+ top: "inception_4c/5x5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ pad: 2
+ kernel_size: 5
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4c/relu_5x5"
+ type: "ReLU"
+ bottom: "inception_4c/5x5"
+ top: "inception_4c/5x5"
+}
+layer {
+ name: "inception_4c/pool"
+ type: "Pooling"
+ bottom: "inception_4b/output"
+ top: "inception_4c/pool"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 1
+ pad: 1
+ }
+}
+layer {
+ name: "inception_4c/pool_proj"
+ type: "Convolution"
+ bottom: "inception_4c/pool"
+ top: "inception_4c/pool_proj"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4c/relu_pool_proj"
+ type: "ReLU"
+ bottom: "inception_4c/pool_proj"
+ top: "inception_4c/pool_proj"
+}
+layer {
+ name: "inception_4c/output"
+ type: "Concat"
+ bottom: "inception_4c/1x1"
+ bottom: "inception_4c/3x3"
+ bottom: "inception_4c/5x5"
+ bottom: "inception_4c/pool_proj"
+ top: "inception_4c/output"
+}
+layer {
+ name: "inception_4d/1x1"
+ type: "Convolution"
+ bottom: "inception_4c/output"
+ top: "inception_4d/1x1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 112
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4d/relu_1x1"
+ type: "ReLU"
+ bottom: "inception_4d/1x1"
+ top: "inception_4d/1x1"
+}
+layer {
+ name: "inception_4d/3x3_reduce"
+ type: "Convolution"
+ bottom: "inception_4c/output"
+ top: "inception_4d/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 144
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4d/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "inception_4d/3x3_reduce"
+ top: "inception_4d/3x3_reduce"
+}
+layer {
+ name: "inception_4d/3x3"
+ type: "Convolution"
+ bottom: "inception_4d/3x3_reduce"
+ top: "inception_4d/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 288
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4d/relu_3x3"
+ type: "ReLU"
+ bottom: "inception_4d/3x3"
+ top: "inception_4d/3x3"
+}
+layer {
+ name: "inception_4d/5x5_reduce"
+ type: "Convolution"
+ bottom: "inception_4c/output"
+ top: "inception_4d/5x5_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 32
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4d/relu_5x5_reduce"
+ type: "ReLU"
+ bottom: "inception_4d/5x5_reduce"
+ top: "inception_4d/5x5_reduce"
+}
+layer {
+ name: "inception_4d/5x5"
+ type: "Convolution"
+ bottom: "inception_4d/5x5_reduce"
+ top: "inception_4d/5x5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ pad: 2
+ kernel_size: 5
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4d/relu_5x5"
+ type: "ReLU"
+ bottom: "inception_4d/5x5"
+ top: "inception_4d/5x5"
+}
+layer {
+ name: "inception_4d/pool"
+ type: "Pooling"
+ bottom: "inception_4c/output"
+ top: "inception_4d/pool"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 1
+ pad: 1
+ }
+}
+layer {
+ name: "inception_4d/pool_proj"
+ type: "Convolution"
+ bottom: "inception_4d/pool"
+ top: "inception_4d/pool_proj"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 64
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4d/relu_pool_proj"
+ type: "ReLU"
+ bottom: "inception_4d/pool_proj"
+ top: "inception_4d/pool_proj"
+}
+layer {
+ name: "inception_4d/output"
+ type: "Concat"
+ bottom: "inception_4d/1x1"
+ bottom: "inception_4d/3x3"
+ bottom: "inception_4d/5x5"
+ bottom: "inception_4d/pool_proj"
+ top: "inception_4d/output"
+}
+#layer {
+# name: "loss2/ave_pool"
+# type: "Pooling"
+# bottom: "inception_4d/output"
+# top: "loss2/ave_pool"
+# pooling_param {
+# pool: AVE
+# kernel_size: 5
+# stride: 3
+# }
+#}
+#layer {
+# name: "loss2/conv"
+# type: "Convolution"
+# bottom: "loss2/ave_pool"
+# top: "loss2/conv"
+# param {
+# lr_mult: 1
+# decay_mult: 1
+# }
+# param {
+# lr_mult: 2
+# decay_mult: 0
+# }
+# convolution_param {
+# num_output: 128
+# kernel_size: 1
+# weight_filler {
+# type: "xavier"
+# }
+# bias_filler {
+# type: "constant"
+# value: 0.2
+# }
+# }
+#}
+#layer {
+# name: "loss2/relu_conv"
+# type: "ReLU"
+# bottom: "loss2/conv"
+# top: "loss2/conv"
+#}
+#layer {
+# name: "loss2/fc"
+# type: "InnerProduct"
+# bottom: "loss2/conv"
+# top: "loss2/fc"
+# param {
+# lr_mult: 1
+# decay_mult: 1
+# }
+# param {
+# lr_mult: 2
+# decay_mult: 0
+# }
+# inner_product_param {
+# num_output: 1024
+# weight_filler {
+# type: "xavier"
+# }
+# bias_filler {
+# type: "constant"
+# value: 0.2
+# }
+# }
+#}
+#layer {
+# name: "loss2/relu_fc"
+# type: "ReLU"
+# bottom: "loss2/fc"
+# top: "loss2/fc"
+#}
+#layer {
+# name: "loss2/drop_fc"
+# type: "Dropout"
+# bottom: "loss2/fc"
+# top: "loss2/fc"
+# dropout_param {
+# dropout_ratio: 0.7
+# }
+#}
+#layer {
+# name: "loss2/classifier"
+# type: "InnerProduct"
+# bottom: "loss2/fc"
+# top: "loss2/classifier"
+# param {
+# lr_mult: 1
+# decay_mult: 1
+# }
+# param {
+# lr_mult: 2
+# decay_mult: 0
+# }
+# inner_product_param {
+# num_output: 1000
+# weight_filler {
+# type: "xavier"
+# }
+# bias_filler {
+# type: "constant"
+# value: 0
+# }
+# }
+#}
+#layer {
+# name: "loss2/loss"
+# type: "SoftmaxWithLoss"
+# bottom: "loss2/classifier"
+# bottom: "label"
+# top: "loss2/loss1"
+# loss_weight: 0.3
+#}
+layer {
+ name: "inception_4e/1x1"
+ type: "Convolution"
+ bottom: "inception_4d/output"
+ top: "inception_4e/1x1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 256
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4e/relu_1x1"
+ type: "ReLU"
+ bottom: "inception_4e/1x1"
+ top: "inception_4e/1x1"
+}
+layer {
+ name: "inception_4e/3x3_reduce"
+ type: "Convolution"
+ bottom: "inception_4d/output"
+ top: "inception_4e/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 160
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4e/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "inception_4e/3x3_reduce"
+ top: "inception_4e/3x3_reduce"
+}
+layer {
+ name: "inception_4e/3x3"
+ type: "Convolution"
+ bottom: "inception_4e/3x3_reduce"
+ top: "inception_4e/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 320
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4e/relu_3x3"
+ type: "ReLU"
+ bottom: "inception_4e/3x3"
+ top: "inception_4e/3x3"
+}
+layer {
+ name: "inception_4e/5x5_reduce"
+ type: "Convolution"
+ bottom: "inception_4d/output"
+ top: "inception_4e/5x5_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 32
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4e/relu_5x5_reduce"
+ type: "ReLU"
+ bottom: "inception_4e/5x5_reduce"
+ top: "inception_4e/5x5_reduce"
+}
+layer {
+ name: "inception_4e/5x5"
+ type: "Convolution"
+ bottom: "inception_4e/5x5_reduce"
+ top: "inception_4e/5x5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ pad: 2
+ kernel_size: 5
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4e/relu_5x5"
+ type: "ReLU"
+ bottom: "inception_4e/5x5"
+ top: "inception_4e/5x5"
+}
+layer {
+ name: "inception_4e/pool"
+ type: "Pooling"
+ bottom: "inception_4d/output"
+ top: "inception_4e/pool"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 1
+ pad: 1
+ }
+}
+layer {
+ name: "inception_4e/pool_proj"
+ type: "Convolution"
+ bottom: "inception_4e/pool"
+ top: "inception_4e/pool_proj"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_4e/relu_pool_proj"
+ type: "ReLU"
+ bottom: "inception_4e/pool_proj"
+ top: "inception_4e/pool_proj"
+}
+layer {
+ name: "inception_4e/output"
+ type: "Concat"
+ bottom: "inception_4e/1x1"
+ bottom: "inception_4e/3x3"
+ bottom: "inception_4e/5x5"
+ bottom: "inception_4e/pool_proj"
+ top: "inception_4e/output"
+}
+layer {
+ name: "pool4/3x3_s2"
+ type: "Pooling"
+ bottom: "inception_4e/output"
+ top: "pool4/3x3_s2"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "inception_5a/1x1"
+ type: "Convolution"
+ bottom: "pool4/3x3_s2"
+ top: "inception_5a/1x1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 256
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5a/relu_1x1"
+ type: "ReLU"
+ bottom: "inception_5a/1x1"
+ top: "inception_5a/1x1"
+}
+layer {
+ name: "inception_5a/3x3_reduce"
+ type: "Convolution"
+ bottom: "pool4/3x3_s2"
+ top: "inception_5a/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 160
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5a/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "inception_5a/3x3_reduce"
+ top: "inception_5a/3x3_reduce"
+}
+layer {
+ name: "inception_5a/3x3"
+ type: "Convolution"
+ bottom: "inception_5a/3x3_reduce"
+ top: "inception_5a/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 320
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5a/relu_3x3"
+ type: "ReLU"
+ bottom: "inception_5a/3x3"
+ top: "inception_5a/3x3"
+}
+layer {
+ name: "inception_5a/5x5_reduce"
+ type: "Convolution"
+ bottom: "pool4/3x3_s2"
+ top: "inception_5a/5x5_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 32
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5a/relu_5x5_reduce"
+ type: "ReLU"
+ bottom: "inception_5a/5x5_reduce"
+ top: "inception_5a/5x5_reduce"
+}
+layer {
+ name: "inception_5a/5x5"
+ type: "Convolution"
+ bottom: "inception_5a/5x5_reduce"
+ top: "inception_5a/5x5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ pad: 2
+ kernel_size: 5
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5a/relu_5x5"
+ type: "ReLU"
+ bottom: "inception_5a/5x5"
+ top: "inception_5a/5x5"
+}
+layer {
+ name: "inception_5a/pool"
+ type: "Pooling"
+ bottom: "pool4/3x3_s2"
+ top: "inception_5a/pool"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 1
+ pad: 1
+ }
+}
+layer {
+ name: "inception_5a/pool_proj"
+ type: "Convolution"
+ bottom: "inception_5a/pool"
+ top: "inception_5a/pool_proj"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5a/relu_pool_proj"
+ type: "ReLU"
+ bottom: "inception_5a/pool_proj"
+ top: "inception_5a/pool_proj"
+}
+layer {
+ name: "inception_5a/output"
+ type: "Concat"
+ bottom: "inception_5a/1x1"
+ bottom: "inception_5a/3x3"
+ bottom: "inception_5a/5x5"
+ bottom: "inception_5a/pool_proj"
+ top: "inception_5a/output"
+}
+layer {
+ name: "inception_5b/1x1"
+ type: "Convolution"
+ bottom: "inception_5a/output"
+ top: "inception_5b/1x1"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 384
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5b/relu_1x1"
+ type: "ReLU"
+ bottom: "inception_5b/1x1"
+ top: "inception_5b/1x1"
+}
+layer {
+ name: "inception_5b/3x3_reduce"
+ type: "Convolution"
+ bottom: "inception_5a/output"
+ top: "inception_5b/3x3_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 192
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5b/relu_3x3_reduce"
+ type: "ReLU"
+ bottom: "inception_5b/3x3_reduce"
+ top: "inception_5b/3x3_reduce"
+}
+layer {
+ name: "inception_5b/3x3"
+ type: "Convolution"
+ bottom: "inception_5b/3x3_reduce"
+ top: "inception_5b/3x3"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 384
+ pad: 1
+ kernel_size: 3
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5b/relu_3x3"
+ type: "ReLU"
+ bottom: "inception_5b/3x3"
+ top: "inception_5b/3x3"
+}
+layer {
+ name: "inception_5b/5x5_reduce"
+ type: "Convolution"
+ bottom: "inception_5a/output"
+ top: "inception_5b/5x5_reduce"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 48
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5b/relu_5x5_reduce"
+ type: "ReLU"
+ bottom: "inception_5b/5x5_reduce"
+ top: "inception_5b/5x5_reduce"
+}
+layer {
+ name: "inception_5b/5x5"
+ type: "Convolution"
+ bottom: "inception_5b/5x5_reduce"
+ top: "inception_5b/5x5"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ pad: 2
+ kernel_size: 5
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5b/relu_5x5"
+ type: "ReLU"
+ bottom: "inception_5b/5x5"
+ top: "inception_5b/5x5"
+}
+layer {
+ name: "inception_5b/pool"
+ type: "Pooling"
+ bottom: "inception_5a/output"
+ top: "inception_5b/pool"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 1
+ pad: 1
+ }
+}
+layer {
+ name: "inception_5b/pool_proj"
+ type: "Convolution"
+ bottom: "inception_5b/pool"
+ top: "inception_5b/pool_proj"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ convolution_param {
+ num_output: 128
+ kernel_size: 1
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0.2
+ }
+ }
+}
+layer {
+ name: "inception_5b/relu_pool_proj"
+ type: "ReLU"
+ bottom: "inception_5b/pool_proj"
+ top: "inception_5b/pool_proj"
+}
+layer {
+ name: "inception_5b/output"
+ type: "Concat"
+ bottom: "inception_5b/1x1"
+ bottom: "inception_5b/3x3"
+ bottom: "inception_5b/5x5"
+ bottom: "inception_5b/pool_proj"
+ top: "inception_5b/output"
+}
+layer {
+ name: "pool5/7x7_s1"
+ type: "Pooling"
+ bottom: "inception_5b/output"
+ top: "pool5/7x7_s1"
+ pooling_param {
+ pool: AVE
+ kernel_size: 7
+ stride: 1
+ }
+}
+layer {
+ name: "pool5/drop_7x7_s1"
+ type: "Dropout"
+ bottom: "pool5/7x7_s1"
+ top: "pool5/7x7_s1"
+ dropout_param {
+ dropout_ratio: 0.4
+ }
+}
+layer {
+ name: "loss3/classifier"
+ type: "InnerProduct"
+ bottom: "pool5/7x7_s1"
+ top: "loss3/classifier"
+ param {
+ lr_mult: 1
+ decay_mult: 1
+ }
+ param {
+ lr_mult: 2
+ decay_mult: 0
+ }
+ inner_product_param {
+ num_output: 1000
+ weight_filler {
+ type: "xavier"
+ }
+ bias_filler {
+ type: "constant"
+ value: 0
+ }
+ }
+}
+layer {
+ name: "loss3/loss3"
+ type: "SoftmaxWithLoss"
+ bottom: "loss3/classifier"
+ bottom: "label"
+ top: "loss3/loss3"
+ loss_weight: 1
+}
diff --git a/benchmark/caffe/image/run.sh b/benchmark/caffe/image/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..aa9ac20ca5cc1d48a07ce39f7d6c6d70ad4121ab
--- /dev/null
+++ b/benchmark/caffe/image/run.sh
@@ -0,0 +1,30 @@
+set -e
+
+function test() {
+ cfg=$1
+ batch=$2
+ prefix=$3
+ sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
+ sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: $batch/g}" $cfg
+ caffe time --model=$cfg --iterations=50 --gpu 0 > logs/$prefix-1gpu-batch${batch}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+# alexnet
+test alexnet.prototxt 64 alexnet
+test alexnet.prototxt 128 alexnet
+test alexnet.prototxt 256 alexnet
+test alexnet.prototxt 512 alexnet
+
+# googlenet
+test googlenet.prototxt 64 googlenet
+test googlenet.prototxt 128 googlenet
+
+# small net
+test smallnet_mnist_cifar.prototxt 64 smallnet
+test smallnet_mnist_cifar.prototxt 128 smallnet
+test smallnet_mnist_cifar.prototxt 256 smallnet
+test smallnet_mnist_cifar.prototxt 512 smallnet
diff --git a/benchmark/caffe/image/run_multi.sh b/benchmark/caffe/image/run_multi.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9a0a71bc185a421842265ea6d2310429adb86913
--- /dev/null
+++ b/benchmark/caffe/image/run_multi.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -e
+
+function test() {
+ cfg=$1
+ batch=$2
+ prefix=$3
+ batch_per_gpu=`expr ${batch} / 4`
+ sed -i "/input: \"data\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
+ sed -i "/input: \"label\"/{n;s/^input_dim.*/input_dim: ${batch_per_gpu}/g}" $cfg
+ sed -i "1c\net : \"${cfg}\"" solver.prototxt
+ caffe train --solver=solver.prototxt -gpu 0,1,2,3 > logs/${prefix}-4gpu-batch${batch}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+# alexnet
+test alexnet.prototxt 512 alexnet
+test alexnet.prototxt 1024 alexnet
+
+# googlenet
+test googlenet.prototxt 512 googlenet
diff --git a/benchmark/caffe/image/smallnet_mnist_cifar.prototxt b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..3cb0e32bbfb9f785ece6d428356987e5503dd25d
--- /dev/null
+++ b/benchmark/caffe/image/smallnet_mnist_cifar.prototxt
@@ -0,0 +1,198 @@
+name: "mnist/cifar"
+input: "data"
+input_dim: 128
+input_dim: 3
+input_dim: 32
+input_dim: 32
+input: "label"
+input_dim: 128
+input_dim: 1
+input_dim: 1
+input_dim: 1
+layer {
+ name: "conv1"
+ type: "Convolution"
+ bottom: "data"
+ top: "conv1"
+ param {
+ lr_mult: 1
+ }
+ param {
+ lr_mult: 2
+ }
+ convolution_param {
+ num_output: 32
+ pad: 2
+ kernel_size: 5
+ stride: 1
+ weight_filler {
+ type: "gaussian"
+ std: 0.0001
+ }
+ bias_filler {
+ type: "constant"
+ }
+ }
+}
+layer {
+ name: "pool1"
+ type: "Pooling"
+ bottom: "conv1"
+ top: "pool1"
+ pooling_param {
+ pool: MAX
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "relu1"
+ type: "ReLU"
+ bottom: "pool1"
+ top: "pool1"
+}
+layer {
+ name: "conv2"
+ type: "Convolution"
+ bottom: "pool1"
+ top: "conv2"
+ param {
+ lr_mult: 1
+ }
+ param {
+ lr_mult: 2
+ }
+ convolution_param {
+ num_output: 32
+ pad: 2
+ kernel_size: 5
+ stride: 1
+ weight_filler {
+ type: "gaussian"
+ std: 0.01
+ }
+ bias_filler {
+ type: "constant"
+ }
+ }
+}
+layer {
+ name: "relu2"
+ type: "ReLU"
+ bottom: "conv2"
+ top: "conv2"
+}
+layer {
+ name: "pool2"
+ type: "Pooling"
+ bottom: "conv2"
+ top: "pool2"
+ pooling_param {
+ pool: AVE
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "conv3"
+ type: "Convolution"
+ bottom: "pool2"
+ top: "conv3"
+ param {
+ lr_mult: 1
+ }
+ param {
+ lr_mult: 2
+ }
+ convolution_param {
+ num_output: 64
+ pad: 2
+ kernel_size: 5
+ stride: 1
+ weight_filler {
+ type: "gaussian"
+ std: 0.01
+ }
+ bias_filler {
+ type: "constant"
+ }
+ }
+}
+layer {
+ name: "relu3"
+ type: "ReLU"
+ bottom: "conv3"
+ top: "conv3"
+}
+layer {
+ name: "pool3"
+ type: "Pooling"
+ bottom: "conv3"
+ top: "pool3"
+ pooling_param {
+ pool: AVE
+ kernel_size: 3
+ stride: 2
+ }
+}
+layer {
+ name: "ip1"
+ type: "InnerProduct"
+ bottom: "pool3"
+ top: "ip1"
+ param {
+ lr_mult: 1
+ }
+ param {
+ lr_mult: 2
+ }
+ inner_product_param {
+ num_output: 64
+ weight_filler {
+ type: "gaussian"
+ std: 0.1
+ }
+ bias_filler {
+ type: "constant"
+ }
+ }
+}
+layer {
+ name: "ip2"
+ type: "InnerProduct"
+ bottom: "ip1"
+ top: "ip2"
+ param {
+ lr_mult: 1
+ }
+ param {
+ lr_mult: 2
+ }
+ inner_product_param {
+ num_output: 10
+ weight_filler {
+ type: "gaussian"
+ std: 0.1
+ }
+ bias_filler {
+ type: "constant"
+ }
+ }
+}
+layer {
+ name: "accuracy"
+ type: "Accuracy"
+ bottom: "ip2"
+ bottom: "label"
+ top: "accuracy"
+ include {
+ phase: TEST
+ }
+}
+layer {
+ name: "loss"
+ type: "SoftmaxWithLoss"
+ bottom: "ip2"
+ bottom: "label"
+ top: "loss"
+}
diff --git a/benchmark/caffe/image/solver.prototxt b/benchmark/caffe/image/solver.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..61c10284e6027b4cc0b3d4c8fcf949e0a5a22a85
--- /dev/null
+++ b/benchmark/caffe/image/solver.prototxt
@@ -0,0 +1,10 @@
+net: "alexnet.prototxt"
+base_lr: 0.01
+lr_policy: "fixed"
+display: 20
+max_iter: 200
+momentum: 0.9
+weight_decay: 0.0005
+snapshot: 10000
+snapshot_prefix: "models/caffe_alexnet_train"
+solver_mode: GPU
diff --git a/benchmark/figs/alexnet-4gpu.png b/benchmark/figs/alexnet-4gpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..28b95a44508f0ee7ad270c9ccdf8659009406b03
Binary files /dev/null and b/benchmark/figs/alexnet-4gpu.png differ
diff --git a/benchmark/figs/googlenet-4gpu.png b/benchmark/figs/googlenet-4gpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..9b5331f05a3e54cacf949f10b6603bf627a6d106
Binary files /dev/null and b/benchmark/figs/googlenet-4gpu.png differ
diff --git a/benchmark/figs/rnn_lstm_4gpus.png b/benchmark/figs/rnn_lstm_4gpus.png
new file mode 100644
index 0000000000000000000000000000000000000000..973ce2fa5f65e9681c972d4f5bd5776b5c4aa264
Binary files /dev/null and b/benchmark/figs/rnn_lstm_4gpus.png differ
diff --git a/benchmark/figs/rnn_lstm_cls.png b/benchmark/figs/rnn_lstm_cls.png
new file mode 100644
index 0000000000000000000000000000000000000000..26d05cac11aa7ae8cdfbcd8c4401f6547a9404f6
Binary files /dev/null and b/benchmark/figs/rnn_lstm_cls.png differ
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0
--- /dev/null
+++ b/benchmark/paddle/image/alexnet.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+from paddle.trainer_config_helpers import *
+
+height = 227
+width = 227
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 128)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+ "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+ batch_size=batch_size,
+ learning_rate=0.01 / batch_size,
+ learning_method=MomentumOptimizer(0.9),
+ regularization=L2Regularization(0.0005 * batch_size))
+
+# conv1
+net = data_layer('data', size=height * width * 3)
+net = img_conv_layer(
+ input=net,
+ filter_size=11,
+ num_channels=3,
+ num_filters=96,
+ stride=4,
+ padding=1)
+net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
+net = img_pool_layer(input=net, pool_size=3, stride=2)
+
+# conv2
+net = img_conv_layer(
+ input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
+net = img_pool_layer(input=net, pool_size=3, stride=2)
+
+# conv3
+net = img_conv_layer(
+ input=net, filter_size=3, num_filters=384, stride=1, padding=1)
+# conv4
+net = img_conv_layer(
+ input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+
+# conv5
+net = img_conv_layer(
+ input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+net = img_pool_layer(input=net, pool_size=3, stride=2)
+
+net = fc_layer(
+ input=net,
+ size=4096,
+ act=ReluActivation(),
+ layer_attr=ExtraAttr(drop_rate=0.5))
+net = fc_layer(
+ input=net,
+ size=4096,
+ act=ReluActivation(),
+ layer_attr=ExtraAttr(drop_rate=0.5))
+net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
+
+lab = data_layer('label', num_class)
+loss = cross_entropy(input=net, label=lab)
+outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc893bab98c4d2e07c62fbd012d51a0939db4766
--- /dev/null
+++ b/benchmark/paddle/image/googlenet.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 128)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+ "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+ batch_size=batch_size,
+ learning_rate=0.01 / batch_size,
+ learning_method=MomentumOptimizer(0.9),
+ regularization=L2Regularization(0.0005 * batch_size))
+
+def inception2(name, input, channels, \
+ filter1,
+ filter3R, filter3,
+ filter5R, filter5,
+ proj):
+
+ conv1 = name + '_1'
+ conv3r = name + '_3r'
+ conv3 = name + '_3'
+ conv5r = name + '_5r'
+ conv5 = name + '_5'
+ maxpool = name + '_max'
+ convproj = name + '_proj'
+
+ cov1 = img_conv_layer(
+ name=conv1,
+ input=input,
+ filter_size=1,
+ num_channels=channels,
+ num_filters=filter1,
+ stride=1,
+ padding=0)
+
+ cov3r = img_conv_layer(
+ name=conv3r,
+ input=input,
+ filter_size=1,
+ num_channels=channels,
+ num_filters=filter3R,
+ stride=1,
+ padding=0)
+ cov3 = img_conv_layer(
+ name=conv3,
+ input=cov3r,
+ filter_size=3,
+ num_filters=filter3,
+ stride=1,
+ padding=1)
+
+ cov5r = img_conv_layer(
+ name=conv5r,
+ input=input,
+ filter_size=1,
+ num_channels=channels,
+ num_filters=filter5R,
+ stride=1,
+ padding=0)
+ cov5 = img_conv_layer(
+ name=conv5,
+ input=cov5r,
+ filter_size=5,
+ num_filters=filter5,
+ stride=1,
+ padding=2)
+
+ pool1 = img_pool_layer(
+ name=maxpool,
+ input=input,
+ pool_size=3,
+ num_channels=channels,
+ stride=1,
+ padding=1)
+ covprj = img_conv_layer(
+ name=convproj,
+ input=pool1,
+ filter_size=1,
+ num_filters=proj,
+ stride=1,
+ padding=0)
+
+ cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
+ return cat
+
+def inception(name, input, channels, \
+ filter1,
+ filter3R, filter3,
+ filter5R, filter5,
+ proj):
+
+ cov1 = conv_projection(
+ input=input,
+ filter_size=1,
+ num_channels=channels,
+ num_filters=filter1,
+ stride=1,
+ padding=0)
+
+ cov3r = img_conv_layer(
+ name=name + '_3r',
+ input=input,
+ filter_size=1,
+ num_channels=channels,
+ num_filters=filter3R,
+ stride=1,
+ padding=0)
+ cov3 = conv_projection(
+ input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
+
+ cov5r = img_conv_layer(
+ name=name + '_5r',
+ input=input,
+ filter_size=1,
+ num_channels=channels,
+ num_filters=filter5R,
+ stride=1,
+ padding=0)
+ cov5 = conv_projection(
+ input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
+
+ pool1 = img_pool_layer(
+ name=name + '_max',
+ input=input,
+ pool_size=3,
+ num_channels=channels,
+ stride=1,
+ padding=1)
+ covprj = conv_projection(
+ input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
+
+ cat = concat_layer(
+ name=name,
+ input=[cov1, cov3, cov5, covprj],
+ bias_attr=True,
+ act=ReluActivation())
+ return cat
+
+
+lab = data_layer(name="label", size=1000)
+data = data_layer(name="input", size=3 * height * width)
+
+# stage 1
+conv1 = img_conv_layer(
+ name="conv1",
+ input=data,
+ filter_size=7,
+ num_channels=3,
+ num_filters=64,
+ stride=2,
+ padding=3)
+pool1 = img_pool_layer(
+ name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
+
+# stage 2
+conv2_1 = img_conv_layer(
+ name="conv2_1",
+ input=pool1,
+ filter_size=1,
+ num_filters=64,
+ stride=1,
+ padding=0)
+conv2_2 = img_conv_layer(
+ name="conv2_2",
+ input=conv2_1,
+ filter_size=3,
+ num_filters=192,
+ stride=1,
+ padding=1)
+pool2 = img_pool_layer(
+ name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
+
+# stage 3
+ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
+ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
+pool3 = img_pool_layer(
+ name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
+
+# stage 4
+ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
+ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
+ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
+ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
+ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
+pool4 = img_pool_layer(
+ name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
+
+# stage 5
+ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
+ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
+pool5 = img_pool_layer(
+ name="pool5",
+ input=ince5b,
+ num_channels=1024,
+ pool_size=7,
+ stride=7,
+ pool_type=AvgPooling())
+
+# We remove loss1 and loss2 for all system when testing benchmark
+# output 1
+# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
+# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
+# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
+# out1 = fc_layer(name="output1", input=fc_o1, size=1000, act=SoftmaxActivation())
+# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3)
+
+# output 2
+#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
+#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
+#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
+#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
+#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3)
+
+# output 3
+dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
+out3 = fc_layer(
+ name="output3", input=dropout, size=1000, act=SoftmaxActivation())
+loss3 = cross_entropy(name='loss3', input=out3, label=lab)
+
+outputs(loss3)
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ac47212b5a75667e8e9d4465b33f575516e2836
--- /dev/null
+++ b/benchmark/paddle/image/provider.py
@@ -0,0 +1,26 @@
+import io, os
+import random
+import numpy as np
+from paddle.trainer.PyDataProvider2 import *
+
+
+def initHook(settings, height, width, color, num_class, **kwargs):
+ settings.height = height
+ settings.width = width
+ settings.color = color
+ settings.num_class = num_class
+ if settings.color:
+ settings.data_size = settings.height * settings.width * 3
+ else:
+ settings.data_size = settings.height * settings.width
+
+ settings.slots = [dense_vector(settings.data_size), integer_value(1)]
+
+
+@provider(
+ init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_list):
+ for i in xrange(1024):
+ img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
+        lab = random.randint(0, settings.num_class - 1)
+ yield img.astype('float32'), int(lab)
diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..717ed487ba7657db6535efcb1128a355a0f15eaf
--- /dev/null
+++ b/benchmark/paddle/image/run.sh
@@ -0,0 +1,51 @@
+set -e
+
+function train() {
+ cfg=$1
+ thread=$2
+ bz=$3
+ args="batch_size=$3"
+ prefix=$4
+ paddle train --job=time \
+ --config=$cfg \
+ --use_gpu=True \
+ --trainer_count=$thread \
+ --log_period=10 \
+ --test_period=100 \
+ --config_args=$args \
+ > logs/$prefix-${thread}gpu-$bz.log 2>&1
+}
+
+if [ ! -f "train.list" ]; then
+ echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+#========single-gpu=========#
+# alexnet
+train alexnet.py 1 64 alexnet
+train alexnet.py 1 128 alexnet
+train alexnet.py 1 256 alexnet
+train alexnet.py 1 512 alexnet
+
+# googlenet
+train googlenet.py 1 64 googlenet
+train googlenet.py 1 128 googlenet
+train googlenet.py 1 256 googlenet
+
+# smallnet
+train smallnet_mnist_cifar.py 1 64 smallnet
+train smallnet_mnist_cifar.py 1 128 smallnet
+train smallnet_mnist_cifar.py 1 256 smallnet
+train smallnet_mnist_cifar.py 1 512 smallnet
+
+
+############################
+#========multi-gpus=========#
+train alexnet.py 4 512 alexnet
+train alexnet.py 4 1024 alexnet
+
+train googlenet.py 4 512 googlenet
+train googlenet.py 4 1024 googlenet
diff --git a/benchmark/paddle/image/smallnet_mnist_cifar.py b/benchmark/paddle/image/smallnet_mnist_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..58879c454f37991405d83bbb593bb5d1e977ff53
--- /dev/null
+++ b/benchmark/paddle/image/smallnet_mnist_cifar.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+from paddle.trainer_config_helpers import *
+
+height = 32
+width = 32
+num_class = 10
+
+batch_size = get_config_arg('batch_size', int, 128)
+
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+define_py_data_sources2(
+ "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+ batch_size=batch_size,
+ learning_rate=0.01 / batch_size,
+ learning_method=MomentumOptimizer(0.9),
+ regularization=L2Regularization(0.0005 * batch_size))
+
+# conv1
+net = data_layer('data', size=height * width * 3)
+net = img_conv_layer(
+ input=net,
+ filter_size=5,
+ num_channels=3,
+ num_filters=32,
+ stride=1,
+ padding=2)
+net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
+
+# conv2
+net = img_conv_layer(
+ input=net, filter_size=5, num_filters=32, stride=1, padding=2)
+net = img_pool_layer(
+ input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
+
+# conv3
+net = img_conv_layer(
+ input=net, filter_size=3, num_filters=64, stride=1, padding=1)
+net = img_pool_layer(
+ input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
+
+net = fc_layer(input=net, size=64, act=ReluActivation())
+net = fc_layer(input=net, size=10, act=SoftmaxActivation())
+
+lab = data_layer('label', num_class)
+loss = classification_cost(input=net, label=lab)
+outputs(loss)
diff --git a/benchmark/paddle/rnn/imdb.py b/benchmark/paddle/rnn/imdb.py
new file mode 100755
index 0000000000000000000000000000000000000000..fc4ed4025f9ed2e0a32a1709ff8df4af53521196
--- /dev/null
+++ b/benchmark/paddle/rnn/imdb.py
@@ -0,0 +1,46 @@
+from __future__ import print_function
+import six.moves.cPickle as pickle
+import gzip
+import os
+import numpy
+
+
+def get_dataset_file(dataset, default_dataset, origin):
+ data_dir, data_file = os.path.split(dataset)
+ if (not os.path.isfile(dataset)) and data_file == default_dataset:
+ from six.moves import urllib
+ print('Downloading data from %s' % origin)
+ urllib.request.urlretrieve(origin, dataset)
+
+ return dataset
+
+
+def create_data(path="imdb.pkl"):
+
+ if (not os.path.isfile('imdb.train.pkl')):
+ path = get_dataset_file(
+ path, "imdb.pkl",
+ "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
+
+ if path.endswith(".gz"):
+ f = gzip.open(path, 'rb')
+ else:
+ f = open(path, 'rb')
+
+ train_set = pickle.load(f)
+ test_set = pickle.load(f)
+ f.close()
+
+ pickle.dump(train_set, open('imdb.train.pkl', 'wb'))
+ pickle.dump(test_set, open('imdb.test.pkl', 'wb'))
+
+ if (not os.path.isfile('train.list')):
+        open('train.list', 'w').write('imdb.train.pkl\n')
+
+
+def main():
+ create_data('imdb.pkl')
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py
new file mode 100644
index 0000000000000000000000000000000000000000..928ca75daf84ccebb775364b0be0d8b3d5eebff9
--- /dev/null
+++ b/benchmark/paddle/rnn/provider.py
@@ -0,0 +1,72 @@
+import io, os
+import random
+import numpy as np
+import six.moves.cPickle as pickle
+from paddle.trainer.PyDataProvider2 import *
+
+
+def remove_unk(x, n_words):
+ return [[1 if w >= n_words else w for w in sen] for sen in x]
+
+
+# ==============================================================
+# tensorflow uses fixed length, but PaddlePaddle can process
+# variable-length. Padding is used in benchmark in order to
+# compare with other platform.
+# ==============================================================
+def pad_sequences(sequences,
+ maxlen=None,
+ dtype='int32',
+ padding='post',
+ truncating='post',
+ value=0.):
+ lengths = [len(s) for s in sequences]
+
+ nb_samples = len(sequences)
+ if maxlen is None:
+ maxlen = np.max(lengths)
+
+ x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
+ for idx, s in enumerate(sequences):
+ if len(s) == 0:
+ continue # empty list was found
+ if truncating == 'pre':
+ trunc = s[-maxlen:]
+ elif truncating == 'post':
+ trunc = s[:maxlen]
+ else:
+ raise ValueError("Truncating type '%s' not understood" % padding)
+
+ if padding == 'post':
+ x[idx, :len(trunc)] = trunc
+ elif padding == 'pre':
+ x[idx, -len(trunc):] = trunc
+ else:
+ raise ValueError("Padding type '%s' not understood" % padding)
+ return x
+
+
+def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
+ settings.vocab_size = vocab_size
+ settings.pad_seq = pad_seq
+ settings.maxlen = maxlen
+ settings.input_types = [
+ integer_value_sequence(vocab_size), integer_value(2)
+ ]
+
+
+@provider(
+ init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file):
+ f = open(file, 'rb')
+ train_set = pickle.load(f)
+ f.close()
+ x, y = train_set
+
+ # remove unk, namely remove the words out of dictionary
+ x = remove_unk(x, settings.vocab_size)
+ if settings.pad_seq:
+ x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
+
+ for i in range(len(y)):
+ yield map(int, x[i]), int(y[i])
diff --git a/benchmark/paddle/rnn/rnn.py b/benchmark/paddle/rnn/rnn.py
new file mode 100755
index 0000000000000000000000000000000000000000..83eb3e565473f7e7e91cddeaa3cd2aafb7e3df2c
--- /dev/null
+++ b/benchmark/paddle/rnn/rnn.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+from paddle.trainer_config_helpers import *
+import imdb
+
+num_class = 2
+vocab_size = 30000
+fixedlen = 100
+batch_size = get_config_arg('batch_size', int, 128)
+lstm_num = get_config_arg('lstm_num', int, 1)
+hidden_size = get_config_arg('hidden_size', int, 128)
+# whether to pad sequence into fixed length
+pad_seq = get_config_arg('pad_seq', bool, True)
+imdb.create_data('imdb.pkl')
+
+args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
+define_py_data_sources2(
+ "train.list", None, module="provider", obj="process", args=args)
+
+settings(
+ batch_size=batch_size,
+ learning_rate=2e-3,
+ learning_method=AdamOptimizer(),
+ regularization=L2Regularization(8e-4),
+ gradient_clipping_threshold=25)
+
+net = data_layer('data', size=vocab_size)
+net = embedding_layer(input=net, size=128)
+
+for i in xrange(lstm_num):
+ net = simple_lstm(input=net, size=hidden_size)
+
+net = last_seq(input=net)
+net = fc_layer(input=net, size=2, act=SoftmaxActivation())
+
+lab = data_layer('label', num_class)
+loss = classification_cost(input=net, label=lab)
+outputs(loss)
diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e9dfeb2e525979f47e4ef48f7610dc1007900f2c
--- /dev/null
+++ b/benchmark/paddle/rnn/run.sh
@@ -0,0 +1,50 @@
+set -e
+
+function train() {
+ cfg=$1
+ thread=$2
+  args="lstm_num=${3},pad_seq=${4},hidden_size=${5},batch_size=${6}"
+ paddle train --job=time \
+ --config=$cfg \
+ --use_gpu=1 \
+ --trainer_count=$thread \
+ --log_period=10 \
+ --test_period=100 \
+ --num_passes=1 \
+ --feed_data=1 \
+ --config_args=$args \
+ >logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+## padding, single gpu
+#-----config--gpu--lstm_num--padding--hidden_size--batch_size
+## lstm_num=2, batch_size=64
+train rnn.py 1 2 1 256 64
+train rnn.py 1 2 1 512 64
+train rnn.py 1 2 1 1280 64
+
+## lstm_num=2, batch_size=128
+train rnn.py 1 2 1 256 128
+train rnn.py 1 2 1 512 128
+train rnn.py 1 2 1 1280 128
+
+## lstm_num=2, batch_size=256
+train rnn.py 1 2 1 256 256
+train rnn.py 1 2 1 512 256
+train rnn.py 1 2 1 1280 256
+
+
+#==================multi gpus=====================#
+# hidden_size=256, lstm_num=2, different batch size
+train rnn.py 4 2 1 256 128
+train rnn.py 4 2 1 256 256
+train rnn.py 4 2 1 256 512
+
+# hidden_size=512, lstm_num=2, different batch size
+train rnn.py 4 2 1 512 128
+train rnn.py 4 2 1 512 256
+train rnn.py 4 2 1 512 512
diff --git a/benchmark/tensorflow/image/alexnet.py b/benchmark/tensorflow/image/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6a39ef778e21bee7374718a1b1ddf43392825a8
--- /dev/null
+++ b/benchmark/tensorflow/image/alexnet.py
@@ -0,0 +1,298 @@
+from six.moves import xrange # pylint: disable=redefined-builtin
+from datetime import datetime
+import math
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('forward_only', False,
+ """Only run the forward pass.""")
+tf.app.flags.DEFINE_boolean('forward_backward_only', False,
+                            """Only run the forward-backward pass.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+ """The data format for Convnet operations.
+ Can be either NHWC or NCHW.
+ """)
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+ """Whether to log device placement.""")
+
+
+def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
+ with tf.name_scope(name) as scope:
+ kernel = tf.get_variable(
+ name + '_w', [kH, kW, nIn, nOut],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+
+ if wd is not None and wd > 0:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ if FLAGS.data_format == 'NCHW':
+ strides = [1, 1, dH, dW]
+ else:
+ strides = [1, dH, dW, 1]
+ conv = tf.nn.conv2d(
+ inpOp,
+ kernel,
+ strides,
+ padding=padType,
+ data_format=FLAGS.data_format)
+
+ biases = tf.get_variable(
+ name=name + '_b',
+ shape=[nOut],
+ initializer=tf.constant_initializer(
+ value=0.0, dtype=tf.float32),
+ dtype=tf.float32)
+
+ bias = tf.reshape(
+ tf.nn.bias_add(
+ conv, biases, data_format=FLAGS.data_format),
+ conv.get_shape())
+
+ conv1 = tf.nn.relu(bias, name=scope)
+ return conv1
+
+
+def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None):
+ with tf.name_scope(name) as scope:
+ kernel = tf.get_variable(
+ name + '_w', [nIn, nOut],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+
+ if wd is not None and wd > 0:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ biases = tf.get_variable(
+ name + '_b', [nOut],
+ initializer=tf.constant_initializer(
+ value=0.0, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=True)
+
+ affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+ tf.matmul(inpOp, kernel) + biases
+
+ output = tf.nn.dropout(affine1, drop) if drop else affine1
+
+ return output
+
+
+def _mpool(name, inpOp, kH, kW, dH, dW):
+ if FLAGS.data_format == 'NCHW':
+ ksize = [1, 1, kH, kW]
+ strides = [1, 1, dH, dW]
+ else:
+ ksize = [1, kH, kW, 1]
+ strides = [1, dH, dW, 1]
+ return tf.nn.max_pool(
+ inpOp,
+ ksize=ksize,
+ strides=strides,
+ padding='VALID',
+ data_format=FLAGS.data_format,
+ name=name)
+
+
+def _norm(name, l_input, lsize=4):
+ return tf.nn.lrn(l_input,
+ lsize,
+ bias=1.0,
+ alpha=0.001 / 9.0,
+ beta=0.75,
+ name=name)
+
+
+def loss(logits, labels):
+ labels = tf.cast(labels, tf.int64)
+ cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+ logits, labels, name='cross_entropy_per_example')
+ cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+ tf.add_to_collection('losses', cross_entropy_mean)
+
+ # The total loss is defined as the cross entropy loss plus all of the weight
+ # decay terms (L2 loss).
+ return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def get_incoming_shape(incoming):
+ """ Returns the incoming data shape """
+ if isinstance(incoming, tf.Tensor):
+ return incoming.get_shape().as_list()
+ elif type(incoming) in [np.array, list, tuple]:
+ return np.shape(incoming)
+ else:
+ raise Exception("Invalid incoming layer.")
+
+
+def inference(images):
+ conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
+ pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
+ norm1 = _norm('norm1', pool1, lsize=5)
+ conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
+ pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
+ norm2 = _norm('norm2', pool2, lsize=5)
+ conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
+ conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
+ conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
+ pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
+ resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
+ affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096, 0.5)
+ affn2 = _affine('fc7', affn1, 4096, 4096, 0.5)
+ affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
+
+ return affn3
+
+
+def time_tensorflow_run(session, target, info_string):
+ num_steps_burn_in = 10
+ total_duration = 0.0
+ total_duration_squared = 0.0
+ if not isinstance(target, list):
+ target = [target]
+ target_op = tf.group(*target)
+ for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+ start_time = time.time()
+ _ = session.run(target_op)
+ duration = time.time() - start_time
+        if i >= num_steps_burn_in:
+ if not i % 10:
+ print('%s: step %d, duration = %.3f' %
+ (datetime.now(), i - num_steps_burn_in, duration))
+ total_duration += duration
+ total_duration_squared += duration * duration
+ mn = total_duration / FLAGS.num_batches
+ vr = total_duration_squared / FLAGS.num_batches - mn * mn
+ sd = math.sqrt(vr)
+ print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
+ (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+
+
+def _add_loss_summaries(total_loss):
+ """
+ Generates moving average for all losses and associated summaries for
+ visualizing the performance of the network.
+
+ Args:
+ total_loss: Total loss from loss().
+ Returns:
+ loss_averages_op: op for generating moving averages of losses.
+ """
+ # Compute the moving average of all individual losses and the total loss.
+ loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+ losses = tf.get_collection('losses')
+ loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+ # Attach a scalar summary to all individual losses and the total loss; do the
+ # same for the averaged version of the losses.
+ for l in losses + [total_loss]:
+ # Name each loss as '(raw)' and name the moving average version of the loss
+ # as the original loss name.
+ tf.scalar_summary(l.op.name + ' (raw)', l)
+ tf.scalar_summary(l.op.name, loss_averages.average(l))
+
+ return loss_averages_op
+
+
+def run_benchmark():
+ with tf.Graph().as_default():
+ with tf.device('/gpu:0'):
+ # Generate some dummy images.
+ image_size = 224
+ # Note that our padding definition is slightly different the cuda-convnet.
+ # In order to force the model to start with the same activations sizes,
+ # we add 3 to the image_size and employ VALID padding above.
+ if FLAGS.data_format == 'NCHW':
+ image_shape = [
+ FLAGS.batch_size, 3, image_size + 3, image_size + 3
+ ]
+ else:
+ image_shape = [
+ FLAGS.batch_size, image_size + 3, image_size + 3, 3
+ ]
+ images = tf.get_variable(
+ 'image',
+ image_shape,
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.1, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=False)
+
+ labels = tf.get_variable(
+ 'label', [FLAGS.batch_size],
+ initializer=tf.constant_initializer(1),
+ dtype=tf.int32,
+ trainable=False)
+
+ # Build a Graph that computes the logits predictions from the
+ # inference model.
+ last_layer = inference(images)
+
+ objective = loss(last_layer, labels)
+ # Compute the gradient with respect to all the parameters.
+
+ # Compute gradients.
+ # opt = tf.train.GradientDescentOptimizer(0.001)
+ opt = tf.train.MomentumOptimizer(0.001, 0.9)
+ grads = opt.compute_gradients(objective)
+ global_step = tf.get_variable(
+ 'global_step', [],
+ initializer=tf.constant_initializer(
+ 0.0, dtype=tf.float32),
+ trainable=False,
+ dtype=tf.float32)
+ apply_gradient_op = opt.apply_gradients(
+ grads, global_step=global_step)
+
+ # Track the moving averages of all trainable variables.
+ variable_averages = tf.train.ExponentialMovingAverage(0.9,
+ global_step)
+ variables_averages_op = variable_averages.apply(
+ tf.trainable_variables())
+
+ # Build an initialization operation.
+ init = tf.initialize_all_variables()
+
+ # Start running operations on the Graph.
+ sess = tf.Session(config=tf.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=FLAGS.log_device_placement))
+ sess.run(init)
+
+ run_forward = True
+ run_forward_backward = True
+ if FLAGS.forward_only and FLAGS.forward_backward_only:
+ raise ValueError("Cannot specify --forward_only and "
+ "--forward_backward_only at the same time.")
+ if FLAGS.forward_only:
+ run_forward_backward = False
+ elif FLAGS.forward_backward_only:
+ run_forward = False
+
+ if run_forward:
+ time_tensorflow_run(sess, last_layer, "Forward")
+
+ if run_forward_backward:
+ with tf.control_dependencies(
+ [apply_gradient_op, variables_averages_op]):
+ train_op = tf.no_op(name='train')
+ time_tensorflow_run(sess, [train_op, objective],
+ "Forward-backward")
+
+
+def main(_):
+ run_benchmark()
+
+
+if __name__ == '__main__':
+ tf.app.run()
diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b5ee78f4dd5429abd85d75c092a6e3a2a39f922
--- /dev/null
+++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py
@@ -0,0 +1,365 @@
+from six.moves import xrange # pylint: disable=redefined-builtin
+from datetime import datetime
+import math
+import re
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+ """The data format for Convnet operations.
+ Can be either NHWC or NCHW.
+ """)
+
+tf.app.flags.DEFINE_string('train_dir', '/train_model',
+ """Directory where to write event logs """
+ """and checkpoint.""")
+tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+ """Whether to log device placement.""")
+
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
+NUM_EPOCHS_PER_DECAY = 50
+INITIAL_LEARNING_RATE = 0.1
+LEARNING_RATE_DECAY_FACTOR = 0.1
+TOWER_NAME = 'tower'
+
+
+def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
+ with tf.name_scope(name) as scope:
+ kernel = tf.get_variable(
+ name + '_w', [kH, kW, nIn, nOut],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+
+ if wd is not None:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ if FLAGS.data_format == 'NCHW':
+ strides = [1, 1, dH, dW]
+ else:
+ strides = [1, dH, dW, 1]
+ conv = tf.nn.conv2d(
+ inpOp,
+ kernel,
+ strides,
+ padding=padType,
+ data_format=FLAGS.data_format)
+
+ biases = tf.get_variable(
+ name=name + '_b',
+ shape=[nOut],
+ initializer=tf.constant_initializer(
+ value=0.0, dtype=tf.float32),
+ dtype=tf.float32)
+
+ bias = tf.reshape(
+ tf.nn.bias_add(
+ conv, biases, data_format=FLAGS.data_format),
+ conv.get_shape())
+
+ conv1 = tf.nn.relu(bias, name=scope)
+ return conv1
+
+
+def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
+ with tf.name_scope(name) as scope:
+ kernel = tf.get_variable(
+ name + '_w', [nIn, nOut],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+
+ if wd is not None:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ biases = tf.get_variable(
+ name + '_b', [nOut],
+ initializer=tf.constant_initializer(
+ value=0.0, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=True)
+
+ affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+ tf.matmul(inpOp, kernel) + biases
+
+ return affine1
+
+
+def _mpool(name, inpOp, kH, kW, dH, dW):
+ if FLAGS.data_format == 'NCHW':
+ ksize = [1, 1, kH, kW]
+ strides = [1, 1, dH, dW]
+ else:
+ ksize = [1, kH, kW, 1]
+ strides = [1, dH, dW, 1]
+ return tf.nn.max_pool(
+ inpOp,
+ ksize=ksize,
+ strides=strides,
+ padding='VALID',
+ data_format=FLAGS.data_format,
+ name=name)
+
+
+def _norm(name, l_input, lsize=4):
+ return tf.nn.lrn(l_input,
+ lsize,
+ bias=1.0,
+ alpha=0.001 / 9.0,
+ beta=0.75,
+ name=name)
+
+
+def loss(logits, labels):
+ labels = tf.cast(labels, tf.int64)
+ cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+ logits, labels, name='cross_entropy_per_example')
+ cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+ tf.add_to_collection('losses', cross_entropy_mean)
+
+ # The total loss is defined as the cross entropy loss plus all of the weight
+ # decay terms (L2 loss).
+ return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def get_incoming_shape(incoming):
+ """ Returns the incoming data shape """
+ if isinstance(incoming, tf.Tensor):
+ return incoming.get_shape().as_list()
+ elif type(incoming) in [np.array, list, tuple]:
+ return np.shape(incoming)
+ else:
+ raise Exception("Invalid incoming layer.")
+
+
+def inference(images):
+ conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID')
+ pool1 = _mpool('pool1', conv1, 3, 3, 2, 2)
+ norm1 = _norm('norm1', pool1, lsize=5)
+ conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME')
+ pool2 = _mpool('pool2', conv2, 3, 3, 2, 2)
+ norm2 = _norm('norm2', pool2, lsize=5)
+ conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME')
+ conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME')
+ conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME')
+ pool5 = _mpool('pool5', conv5, 3, 3, 2, 2)
+ resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6])
+ affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096)
+ affn2 = _affine('fc7', affn1, 4096, 4096)
+ affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc
+
+ return affn3
+
+
+def tower_loss(scope):
+ """Calculate the total loss on a single tower running the model.
+ Args:
+ scope: unique prefix string identifying the tower, e.g. 'tower_0'
+ Returns:
+ Tensor of shape [] containing the total loss for a batch of data
+ """
+ image_size = 224
+ if FLAGS.data_format == 'NCHW':
+ image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3]
+ else:
+ image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3]
+ images = tf.get_variable(
+ 'image',
+ image_shape,
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.1, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=False)
+
+ labels = tf.get_variable(
+ 'label', [FLAGS.batch_size],
+ initializer=tf.constant_initializer(1),
+ dtype=tf.int32,
+ trainable=False)
+
+ # Build a Graph that computes the logits predictions from the
+ # inference model.
+ last_layer = inference(images)
+
+ # Build the portion of the Graph calculating the losses. Note that we will
+ # assemble the total_loss using a custom function below.
+ _ = loss(last_layer, labels)
+
+ # Assemble all of the losses for the current tower only.
+ losses = tf.get_collection('losses', scope)
+
+ # Calculate the total loss for the current tower.
+ total_loss = tf.add_n(losses, name='total_loss')
+
+ # Compute the moving average of all individual losses and the total loss.
+ loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+ loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+ # Attach a scalar summary to all individual losses and the total loss; do the
+ # same for the averaged version of the losses.
+ for l in losses + [total_loss]:
+ # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
+ # session. This helps the clarity of presentation on tensorboard.
+ loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
+ # Name each loss as '(raw)' and name the moving average version of the loss
+ # as the original loss name.
+ tf.scalar_summary(loss_name + ' (raw)', l)
+ tf.scalar_summary(loss_name, loss_averages.average(l))
+
+ with tf.control_dependencies([loss_averages_op]):
+ total_loss = tf.identity(total_loss)
+ return total_loss
+
+
+def average_gradients(tower_grads):
+ """Calculate the average gradient for each shared variable across all towers.
+ Note that this function provides a synchronization point across all towers.
+ Args:
+ tower_grads: List of lists of (gradient, variable) tuples. The outer list
+ is over individual gradients. The inner list is over the gradient
+ calculation for each tower.
+ Returns:
+ List of pairs of (gradient, variable) where the gradient has been averaged
+ across all towers.
+ """
+ average_grads = []
+ for grad_and_vars in zip(*tower_grads):
+ # Note that each grad_and_vars looks like the following:
+ # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+ grads = []
+ for g, _ in grad_and_vars:
+ # Add 0 dimension to the gradients to represent the tower.
+ expanded_g = tf.expand_dims(g, 0)
+
+ # Append on a 'tower' dimension which we will average over below.
+ grads.append(expanded_g)
+
+ # Average over the 'tower' dimension.
+ grad = tf.concat(0, grads)
+ grad = tf.reduce_mean(grad, 0)
+
+ # Keep in mind that the Variables are redundant because they are shared
+ # across towers. So .. we will just return the first tower's pointer to
+ # the Variable.
+ v = grad_and_vars[0][1]
+ grad_and_var = (grad, v)
+ average_grads.append(grad_and_var)
+ return average_grads
+
+
+def time_tensorflow_run(session, target):
+ num_steps_burn_in = 50
+ total_duration = 0.0
+ total_duration_squared = 0.0
+ for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+ start_time = time.time()
+ _, loss_value = session.run(target)
+ duration = time.time() - start_time
+ if i > num_steps_burn_in:
+ if not i % 10:
+ num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
+ examples_per_sec = num_examples_per_step / duration
+ sec_per_batch = duration
+
+ format_str = (
+ '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
+ 'sec/batch batch_size = %d)')
+ print(format_str %
+ (datetime.now(), i - num_steps_burn_in, loss_value,
+ examples_per_sec, sec_per_batch, num_examples_per_step))
+
+ total_duration += duration
+ total_duration_squared += duration * duration
+
+ mn = total_duration / FLAGS.num_batches
+ vr = total_duration_squared / FLAGS.num_batches - mn * mn
+ sd = math.sqrt(vr)
+ print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
+ (datetime.now(), FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+ with tf.Graph().as_default(), tf.device('/cpu:0'):
+ # Create a variable to count the number of train() calls. This equals the
+ # number of batches processed * FLAGS.num_gpus.
+ global_step = tf.get_variable(
+ 'global_step', [],
+ initializer=tf.constant_initializer(0),
+ trainable=False)
+
+ # Calculate the learning rate schedule.
+ num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
+ FLAGS.batch_size)
+ decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
+
+ # Decay the learning rate exponentially based on the number of steps.
+ lr = tf.train.exponential_decay(
+ INITIAL_LEARNING_RATE,
+ global_step,
+ decay_steps,
+ LEARNING_RATE_DECAY_FACTOR,
+ staircase=True)
+
+ # Create an optimizer that performs gradient descent.
+ opt = tf.train.MomentumOptimizer(lr, 0.9)
+
+ # Calculate the gradients for each model tower.
+ tower_grads = []
+ for i in xrange(FLAGS.num_gpus):
+ with tf.device('/gpu:%d' % i):
+ with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
+ # Calculate the loss for one tower of the model. This function
+ # constructs the entire model but shares the variables across
+ # all towers.
+ loss = tower_loss(scope)
+
+ # Reuse variables for the next tower.
+ tf.get_variable_scope().reuse_variables()
+
+ # Retain the summaries from the final tower.
+ summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
+
+ # Calculate the gradients for the batch of data on this tower.
+ grads = opt.compute_gradients(loss)
+
+ # Keep track of the gradients across all towers.
+ tower_grads.append(grads)
+
+ # We must calculate the mean of each gradient. Note that this is the
+ # synchronization point across all towers.
+ grads = average_gradients(tower_grads)
+
+ # Apply the gradients to adjust the shared variables.
+ apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+ # Group all updates to into a single train op.
+ train_op = tf.group(apply_gradient_op)
+
+ # Build an initialization operation.
+ init = tf.initialize_all_variables()
+
+ # Start running operations on the Graph. allow_soft_placement must be set to
+ # True to build towers on GPU, as some of the ops do not have GPU
+ # implementations.
+ sess = tf.Session(config=tf.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=FLAGS.log_device_placement))
+ sess.run(init)
+ time_tensorflow_run(sess, [train_op, loss])
+
+
+def main(_):
+ run_benchmark()
+
+
+if __name__ == '__main__':
+ tf.app.run()
diff --git a/benchmark/tensorflow/image/googlenet.py b/benchmark/tensorflow/image/googlenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..decf855b54451efba5f6a7868fbcf631789f3572
--- /dev/null
+++ b/benchmark/tensorflow/image/googlenet.py
@@ -0,0 +1,311 @@
+from six.moves import xrange
+from datetime import datetime
+import math
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('forward_only', False,
+ """Only run the forward pass.""")
+tf.app.flags.DEFINE_boolean('forward_backward_only', False,
+ """Only run the forward-forward pass.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+ """The data format for Convnet operations.
+ Can be either NHWC or NCHW.
+ """)
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+ """Whether to log device placement.""")
+
+parameters = []
+
+conv_counter = 1
+pool_counter = 1
+affine_counter = 1
+
+
+def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005):
+ global conv_counter
+ global parameters
+ name = 'conv' + str(conv_counter)
+ conv_counter += 1
+ with tf.name_scope(name) as scope:
+ kernel = tf.Variable(
+ tf.truncated_normal(
+ [kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
+ name='weights')
+
+ if wd is not None and wd > 0:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ if FLAGS.data_format == 'NCHW':
+ strides = [1, 1, dH, dW]
+ else:
+ strides = [1, dH, dW, 1]
+ conv = tf.nn.conv2d(
+ inpOp,
+ kernel,
+ strides,
+ padding=padType,
+ data_format=FLAGS.data_format)
+ biases = tf.Variable(
+ tf.constant(
+ 0.0, shape=[nOut], dtype=tf.float32),
+ trainable=True,
+ name='biases')
+ bias = tf.reshape(
+ tf.nn.bias_add(
+ conv, biases, data_format=FLAGS.data_format),
+ conv.get_shape())
+ conv1 = tf.nn.relu(bias, name=scope)
+ parameters += [kernel, biases]
+ return conv1
+
+
+def _affine(inpOp, nIn, nOut, act=True, wd=0.0005):
+ global affine_counter
+ global parameters
+ name = 'affine' + str(affine_counter)
+ affine_counter += 1
+ with tf.name_scope(name) as scope:
+ kernel = tf.Variable(
+ tf.truncated_normal(
+ [nIn, nOut], dtype=tf.float32, stddev=1e-1),
+ name='weights')
+
+ if wd is not None and wd > 0:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ biases = tf.Variable(
+ tf.constant(
+ 0.0, shape=[nOut], dtype=tf.float32),
+ trainable=True,
+ name='biases')
+ affine1 = tf.nn.relu_layer(
+ inpOp, kernel, biases,
+ name=name) if act else tf.matmul(inpOp, kernel) + biases
+ parameters += [kernel, biases]
+ return affine1
+
+
+def _mpool(inpOp, kH, kW, dH, dW, padding):
+ global pool_counter
+ global parameters
+ name = 'pool' + str(pool_counter)
+ pool_counter += 1
+ if FLAGS.data_format == 'NCHW':
+ ksize = [1, 1, kH, kW]
+ strides = [1, 1, dH, dW]
+ else:
+ ksize = [1, kH, kW, 1]
+ strides = [1, dH, dW, 1]
+ return tf.nn.max_pool(
+ inpOp,
+ ksize=ksize,
+ strides=strides,
+ padding=padding,
+ data_format=FLAGS.data_format,
+ name=name)
+
+
+def _apool(inpOp, kH, kW, dH, dW, padding):
+ global pool_counter
+ global parameters
+ name = 'pool' + str(pool_counter)
+ pool_counter += 1
+ if FLAGS.data_format == 'NCHW':
+ ksize = [1, 1, kH, kW]
+ strides = [1, 1, dH, dW]
+ else:
+ ksize = [1, kH, kW, 1]
+ strides = [1, dH, dW, 1]
+ return tf.nn.avg_pool(
+ inpOp,
+ ksize=ksize,
+ strides=strides,
+ padding=padding,
+ data_format=FLAGS.data_format,
+ name=name)
+
+
+def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
+ conv1 = _conv(inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
+
+ conv3_ = _conv(inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
+ conv3 = _conv(conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
+
+ conv5_ = _conv(inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
+ conv5 = _conv(conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
+
+ pool_ = _mpool(inp, o4s1, o4s1, 1, 1, 'SAME')
+ pool = _conv(pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
+
+ if FLAGS.data_format == 'NCHW':
+ channel_dim = 1
+ else:
+ channel_dim = 3
+ incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
+ return incept
+
+
+def loss(logits, labels):
+ batch_size = tf.size(labels)
+ labels = tf.expand_dims(labels, 1)
+ indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
+ concated = tf.concat(1, [indices, labels])
+ onehot_labels = tf.sparse_to_dense(concated,
+ tf.pack([batch_size, 1000]), 1.0, 0.0)
+ cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+ logits, onehot_labels, name='xentropy')
+ loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
+ return loss
+
+
+def inference(images):
+ # stage 1
+ conv1 = _conv(images, 3, 64, 7, 7, 2, 2, 'SAME')
+ pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
+ # stage 2
+ conv2 = _conv(pool1, 64, 64, 1, 1, 1, 1, 'VALID')
+ conv3 = _conv(conv2, 64, 192, 3, 3, 1, 1, 'SAME')
+ pool3 = _mpool(conv3, 3, 3, 2, 2, 'SAME')
+
+ # stage 3
+ incept3a = _inception(pool3, 192, 64, 96, 128, 16, 32, 3, 32)
+ incept3b = _inception(incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
+ pool4 = _mpool(incept3b, 3, 3, 2, 2, 'SAME')
+
+ # stage 4
+ incept4a = _inception(pool4, 480, 192, 96, 208, 16, 48, 3, 64)
+ incept4b = _inception(incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
+ incept4c = _inception(incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
+ incept4d = _inception(incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
+ incept4e = _inception(incept4d, 528, 256, 160, 320, 32, 128, 3, 128)
+ pool5 = _mpool(incept4e, 3, 3, 2, 2, 'SAME')
+
+ # stage 5
+ incept5a = _inception(pool5, 832, 256, 160, 320, 32, 128, 3, 128)
+ incept5b = _inception(incept5a, 832, 384, 192, 384, 48, 128, 3, 128)
+ pool6 = _apool(incept5b, 7, 7, 1, 1, 'VALID')
+
+ # output 1
+ resh1 = tf.reshape(pool6, [-1, 1024])
+ drop = tf.nn.dropout(resh1, 0.4)
+ affn1 = _affine(drop, 1024, 1000, act=False)
+
+ return affn1
+
+
+def time_tensorflow_run(session, target, info_string):
+ num_steps_burn_in = 10
+ total_duration = 0.0
+ total_duration_squared = 0.0
+ if not isinstance(target, list):
+ target = [target]
+ target_op = tf.group(*target)
+ for i in range(FLAGS.num_batches + num_steps_burn_in):
+ start_time = time.time()
+ _ = session.run(target_op)
+ duration = time.time() - start_time
+ if i > num_steps_burn_in:
+ if not i % 10:
+ print('%s: step %d, duration = %.3f' %
+ (datetime.now(), i - num_steps_burn_in, duration))
+ total_duration += duration
+ total_duration_squared += duration * duration
+ mn = total_duration / FLAGS.num_batches
+ vr = total_duration_squared / FLAGS.num_batches - mn * mn
+ sd = math.sqrt(vr)
+ print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
+ (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+ global parameters
+ with tf.Graph().as_default():
+ # Generate some dummy images.
+ image_size = 224
+ if FLAGS.data_format == 'NCHW':
+ image_shape = [FLAGS.batch_size, 3, image_size, image_size]
+ else:
+ image_shape = [FLAGS.batch_size, image_size, image_size, 3]
+
+ images = tf.get_variable(
+ 'image',
+ image_shape,
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.1, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=False)
+
+ labels = tf.get_variable(
+ 'label', [FLAGS.batch_size],
+ initializer=tf.constant_initializer(1),
+ dtype=tf.int32,
+ trainable=False)
+
+ # Build a Graph that computes the logits predictions from the
+ # inference model.
+ last_layer = inference(images)
+
+ objective = loss(last_layer, labels)
+
+ # Compute gradients.
+ # opt = tf.train.GradientDescentOptimizer(0.001)
+ opt = tf.train.MomentumOptimizer(0.001, 0.9)
+ grads = opt.compute_gradients(objective)
+ global_step = tf.get_variable(
+ 'global_step', [],
+ initializer=tf.constant_initializer(
+ 0.0, dtype=tf.float32),
+ trainable=False,
+ dtype=tf.float32)
+ apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+ # Track the moving averages of all trainable variables.
+ variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
+ variables_averages_op = variable_averages.apply(tf.trainable_variables(
+ ))
+
+ # Build an initialization operation.
+ init = tf.initialize_all_variables()
+
+ # Start running operations on the Graph.
+ sess = tf.Session(config=tf.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=FLAGS.log_device_placement))
+ sess.run(init)
+
+ run_forward = True
+ run_forward_backward = True
+ if FLAGS.forward_only and FLAGS.forward_backward_only:
+ raise ValueError("Cannot specify --forward_only and "
+ "--forward_backward_only at the same time.")
+ if FLAGS.forward_only:
+ run_forward_backward = False
+ elif FLAGS.forward_backward_only:
+ run_forward = False
+
+ if run_forward:
+ # Run the forward benchmark.
+ time_tensorflow_run(sess, last_layer, "Forward")
+
+ if run_forward_backward:
+ with tf.control_dependencies(
+ [apply_gradient_op, variables_averages_op]):
+ train_op = tf.no_op(name='train')
+ time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
+
+
+def main(_):
+ run_benchmark()
+
+
+if __name__ == '__main__':
+ tf.app.run()
diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..31466faa37c47c66e4fe4628e28c867875e89f2e
--- /dev/null
+++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py
@@ -0,0 +1,411 @@
+from six.moves import xrange # pylint: disable=redefined-builtin
+from datetime import datetime
+import math
+import re
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+ """The data format for Convnet operations.
+ Can be either NHWC or NCHW.
+ """)
+
+tf.app.flags.DEFINE_string('train_dir', '/train_model',
+ """Directory where to write event logs """
+ """and checkpoint.""")
+tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+ """Whether to log device placement.""")
+
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
+NUM_EPOCHS_PER_DECAY = 50
+INITIAL_LEARNING_RATE = 0.1
+LEARNING_RATE_DECAY_FACTOR = 0.1
+TOWER_NAME = 'tower'
+
+
+def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005):
+ with tf.name_scope(name) as scope:
+ kernel = tf.get_variable(
+ name + '_w', [kH, kW, nIn, nOut],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+
+ if wd is not None:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ if FLAGS.data_format == 'NCHW':
+ strides = [1, 1, dH, dW]
+ else:
+ strides = [1, dH, dW, 1]
+ conv = tf.nn.conv2d(
+ inpOp,
+ kernel,
+ strides,
+ padding=padType,
+ data_format=FLAGS.data_format)
+
+ biases = tf.get_variable(
+ name=name + '_b',
+ shape=[nOut],
+ initializer=tf.constant_initializer(
+ value=0.0, dtype=tf.float32),
+ dtype=tf.float32)
+
+ bias = tf.reshape(
+ tf.nn.bias_add(
+ conv, biases, data_format=FLAGS.data_format),
+ conv.get_shape())
+
+ conv1 = tf.nn.relu(bias, name=scope)
+ return conv1
+
+
+def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True):
+ with tf.name_scope(name) as scope:
+ kernel = tf.get_variable(
+ name + '_w', [nIn, nOut],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+
+ if wd is not None:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ biases = tf.get_variable(
+ name + '_b', [nOut],
+ initializer=tf.constant_initializer(
+ value=0.0, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=True)
+
+ affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+ tf.matmul(inpOp, kernel) + biases
+
+ return affine1
+
+
+def _mpool(name, inpOp, kH, kW, dH, dW, padding):
+ if FLAGS.data_format == 'NCHW':
+ ksize = [1, 1, kH, kW]
+ strides = [1, 1, dH, dW]
+ else:
+ ksize = [1, kH, kW, 1]
+ strides = [1, dH, dW, 1]
+ return tf.nn.max_pool(
+ inpOp,
+ ksize=ksize,
+ strides=strides,
+ padding=padding,
+ data_format=FLAGS.data_format,
+ name=name)
+
+
+def _apool(name, inpOp, kH, kW, dH, dW, padding):
+ if FLAGS.data_format == 'NCHW':
+ ksize = [1, 1, kH, kW]
+ strides = [1, 1, dH, dW]
+ else:
+ ksize = [1, kH, kW, 1]
+ strides = [1, dH, dW, 1]
+ return tf.nn.avg_pool(
+ inpOp,
+ ksize=ksize,
+ strides=strides,
+ padding=padding,
+ data_format=FLAGS.data_format,
+ name=name)
+
+
+def loss(logits, labels):
+ labels = tf.cast(labels, tf.int64)
+ cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+ logits, labels, name='cross_entropy_per_example')
+ cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+ tf.add_to_collection('losses', cross_entropy_mean)
+
+ # The total loss is defined as the cross entropy loss plus all of the weight
+ # decay terms (L2 loss).
+ return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def get_incoming_shape(incoming):
+ """ Returns the incoming data shape """
+ if isinstance(incoming, tf.Tensor):
+ return incoming.get_shape().as_list()
+ elif type(incoming) in [np.array, list, tuple]:
+ return np.shape(incoming)
+ else:
+ raise Exception("Invalid incoming layer.")
+
+
+def _inception(name, inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2):
+ conv1 = _conv(name + '_1', inp, inSize, o1s, 1, 1, 1, 1, 'VALID')
+
+ conv3_ = _conv(name + '_3r', inp, inSize, o2s1, 1, 1, 1, 1, 'VALID')
+ conv3 = _conv(name + '_3', conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME')
+
+ conv5_ = _conv(name + '_5r', inp, inSize, o3s1, 1, 1, 1, 1, 'VALID')
+ conv5 = _conv(name + '_5', conv5_, o3s1, o3s2, 5, 5, 1, 1, 'SAME')
+
+ pool_ = _mpool(name + 'pool', inp, o4s1, o4s1, 1, 1, 'SAME')
+ pool = _conv(name + 'proj', pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID')
+
+ if FLAGS.data_format == 'NCHW':
+ channel_dim = 1
+ else:
+ channel_dim = 3
+ incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool])
+ return incept
+
+
+def inference(images):
+ # stage 1
+ conv1 = _conv('conv1', images, 3, 64, 7, 7, 2, 2, 'SAME')
+ pool1 = _mpool('pool1', conv1, 3, 3, 2, 2, 'SAME')
+
+ # stage 2
+ conv2 = _conv('conv2', pool1, 64, 64, 1, 1, 1, 1, 'VALID')
+ conv3 = _conv('conv3', conv2, 64, 192, 3, 3, 1, 1, 'SAME')
+ pool3 = _mpool('pool3', conv3, 3, 3, 2, 2, 'SAME')
+
+ # stage 3
+ incept3a = _inception('ince3a', pool3, 192, 64, 96, 128, 16, 32, 3, 32)
+ incept3b = _inception('ince3b', incept3a, 256, 128, 128, 192, 32, 96, 3, 64)
+ pool4 = _mpool('pool4', incept3b, 3, 3, 2, 2, 'SAME')
+
+ # stage 4
+ incept4a = _inception('ince4a', pool4, 480, 192, 96, 208, 16, 48, 3, 64)
+ incept4b = _inception('ince4b', incept4a, 512, 160, 112, 224, 24, 64, 3, 64)
+ incept4c = _inception('ince4c', incept4b, 512, 128, 128, 256, 24, 64, 3, 64)
+ incept4d = _inception('ince4d', incept4c, 512, 112, 144, 288, 32, 64, 3, 64)
+ incept4e = _inception('ince4e', incept4d, 528, 256, 160, 320, 32, 128, 3,
+ 128)
+ pool5 = _mpool('pool5', incept4e, 3, 3, 2, 2, 'SAME')
+
+ # stage 5
+ incept5a = _inception('ince5a', pool5, 832, 256, 160, 320, 32, 128, 3, 128)
+ incept5b = _inception('ince5b', incept5a, 832, 384, 192, 384, 48, 128, 3,
+ 128)
+ pool6 = _apool('pool6', incept5b, 7, 7, 1, 1, 'VALID')
+
+ # output 1
+ resh1 = tf.reshape(pool6, [-1, 1024])
+ drop = tf.nn.dropout(resh1, 0.4)
+ affn1 = _affine('fc_out', drop, 1024, 1000, act=False)
+
+ return affn1
+
+
+def tower_loss(scope):
+ """Calculate the total loss on a single tower running the model.
+ Args:
+ scope: unique prefix string identifying the tower, e.g. 'tower_0'
+ Returns:
+ Tensor of shape [] containing the total loss for a batch of data
+ """
+ image_size = 224
+ if FLAGS.data_format == 'NCHW':
+ image_shape = [FLAGS.batch_size, 3, image_size, image_size]
+ else:
+ image_shape = [FLAGS.batch_size, image_size, image_size, 3]
+ images = tf.get_variable(
+ 'image',
+ image_shape,
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.1, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=False)
+
+ labels = tf.get_variable(
+ 'label', [FLAGS.batch_size],
+ initializer=tf.constant_initializer(1),
+ dtype=tf.int32,
+ trainable=False)
+
+ # Build a Graph that computes the logits predictions from the
+ # inference model.
+ last_layer = inference(images)
+
+ # Build the portion of the Graph calculating the losses. Note that we will
+ # assemble the total_loss using a custom function below.
+ _ = loss(last_layer, labels)
+
+ # Assemble all of the losses for the current tower only.
+ losses = tf.get_collection('losses', scope)
+
+ # Calculate the total loss for the current tower.
+ total_loss = tf.add_n(losses, name='total_loss')
+
+ # Compute the moving average of all individual losses and the total loss.
+ loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+ loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+ # Attach a scalar summary to all individual losses and the total loss; do the
+ # same for the averaged version of the losses.
+ for l in losses + [total_loss]:
+ # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
+ # session. This helps the clarity of presentation on tensorboard.
+ loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
+ # Name each loss as '(raw)' and name the moving average version of the loss
+ # as the original loss name.
+ tf.scalar_summary(loss_name + ' (raw)', l)
+ tf.scalar_summary(loss_name, loss_averages.average(l))
+
+ with tf.control_dependencies([loss_averages_op]):
+ total_loss = tf.identity(total_loss)
+ return total_loss
+
+
+def average_gradients(tower_grads):
+ """Calculate the average gradient for each shared variable across all towers.
+ Note that this function provides a synchronization point across all towers.
+ Args:
+ tower_grads: List of lists of (gradient, variable) tuples. The outer list
+ is over individual gradients. The inner list is over the gradient
+ calculation for each tower.
+ Returns:
+ List of pairs of (gradient, variable) where the gradient has been averaged
+ across all towers.
+ """
+ average_grads = []
+ for grad_and_vars in zip(*tower_grads):
+ # Note that each grad_and_vars looks like the following:
+ # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+ grads = []
+ for g, _ in grad_and_vars:
+ # Add 0 dimension to the gradients to represent the tower.
+ expanded_g = tf.expand_dims(g, 0)
+
+ # Append on a 'tower' dimension which we will average over below.
+ grads.append(expanded_g)
+
+ # Average over the 'tower' dimension.
+ grad = tf.concat(0, grads)
+ grad = tf.reduce_mean(grad, 0)
+
+ # Keep in mind that the Variables are redundant because they are shared
+ # across towers. So .. we will just return the first tower's pointer to
+ # the Variable.
+ v = grad_and_vars[0][1]
+ grad_and_var = (grad, v)
+ average_grads.append(grad_and_var)
+ return average_grads
+
+
+def time_tensorflow_run(session, target):
+ num_steps_burn_in = 50
+ total_duration = 0.0
+ total_duration_squared = 0.0
+ for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+ start_time = time.time()
+ _, loss_value = session.run(target)
+ duration = time.time() - start_time
+ if i > num_steps_burn_in:
+ if not i % 10:
+ num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
+ examples_per_sec = num_examples_per_step / duration
+ sec_per_batch = duration
+
+ format_str = (
+ '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
+ 'sec/batch batch_size = %d)')
+ print(format_str %
+ (datetime.now(), i - num_steps_burn_in, loss_value,
+ examples_per_sec, sec_per_batch, num_examples_per_step))
+
+ total_duration += duration
+ total_duration_squared += duration * duration
+
+ mn = total_duration / FLAGS.num_batches
+ vr = total_duration_squared / FLAGS.num_batches - mn * mn
+ sd = math.sqrt(vr)
+ print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
+ (datetime.now(), FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+ with tf.Graph().as_default(), tf.device('/cpu:0'):
+ # Create a variable to count the number of train() calls. This equals the
+ # number of batches processed * FLAGS.num_gpus.
+ global_step = tf.get_variable(
+ 'global_step', [],
+ initializer=tf.constant_initializer(0),
+ trainable=False)
+
+ # Calculate the learning rate schedule.
+ num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
+ FLAGS.batch_size)
+ decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
+
+ # Decay the learning rate exponentially based on the number of steps.
+ lr = tf.train.exponential_decay(
+ INITIAL_LEARNING_RATE,
+ global_step,
+ decay_steps,
+ LEARNING_RATE_DECAY_FACTOR,
+ staircase=True)
+
+ # Create an optimizer that performs gradient descent.
+ opt = tf.train.MomentumOptimizer(lr, 0.9)
+
+ # Calculate the gradients for each model tower.
+ tower_grads = []
+ for i in xrange(FLAGS.num_gpus):
+ with tf.device('/gpu:%d' % i):
+ with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
+ # Calculate the loss for one tower of the model. This function
+ # constructs the entire model but shares the variables across
+ # all towers.
+ loss = tower_loss(scope)
+
+ # Reuse variables for the next tower.
+ tf.get_variable_scope().reuse_variables()
+
+ # Retain the summaries from the final tower.
+ summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
+
+ # Calculate the gradients for the batch of data on this tower.
+ grads = opt.compute_gradients(loss)
+
+ # Keep track of the gradients across all towers.
+ tower_grads.append(grads)
+
+ # We must calculate the mean of each gradient. Note that this is the
+ # synchronization point across all towers.
+ grads = average_gradients(tower_grads)
+
+ # Apply the gradients to adjust the shared variables.
+ apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+ # Group all updates to into a single train op.
+ train_op = tf.group(apply_gradient_op)
+
+ # Build an initialization operation.
+ init = tf.initialize_all_variables()
+
+ # Start running operations on the Graph. allow_soft_placement must be set to
+ # True to build towers on GPU, as some of the ops do not have GPU
+ # implementations.
+ sess = tf.Session(config=tf.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=FLAGS.log_device_placement))
+ sess.run(init)
+ time_tensorflow_run(sess, [train_op, loss])
+
+
+def main(_):
+ run_benchmark()
+
+
+if __name__ == '__main__':
+ tf.app.run()
diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..eade36beb9df5f8d3978939216e058203e024c1a
--- /dev/null
+++ b/benchmark/tensorflow/image/run.sh
@@ -0,0 +1,28 @@
+set -e
+
+function test() {
+ cfg=$1
+ batch_size=$2
+ prefix=$3
+ python $cfg --batch_size=$batch_size > logs/${prefix}-1gpu-${batch_size}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+# alexnet
+test alexnet.py 64 alexnet
+test alexnet.py 128 alexnet
+test alexnet.py 256 alexnet
+test alexnet.py 512 alexnet
+
+# googlenet
+test googlenet.py 64 googlenet
+test googlenet.py 128 googlenet
+
+# smallnet
+test smallnet_mnist_cifar.py 64 smallnet
+test smallnet_mnist_cifar.py 128 smallnet
+test smallnet_mnist_cifar.py 256 smallnet
+test smallnet_mnist_cifar.py 512 smallnet
diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh
new file mode 100755
index 0000000000000000000000000000000000000000..69faa4331744f2276e7706185ae10bc507f95764
--- /dev/null
+++ b/benchmark/tensorflow/image/run_multi.sh
@@ -0,0 +1,22 @@
+set -e
+
+function test() {
+ cfg=$1
+ num_gpu=$2
+ batch_size=$3
+ batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
+ prefix=$4
+ python $cfg --num_gpus=$num_gpu --batch_size=${batch_per_gpu} > logs/${prefix}-4gpu-${batch_size}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+# alexnet
+test alexnet_multi_gpu.py 4 512 alexnet
+test alexnet_multi_gpu.py 4 1024 alexnet
+
+# googlenet
+test googlenet_multi_gpu.py 4 512 googlenet
+test googlenet_multi_gpu.py 4 1024 googlenet
diff --git a/benchmark/tensorflow/image/smallnet_mnist_cifar.py b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a625134a6c58586b29190ede9c66253f484d2cf
--- /dev/null
+++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
@@ -0,0 +1,304 @@
+from six.moves import xrange # pylint: disable=redefined-builtin
+from datetime import datetime
+import math, numpy as np
+import time
+
+import tensorflow.python.platform
+import tensorflow as tf
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('forward_only', False,
+ """Only run the forward pass.""")
+tf.app.flags.DEFINE_boolean('forward_backward_only', False,
+ """Only run the forward-forward pass.""")
+tf.app.flags.DEFINE_string('data_format', 'NCHW',
+ """The data format for Convnet operations.
+ Can be either NHWC or NCHW.
+ """)
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+ """Whether to log device placement.""")
+
+parameters = []
+
+conv_counter = 1
+pool_counter = 1
+affine_counter = 1
+
+
+def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005, act=True):
+ global conv_counter
+ global parameters
+ name = 'conv' + str(conv_counter)
+ conv_counter += 1
+ with tf.name_scope(name) as scope:
+ kernel = tf.Variable(
+ tf.truncated_normal(
+ [kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1),
+ name='weights')
+
+ if wd is not None:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ if FLAGS.data_format == 'NCHW':
+ strides = [1, 1, dH, dW]
+ else:
+ strides = [1, dH, dW, 1]
+ conv = tf.nn.conv2d(
+ inpOp,
+ kernel,
+ strides,
+ padding=padType,
+ data_format=FLAGS.data_format)
+ biases = tf.Variable(
+ tf.constant(
+ 0.0, shape=[nOut], dtype=tf.float32),
+ trainable=True,
+ name='biases')
+ bias = tf.reshape(
+ tf.nn.bias_add(
+ conv, biases, data_format=FLAGS.data_format),
+ conv.get_shape())
+
+ conv1 = tf.nn.relu(bias, name=scope) if act else bias
+
+ parameters += [kernel, biases]
+
+ return conv1
+
+
+def _affine(inpOp, nIn, nOut, wd=None, act=True):
+ global affine_counter
+ global parameters
+ name = 'affine' + str(affine_counter)
+ affine_counter += 1
+ with tf.name_scope(name) as scope:
+ kernel = tf.Variable(
+ tf.truncated_normal(
+ [nIn, nOut], dtype=tf.float32, stddev=1e-1),
+ name='weights')
+
+ if wd is not None:
+ weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss')
+ tf.add_to_collection('losses', weight_decay)
+
+ biases = tf.Variable(
+ tf.constant(
+ 0.0, shape=[nOut], dtype=tf.float32),
+ trainable=True,
+ name='biases')
+
+ affine1 = tf.nn.relu_layer(
+ inpOp, kernel, biases,
+ name=name) if act else tf.matmul(inpOp, kernel) + biases
+
+ parameters += [kernel, biases]
+
+ return affine1
+
+
+def _mpool(inpOp, kH, kW, dH, dW, padding):
+ global pool_counter
+ global parameters
+ name = 'pool' + str(pool_counter)
+ pool_counter += 1
+ if FLAGS.data_format == 'NCHW':
+ ksize = [1, 1, kH, kW]
+ strides = [1, 1, dH, dW]
+ else:
+ ksize = [1, kH, kW, 1]
+ strides = [1, dH, dW, 1]
+ return tf.nn.max_pool(
+ inpOp,
+ ksize=ksize,
+ strides=strides,
+ padding=padding,
+ data_format=FLAGS.data_format,
+ name=name)
+
+
+def _apool(inpOp, kH, kW, dH, dW, padding):
+ global pool_counter
+ global parameters
+ name = 'pool' + str(pool_counter)
+ pool_counter += 1
+ if FLAGS.data_format == 'NCHW':
+ ksize = [1, 1, kH, kW]
+ strides = [1, 1, dH, dW]
+ else:
+ ksize = [1, kH, kW, 1]
+ strides = [1, dH, dW, 1]
+ return tf.nn.avg_pool(
+ inpOp,
+ ksize=ksize,
+ strides=strides,
+ padding=padding,
+ data_format=FLAGS.data_format,
+ name=name)
+
+
+def _norm(name, l_input, lsize=4):
+ return tf.nn.lrn(l_input,
+ lsize,
+ bias=1.0,
+ alpha=0.001 / 9.0,
+ beta=0.75,
+ name=name)
+
+
+def loss(logits, labels):
+ batch_size = tf.size(labels)
+ labels = tf.expand_dims(labels, 1)
+ indices = tf.expand_dims(tf.range(0, batch_size, 1), 1)
+ concated = tf.concat(1, [indices, labels])
+ onehot_labels = tf.sparse_to_dense(concated,
+ tf.pack([batch_size, 10]), 1.0, 0.0)
+ cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+ logits, onehot_labels, name='xentropy')
+ loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
+ return loss
+
+
+def get_incoming_shape(incoming):
+ """ Returns the incoming data shape """
+ if isinstance(incoming, tf.Tensor):
+ return incoming.get_shape().as_list()
+ elif type(incoming) in [np.array, list, tuple]:
+ return np.shape(incoming)
+ else:
+ raise Exception("Invalid incoming layer.")
+
+
+def inference(images):
+ conv1 = _conv(images, 3, 32, 5, 5, 1, 1, 'SAME')
+ pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME')
+ conv2 = _conv(pool1, 32, 32, 5, 5, 1, 1, 'SAME')
+ pool2 = _apool(conv2, 3, 3, 2, 2, 'SAME')
+ conv3 = _conv(pool2, 32, 64, 5, 5, 1, 1, 'SAME')
+ pool3 = _apool(conv3, 3, 3, 2, 2, 'SAME')
+ resh1 = tf.reshape(pool3, [-1, 64 * 4 * 4])
+ affn1 = _affine(resh1, 64 * 4 * 4, 64)
+ affn2 = _affine(affn1, 64, 10, act=False)
+
+ print('conv1:', get_incoming_shape(conv1))
+ print('pool1:', get_incoming_shape(pool1))
+ print('conv2:', get_incoming_shape(conv2))
+ print('pool2:', get_incoming_shape(pool2))
+ print('conv3:', get_incoming_shape(conv3))
+ print('pool3:', get_incoming_shape(pool3))
+
+ return affn2
+
+
+def time_tensorflow_run(session, target, info_string):
+ num_steps_burn_in = 10
+ total_duration = 0.0
+ total_duration_squared = 0.0
+ if not isinstance(target, list):
+ target = [target]
+ target_op = tf.group(*target)
+ for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+ start_time = time.time()
+ _ = session.run(target_op)
+ duration = time.time() - start_time
+        if i >= num_steps_burn_in:
+ if not i % 10:
+ print('%s: step %d, duration = %.3f' %
+ (datetime.now(), i - num_steps_burn_in, duration))
+ total_duration += duration
+ total_duration_squared += duration * duration
+ mn = total_duration / FLAGS.num_batches
+ vr = total_duration_squared / FLAGS.num_batches - mn * mn
+ sd = math.sqrt(vr)
+ print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
+ (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+ global parameters
+ with tf.Graph().as_default():
+ # Generate some dummy images.
+ image_size = 32
+        # Note that our padding definition is slightly different from the cuda-convnet.
+ # In order to force the model to start with the same activations sizes,
+ # we add 3 to the image_size and employ VALID padding above.
+ if FLAGS.data_format == 'NCHW':
+ image_shape = [FLAGS.batch_size, 3, image_size, image_size]
+ else:
+ image_shape = [FLAGS.batch_size, image_size, image_size, 3]
+
+ images = tf.get_variable(
+ 'image',
+ image_shape,
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.1, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=False)
+
+ labels = tf.get_variable(
+ 'label', [FLAGS.batch_size],
+ initializer=tf.constant_initializer(1),
+ dtype=tf.int32,
+ trainable=False)
+
+ # Build a Graph that computes the logits predictions from the
+ # inference model.
+ last_layer = inference(images)
+
+ objective = loss(last_layer, labels)
+
+ # Compute gradients.
+ opt = tf.train.MomentumOptimizer(0.001, 0.9)
+ grads = opt.compute_gradients(objective)
+ global_step = tf.get_variable(
+ 'global_step', [],
+ initializer=tf.constant_initializer(
+ 0.0, dtype=tf.float32),
+ trainable=False,
+ dtype=tf.float32)
+ apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+ # Track the moving averages of all trainable variables.
+ variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step)
+ variables_averages_op = variable_averages.apply(tf.trainable_variables(
+ ))
+
+ # Build an initialization operation.
+ init = tf.initialize_all_variables()
+
+ # Start running operations on the Graph.
+ sess = tf.Session(config=tf.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=FLAGS.log_device_placement))
+ sess.run(init)
+
+ run_forward = True
+ run_forward_backward = True
+ if FLAGS.forward_only and FLAGS.forward_backward_only:
+ raise ValueError("Cannot specify --forward_only and "
+ "--forward_backward_only at the same time.")
+ if FLAGS.forward_only:
+ run_forward_backward = False
+ elif FLAGS.forward_backward_only:
+ run_forward = False
+
+ if run_forward:
+ # Run the forward benchmark.
+ time_tensorflow_run(sess, last_layer, "Forward")
+
+ if run_forward_backward:
+ with tf.control_dependencies(
+ [apply_gradient_op, variables_averages_op]):
+ train_op = tf.no_op(name='train')
+ time_tensorflow_run(sess, [train_op, objective], "Forward-backward")
+
+
+def main(_):
+ run_benchmark()
+
+
+if __name__ == '__main__':
+ tf.app.run()
diff --git a/benchmark/tensorflow/rnn/README.md b/benchmark/tensorflow/rnn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..da8e7b8b07969051cbec3ac6a713eaf7fc738a55
--- /dev/null
+++ b/benchmark/tensorflow/rnn/README.md
@@ -0,0 +1,5 @@
+You should also install tflearn:
+
+```bash
+pip install -r requirements.txt
+```
diff --git a/benchmark/tensorflow/rnn/reader.py b/benchmark/tensorflow/rnn/reader.py
new file mode 100755
index 0000000000000000000000000000000000000000..f538329a15ea9ad9293c97c94340989e2c421eb2
--- /dev/null
+++ b/benchmark/tensorflow/rnn/reader.py
@@ -0,0 +1,92 @@
+import os.path
+import io
+import numpy as np
+import tensorflow as tf
+
+# tflearn
+import tflearn
+from tflearn.data_utils import to_categorical, pad_sequences
+from tflearn.datasets import imdb
+
+FLAGS = tf.app.flags.FLAGS
+
+
+class DataSet(object):
+ def __init__(self, data, labels):
+ assert data.shape[0] == labels.shape[0], (
+ 'data.shape: %s labels.shape: %s' % (data.shape, labels.shape))
+ self._num_examples = data.shape[0]
+
+ self._data = data
+ self._labels = labels
+ self._epochs_completed = 0
+ self._index_in_epoch = 0
+
+ @property
+ def data(self):
+ return self._data
+
+ @property
+ def labels(self):
+ return self._labels
+
+ @property
+ def num_examples(self):
+ return self._num_examples
+
+ @property
+ def epochs_completed(self):
+ return self._epochs_completed
+
+ def next_batch(self, batch_size):
+ assert batch_size <= self._num_examples
+
+ start = self._index_in_epoch
+ self._index_in_epoch += batch_size
+ if self._index_in_epoch > self._num_examples:
+ # Finished epoch
+ self._epochs_completed += 1
+ # Shuffle the data
+ perm = np.arange(self._num_examples)
+ np.random.shuffle(perm)
+ self._data = self._data[perm]
+ self._labels = self._labels[perm]
+ # Start next epoch
+ start = 0
+ self._index_in_epoch = batch_size
+
+ end = self._index_in_epoch
+
+ return self._data[start:end], self._labels[start:end]
+
+
+def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):
+
+ # IMDB Dataset loading
+ train, test, _ = imdb.load_data(
+ path=file_path,
+ n_words=vocab_size,
+ valid_portion=val_fraction,
+ sort_by_len=False)
+ trainX, trainY = train
+ testX, testY = test
+
+ # Data preprocessing
+ # Sequence padding
+ trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
+ testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
+ # Converting labels to binary vectors
+ trainY = to_categorical(trainY, nb_classes=2)
+ testY = to_categorical(testY, nb_classes=2)
+
+ train_dataset = DataSet(trainX, trainY)
+
+ return train_dataset
+
+
+def main():
+ create_datasets('imdb.pkl')
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmark/tensorflow/rnn/requirements.txt b/benchmark/tensorflow/rnn/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4242e7d24fbbeb18e8fb9a760d76fa6d5363b03f
--- /dev/null
+++ b/benchmark/tensorflow/rnn/requirements.txt
@@ -0,0 +1 @@
+tflearn
diff --git a/benchmark/tensorflow/rnn/rnn.py b/benchmark/tensorflow/rnn/rnn.py
new file mode 100755
index 0000000000000000000000000000000000000000..f288083e13656563b511980553245142efec4e65
--- /dev/null
+++ b/benchmark/tensorflow/rnn/rnn.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+from six.moves import xrange # pylint: disable=redefined-builtin
+import math
+import time
+import numpy as np
+from datetime import datetime
+
+import reader
+import tensorflow as tf
+from tensorflow.python.ops import rnn
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('num_layers', 1, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('max_len', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('forward_only', False,
+ """Only run the forward pass.""")
+tf.app.flags.DEFINE_boolean('forward_backward_only', False,
+ """Only run the forward-forward pass.""")
+tf.app.flags.DEFINE_integer('hidden_size', 128, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('emb_size', 128, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+ """Whether to log device placement.""")
+
+VOCAB_SIZE = 30000
+NUM_CLASS = 2
+
+
+def get_feed_dict(x_data, y_data=None):
+ feed_dict = {}
+
+ if y_data is not None:
+ feed_dict[y_input] = y_data
+
+ for i in xrange(x_data.shape[0]):
+ feed_dict[x_input[i]] = x_data[i, :, :]
+
+ return feed_dict
+
+
+def get_incoming_shape(incoming):
+ """ Returns the incoming data shape """
+ if isinstance(incoming, tf.Tensor):
+ return incoming.get_shape().as_list()
+ elif type(incoming) in [np.array, list, tuple]:
+ return np.shape(incoming)
+ else:
+ raise Exception("Invalid incoming layer.")
+
+
+# Note input * W is done in LSTMCell,
+# which is different from PaddlePaddle
+def single_lstm(name,
+ incoming,
+ n_units,
+ use_peepholes=True,
+ return_seq=False,
+ return_state=False):
+ with tf.name_scope(name) as scope:
+ cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
+ output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
+ out = output if return_seq else output[-1]
+ return (out, _cell_state) if return_state else out
+
+
+def lstm(name,
+ incoming,
+ n_units,
+ use_peepholes=True,
+ return_seq=False,
+ return_state=False,
+ num_layers=1):
+ with tf.name_scope(name) as scope:
+ lstm_cell = tf.nn.rnn_cell.LSTMCell(
+ n_units, use_peepholes=use_peepholes)
+ cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
+ initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
+ if not isinstance(incoming, list):
+ # if the input is embeding, the Tensor shape : [None, time_step, emb_size]
+ incoming = [
+ tf.squeeze(input_, [1])
+ for input_ in tf.split(1, FLAGS.max_len, incoming)
+ ]
+ outputs, state = tf.nn.rnn(cell,
+ incoming,
+ initial_state=initial_state,
+ dtype=tf.float32)
+ out = outputs if return_seq else outputs[-1]
+        return (out, state) if return_state else out
+
+
+def embedding(name, incoming, vocab_size, emb_size):
+ with tf.name_scope(name) as scope:
+ #with tf.device("/cpu:0"):
+ embedding = tf.get_variable(
+ name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
+ out = tf.nn.embedding_lookup(embedding, incoming)
+ return out
+
+
+def fc(name, inpOp, nIn, nOut, act=True):
+ with tf.name_scope(name) as scope:
+ kernel = tf.get_variable(
+ name + '_w', [nIn, nOut],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+
+ biases = tf.get_variable(
+ name + '_b', [nOut],
+ initializer=tf.constant_initializer(
+ value=0.0, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=True)
+
+ net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+ tf.matmul(inpOp, kernel) + biases
+
+ return net
+
+
+def inference(seq):
+ net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
+ print "emb:", get_incoming_shape(net)
+ net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
+ print "lstm:", get_incoming_shape(net)
+ net = fc('fc1', net, FLAGS.hidden_size, 2)
+ return net
+
+
+def loss(logits, labels):
+ # one label index for one sample
+ labels = tf.cast(labels, tf.float32)
+ cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+ logits, labels, name='cross_entropy_per_example')
+ cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+ tf.add_to_collection('losses', cross_entropy_mean)
+ return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def time_tensorflow_run(session, target, x_input, y_input, info_string):
+ num_steps_burn_in = 50
+ total_duration = 0.0
+ total_duration_squared = 0.0
+ if not isinstance(target, list):
+ target = [target]
+ target_op = tf.group(*target)
+ train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
+ for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+ start_time = time.time()
+ data, label = train_dataset.next_batch(FLAGS.batch_size)
+ _ = session.run(target_op, feed_dict={x_input: data, y_input: label})
+ duration = time.time() - start_time
+        if i >= num_steps_burn_in:
+ if not i % 10:
+ print('%s: step %d, duration = %.3f' %
+ (datetime.now(), i - num_steps_burn_in, duration))
+ total_duration += duration
+ total_duration_squared += duration * duration
+ mn = total_duration / FLAGS.num_batches
+ vr = total_duration_squared / FLAGS.num_batches - mn * mn
+ sd = math.sqrt(vr)
+ print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
+ (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+ with tf.Graph().as_default():
+ global_step = 0
+ with tf.device('/cpu:0'):
+ global_step = tf.Variable(0, trainable=False)
+ with tf.device('/gpu:0'):
+ #x_input = tf.placeholder(tf.int32, [None, FLAGS.max_len], name="x_input")
+ #y_input = tf.placeholder(tf.int32, [None, NUM_CLASS], name="y_input")
+ x_input = tf.placeholder(
+ tf.int32, [FLAGS.batch_size, FLAGS.max_len], name="x_input")
+ y_input = tf.placeholder(
+ tf.int32, [FLAGS.batch_size, NUM_CLASS], name="y_input")
+            # Generate some dummy sequence.
+
+ last_layer = inference(x_input)
+
+ objective = loss(last_layer, y_input)
+ opt = tf.train.AdamOptimizer(0.001)
+ grads = opt.compute_gradients(objective)
+ apply_gradient_op = opt.apply_gradients(
+ grads, global_step=global_step)
+
+ init = tf.initialize_all_variables()
+ sess = tf.Session(config=tf.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=FLAGS.log_device_placement))
+ sess.run(init)
+
+ run_forward = True
+ run_forward_backward = True
+ if FLAGS.forward_only and FLAGS.forward_backward_only:
+ raise ValueError("Cannot specify --forward_only and "
+ "--forward_backward_only at the same time.")
+ if FLAGS.forward_only:
+ run_forward_backward = False
+ elif FLAGS.forward_backward_only:
+ run_forward = False
+
+ if run_forward:
+ time_tensorflow_run(sess, last_layer, x_input, y_input,
+ "Forward")
+
+ if run_forward_backward:
+ with tf.control_dependencies([apply_gradient_op]):
+ train_op = tf.no_op(name='train')
+ time_tensorflow_run(sess, [train_op, objective], x_input,
+ y_input, "Forward-backward")
+
+
+def main(_):
+ run_benchmark()
+
+
+if __name__ == '__main__':
+ tf.app.run()
diff --git a/benchmark/tensorflow/rnn/rnn_multi_gpu.py b/benchmark/tensorflow/rnn/rnn_multi_gpu.py
new file mode 100755
index 0000000000000000000000000000000000000000..eabee4fa8fe6325212ace1c11be4862cd2720b08
--- /dev/null
+++ b/benchmark/tensorflow/rnn/rnn_multi_gpu.py
@@ -0,0 +1,322 @@
+#!/usr/bin/env python
+from six.moves import xrange # pylint: disable=redefined-builtin
+import re
+import math
+import time
+import numpy as np
+from datetime import datetime
+
+import reader
+import tensorflow as tf
+from tensorflow.python.ops import rnn
+
+FLAGS = tf.app.flags.FLAGS
+
+tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('num_layers', 1, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('max_len', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('hidden_size', 128, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('emb_size', 64, """Number of batches to run.""")
+tf.app.flags.DEFINE_boolean('log_device_placement', False,
+ """Whether to log device placement.""")
+tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")
+
+VOCAB_SIZE = 30000
+NUM_CLASS = 2
+
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
+NUM_EPOCHS_PER_DECAY = 50
+INITIAL_LEARNING_RATE = 0.1
+LEARNING_RATE_DECAY_FACTOR = 0.1
+TOWER_NAME = 'tower'
+
+train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
+
+
+def get_incoming_shape(incoming):
+ """ Returns the incoming data shape """
+ if isinstance(incoming, tf.Tensor):
+ return incoming.get_shape().as_list()
+ elif type(incoming) in [np.array, list, tuple]:
+ return np.shape(incoming)
+ else:
+ raise Exception("Invalid incoming layer.")
+
+
+# Note input * W is done in LSTMCell,
+# which is different from PaddlePaddle
+def single_lstm(name,
+ incoming,
+ n_units,
+ use_peepholes=True,
+ return_seq=False,
+ return_state=False):
+ with tf.name_scope(name) as scope:
+ cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
+ output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
+ out = output if return_seq else output[-1]
+ return (out, _cell_state) if return_state else out
+
+
+def lstm(name,
+ incoming,
+ n_units,
+ use_peepholes=True,
+ return_seq=False,
+ return_state=False,
+ num_layers=1):
+ with tf.name_scope(name) as scope:
+ lstm_cell = tf.nn.rnn_cell.LSTMCell(
+ n_units, use_peepholes=use_peepholes)
+ cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
+ initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
+ if not isinstance(incoming, list):
+ # if the input is embeding, the Tensor shape : [None, time_step, emb_size]
+ incoming = [
+ tf.squeeze(input_, [1])
+ for input_ in tf.split(1, FLAGS.max_len, incoming)
+ ]
+ outputs, state = tf.nn.rnn(cell,
+ incoming,
+ initial_state=initial_state,
+ dtype=tf.float32)
+ out = outputs if return_seq else outputs[-1]
+        return (out, state) if return_state else out
+
+
+def embedding(name, incoming, vocab_size, emb_size):
+ with tf.name_scope(name) as scope:
+ #with tf.device("/cpu:0"):
+ embedding = tf.get_variable(
+ name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
+ out = tf.nn.embedding_lookup(embedding, incoming)
+ return out
+
+
+def fc(name, inpOp, nIn, nOut, act=True):
+ with tf.name_scope(name) as scope:
+ kernel = tf.get_variable(
+ name + '_w', [nIn, nOut],
+ initializer=tf.truncated_normal_initializer(
+ stddev=0.01, dtype=tf.float32),
+ dtype=tf.float32)
+
+ biases = tf.get_variable(
+ name + '_b', [nOut],
+ initializer=tf.constant_initializer(
+ value=0.0, dtype=tf.float32),
+ dtype=tf.float32,
+ trainable=True)
+
+ net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
+ tf.matmul(inpOp, kernel) + biases
+
+ return net
+
+
+def inference(seq):
+ net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
+ print "emb:", get_incoming_shape(net)
+ net = lstm('lstm', net, FLAGS.hidden_size, num_layers=FLAGS.num_layers)
+ print "lstm:", get_incoming_shape(net)
+ net = fc('fc1', net, FLAGS.hidden_size, 2)
+ return net
+
+
+def loss(logits, labels):
+ # one label index for one sample
+ #labels = tf.cast(labels, tf.int64)
+ # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+ # logits, labels, name='cross_entropy_per_example')
+ labels = tf.cast(labels, tf.float32)
+ cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+ logits, labels, name='cross_entropy_per_example')
+ cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+ tf.add_to_collection('losses', cross_entropy_mean)
+ return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def tower_loss(scope):
+ """Calculate the total loss on a single tower running the model.
+ Args:
+ scope: unique prefix string identifying the tower, e.g. 'tower_0'
+ Returns:
+ Tensor of shape [] containing the total loss for a batch of data
+ """
+ data, label = train_dataset.next_batch(FLAGS.batch_size)
+
+ # Build a Graph that computes the logits predictions from the
+ # inference model.
+ last_layer = inference(data)
+
+ # Build the portion of the Graph calculating the losses. Note that we will
+ # assemble the total_loss using a custom function below.
+ #_ = loss(last_layer, label)
+ _ = loss(last_layer, label)
+
+ # Assemble all of the losses for the current tower only.
+ losses = tf.get_collection('losses', scope)
+
+ # Calculate the total loss for the current tower.
+ total_loss = tf.add_n(losses, name='total_loss')
+
+ # Compute the moving average of all individual losses and the total loss.
+ loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+ loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+ # Attach a scalar summary to all individual losses and the total loss; do the
+ # same for the averaged version of the losses.
+ for l in losses + [total_loss]:
+ # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
+ # session. This helps the clarity of presentation on tensorboard.
+ loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
+ # Name each loss as '(raw)' and name the moving average version of the loss
+ # as the original loss name.
+ tf.scalar_summary(loss_name + ' (raw)', l)
+ #tf.scalar_summary(loss_name, loss_averages.average(l))
+
+ with tf.control_dependencies([loss_averages_op]):
+ total_loss = tf.identity(total_loss)
+ return total_loss
+
+
+def average_gradients(tower_grads):
+ """Calculate the average gradient for each shared variable across all towers.
+ Note that this function provides a synchronization point across all towers.
+ Args:
+ tower_grads: List of lists of (gradient, variable) tuples. The outer list
+ is over individual gradients. The inner list is over the gradient
+ calculation for each tower.
+ Returns:
+ List of pairs of (gradient, variable) where the gradient has been averaged
+ across all towers.
+ """
+ average_grads = []
+ for grad_and_vars in zip(*tower_grads):
+ # Note that each grad_and_vars looks like the following:
+ # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+ grads = []
+ for g, _ in grad_and_vars:
+ # Add 0 dimension to the gradients to represent the tower.
+ expanded_g = tf.expand_dims(g, 0)
+
+ # Append on a 'tower' dimension which we will average over below.
+ grads.append(expanded_g)
+
+ # Average over the 'tower' dimension.
+ grad = tf.concat(0, grads)
+ grad = tf.reduce_mean(grad, 0)
+
+ # Keep in mind that the Variables are redundant because they are shared
+ # across towers. So .. we will just return the first tower's pointer to
+ # the Variable.
+ v = grad_and_vars[0][1]
+ grad_and_var = (grad, v)
+ average_grads.append(grad_and_var)
+ return average_grads
+
+
+def time_tensorflow_run(session, target):
+ num_steps_burn_in = 80
+ total_duration = 0.0
+ total_duration_squared = 0.0
+ for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+ start_time = time.time()
+        # Run the grouped train op exactly once per step; the towers pull
+        _, loss_value = session.run(target)
+ duration = time.time() - start_time
+        if i >= num_steps_burn_in:
+ if not i % 10:
+ num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
+ examples_per_sec = num_examples_per_step / duration
+ # sec_per_batch = duration / FLAGS.num_gpus
+ sec_per_batch = duration
+
+ format_str = (
+ '%s: step %d, loss= %.2f (%.1f examples/sec; %.3f '
+ 'sec/batch batch_size= %d)')
+ print(format_str %
+ (datetime.now(), i - num_steps_burn_in, loss_value,
+ duration, sec_per_batch, num_examples_per_step))
+
+ total_duration += duration
+ total_duration_squared += duration * duration
+
+ mn = total_duration / FLAGS.num_batches
+ vr = total_duration_squared / FLAGS.num_batches - mn * mn
+ sd = math.sqrt(vr)
+ print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
+ (datetime.now(), FLAGS.num_batches, mn, sd))
+
+
+def run_benchmark():
+ with tf.Graph().as_default(), tf.device('/cpu:0'):
+ # Create a variable to count the number of train() calls. This equals the
+ # number of batches processed * FLAGS.num_gpus.
+ global_step = tf.get_variable(
+ 'global_step', [],
+ initializer=tf.constant_initializer(0),
+ trainable=False)
+
+ # Calculate the learning rate schedule.
+ num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
+ FLAGS.batch_size)
+ decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
+
+ # Create an optimizer that performs gradient descent.
+ opt = tf.train.AdamOptimizer(0.001)
+
+ #train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
+
+ # Calculate the gradients for each model tower.
+ tower_grads = []
+ for i in xrange(FLAGS.num_gpus):
+ with tf.device('/gpu:%d' % i):
+ with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
+ # Calculate the loss for one tower of the model. This function
+ # constructs the entire model but shares the variables across
+ # all towers.
+ loss = tower_loss(scope)
+
+ # Reuse variables for the next tower.
+ tf.get_variable_scope().reuse_variables()
+
+ # Retain the summaries from the final tower.
+ # summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
+
+ # Calculate the gradients for the batch of data on this tower.
+ grads = opt.compute_gradients(loss)
+
+ # Keep track of the gradients across all towers.
+ tower_grads.append(grads)
+
+ # We must calculate the mean of each gradient. Note that this is the
+ # synchronization point across all towers.
+ grads = average_gradients(tower_grads)
+
+ # Apply the gradients to adjust the shared variables.
+ apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+ # Group all updates to into a single train op.
+ train_op = tf.group(apply_gradient_op)
+
+ # Build an initialization operation.
+ init = tf.initialize_all_variables()
+
+ # Start running operations on the Graph. allow_soft_placement must be set to
+ # True to build towers on GPU, as some of the ops do not have GPU
+ # implementations.
+ sess = tf.Session(config=tf.ConfigProto(
+ allow_soft_placement=True,
+ log_device_placement=FLAGS.log_device_placement))
+ sess.run(init)
+ time_tensorflow_run(sess, [train_op, loss])
+
+
+def main(_):
+ run_benchmark()
+
+
+if __name__ == '__main__':
+ tf.app.run()
diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bb4c69cb95f965eff35f1c5a60376bf1e84f841b
--- /dev/null
+++ b/benchmark/tensorflow/rnn/run.sh
@@ -0,0 +1,29 @@
+set -e
+
+function test() {
+ lstm_num=$1
+ batch_size=$2
+ hid_size=$3
+ prefix=$4
+ python rnn.py --num_layers=${lstm_num} --batch_size=$batch_size \
+ --hidden_size=${hid_size} \
+ --forward_backward_only=1 \
+ > logs/1gpu-${lstm_num}lstm-batch${batch_size}-hid${hid_size}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+#--lstm_num--batch_size--hidden_size--#
+test 2 64 256
+test 2 64 512
+test 2 64 1280
+
+test 2 128 256
+test 2 128 512
+test 2 128 1280
+
+test 2 256 256
+test 2 256 512
+test 2 256 1280
diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f7f52e01e38d304bb3bf8185c53bd0da26014d3a
--- /dev/null
+++ b/benchmark/tensorflow/rnn/run_multi.sh
@@ -0,0 +1,28 @@
+set -e
+
+function test() {
+ num_gpu=$1
+ lstm_num=$2
+ hid_size=$3
+  batch_size=$4
+  batch_per_gpu=`expr ${batch_size} / ${num_gpu}`
+ python rnn_multi_gpu.py --num_layers=${lstm_num} --batch_size=$batch_per_gpu \
+ --num_gpus=${num_gpu} \
+ --hidden_size=${hid_size} \
+ --forward_backward_only=1 \
+ > logs/${num_gpu}gpu-${lstm_num}lstm-hid${hid_size}-batch${batch_size}.log 2>&1
+}
+
+if [ ! -d "logs" ]; then
+ mkdir logs
+fi
+
+#--num_gpus--lstm_num--hidden_size--batch_size--#
+test 4 2 256 128
+test 4 2 256 256
+test 4 2 256 512
+
+test 4 2 512 128
+test 4 2 512 256
+test 4 2 512 512
+
diff --git a/cmake/util.cmake b/cmake/util.cmake
index a8282f07184c34f77d506ed7ef40206fbbd55b41..11641f6064b9db36e14293460a1f05067e373661 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} rt)
endif()
endif()
+
+ if(NOT WITH_DSO)
+ target_link_libraries(${TARGET_NAME}
+ ${WARPCTC_LIBRARY})
+ endif()
endfunction()
# link_paddle_test
diff --git a/cmake/version.cmake b/cmake/version.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..a0518e07e88a1ff468c301523f888c7d95e15185
--- /dev/null
+++ b/cmake/version.cmake
@@ -0,0 +1,24 @@
+# Get the latest git tag.
+set(PADDLE_VERSION $ENV{PADDLE_VERSION})
+set(tmp_version "HEAD")
+while ("${PADDLE_VERSION}" STREQUAL "")
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
+ WORKING_DIRECTORY ${PROJ_ROOT}
+ OUTPUT_VARIABLE GIT_TAG_NAME
+ RESULT_VARIABLE GIT_RESULT
+ ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if (NOT ${GIT_RESULT})
+ # Check the tag is a correct version
+ if (${GIT_TAG_NAME} MATCHES "v[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
+ string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
+ else() # otherwise, get the previous git tag name.
+ set(tmp_version "${GIT_TAG_NAME}~1")
+ endif()
+ else()
+ set(PADDLE_VERSION "0.0.0")
+ message(WARNING "Cannot add paddle version from git tag")
+ endif()
+endwhile()
+
+message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/demo/gan/.gitignore b/demo/gan/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..93a6f5080a16a601cffb0bff51af9aef3ba3bae7
--- /dev/null
+++ b/demo/gan/.gitignore
@@ -0,0 +1,11 @@
+output/
+uniform_params/
+cifar_params/
+mnist_params/
+*.png
+.pydevproject
+.project
+*.log
+*.pyc
+data/mnist_data/
+data/cifar-10-batches-py/
diff --git a/demo/gan/README.md b/demo/gan/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fdc970a07b488c3a4146c9baa76a133a456fc9ab
--- /dev/null
+++ b/demo/gan/README.md
@@ -0,0 +1,13 @@
+# Generative Adversarial Networks (GAN)
+
+This demo implements GAN training described in the original GAN paper (https://arxiv.org/abs/1406.2661) and DCGAN (https://arxiv.org/abs/1511.06434).
+
+The general training procedures are implemented in gan_trainer.py. The neural network configurations are specified in gan_conf.py (for synthetic data) and gan_conf_image.py (for image data).
+
+In order to run the model, first download the corresponding data by running the shell script in ./data.
+Then you can run the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --use_gpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).
+
+$python gan_trainer.py -d cifar --use_gpu 1
+
+The generated images will be stored in ./cifar_samples/
+The corresponding models will be stored in ./cifar_params/
\ No newline at end of file
diff --git a/demo/gan/data/download_cifar.sh b/demo/gan/data/download_cifar.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ea3be594cd08f829e94f2c692a44947baa62b759
--- /dev/null
+++ b/demo/gan/data/download_cifar.sh
@@ -0,0 +1,18 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
+tar zxf cifar-10-python.tar.gz
+rm cifar-10-python.tar.gz
+
diff --git a/demo/gan/data/get_mnist_data.sh b/demo/gan/data/get_mnist_data.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d21bf7067135f1f8be486ef0f13fc3ec94ffc4ed
--- /dev/null
+++ b/demo/gan/data/get_mnist_data.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env sh
+# This script downloads the mnist data and unzips it.
+set -e
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+rm -rf "$DIR/mnist_data"
+mkdir "$DIR/mnist_data"
+cd "$DIR/mnist_data"
+
+echo "Downloading..."
+
+for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
+do
+ if [ ! -e $fname ]; then
+ wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
+ gunzip ${fname}.gz
+ fi
+done
+
+
diff --git a/demo/gan/gan_conf.py b/demo/gan/gan_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..05eee3a9b9ce455eb3a5d47d3165ee7f42f1002e
--- /dev/null
+++ b/demo/gan/gan_conf.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+
+mode = get_config_arg("mode", str, "generator")
+assert mode in set(["generator",
+ "discriminator",
+ "generator_training",
+ "discriminator_training"])
+
+is_generator_training = mode == "generator_training"
+is_discriminator_training = mode == "discriminator_training"
+is_generator = mode == "generator"
+is_discriminator = mode == "discriminator"
+
+# The network structure below follows the ref https://arxiv.org/abs/1406.2661
+# Here we used two hidden layers and batch_norm
+
+print('mode=%s' % mode)
+# the dim of the noise (z) as the input of the generator network
+noise_dim = 10
+# the dim of the hidden layer
+hidden_dim = 10
+# the dim of the generated sample
+sample_dim = 2
+
+settings(
+ batch_size=128,
+ learning_rate=1e-4,
+ learning_method=AdamOptimizer(beta1=0.5)
+)
+
+def discriminator(sample):
+ """
+    discriminator outputs the probability that a sample is from the generator
+    or real data.
+    The output has two dimensions: dimension 0 is the probability
+    that the sample is from the generator and dimension 1 is the probability
+    that the sample is from real data.
+ """
+ param_attr = ParamAttr(is_static=is_generator_training)
+ bias_attr = ParamAttr(is_static=is_generator_training,
+ initial_mean=1.0,
+ initial_std=0)
+
+ hidden = fc_layer(input=sample, name="dis_hidden", size=hidden_dim,
+ bias_attr=bias_attr,
+ param_attr=param_attr,
+ act=ReluActivation())
+
+ hidden2 = fc_layer(input=hidden, name="dis_hidden2", size=hidden_dim,
+ bias_attr=bias_attr,
+ param_attr=param_attr,
+ act=LinearActivation())
+
+ hidden_bn = batch_norm_layer(hidden2,
+ act=ReluActivation(),
+ name="dis_hidden_bn",
+ bias_attr=bias_attr,
+ param_attr=ParamAttr(is_static=is_generator_training,
+ initial_mean=1.0,
+ initial_std=0.02),
+ use_global_stats=False)
+
+ return fc_layer(input=hidden_bn, name="dis_prob", size=2,
+ bias_attr=bias_attr,
+ param_attr=param_attr,
+ act=SoftmaxActivation())
+
+def generator(noise):
+ """
+ generator generates a sample given noise
+ """
+ param_attr = ParamAttr(is_static=is_discriminator_training)
+ bias_attr = ParamAttr(is_static=is_discriminator_training,
+ initial_mean=1.0,
+ initial_std=0)
+
+ hidden = fc_layer(input=noise,
+ name="gen_layer_hidden",
+ size=hidden_dim,
+ bias_attr=bias_attr,
+ param_attr=param_attr,
+ act=ReluActivation())
+
+ hidden2 = fc_layer(input=hidden, name="gen_hidden2", size=hidden_dim,
+ bias_attr=bias_attr,
+ param_attr=param_attr,
+ act=LinearActivation())
+
+ hidden_bn = batch_norm_layer(hidden2,
+ act=ReluActivation(),
+ name="gen_layer_hidden_bn",
+ bias_attr=bias_attr,
+ param_attr=ParamAttr(is_static=is_discriminator_training,
+ initial_mean=1.0,
+ initial_std=0.02),
+ use_global_stats=False)
+
+ return fc_layer(input=hidden_bn,
+ name="gen_layer1",
+ size=sample_dim,
+ bias_attr=bias_attr,
+ param_attr=param_attr,
+ act=LinearActivation())
+
+if is_generator_training:
+ noise = data_layer(name="noise", size=noise_dim)
+ sample = generator(noise)
+
+if is_discriminator_training:
+ sample = data_layer(name="sample", size=sample_dim)
+
+if is_generator_training or is_discriminator_training:
+ label = data_layer(name="label", size=1)
+ prob = discriminator(sample)
+ cost = cross_entropy(input=prob, label=label)
+ classification_error_evaluator(input=prob, label=label, name=mode+'_error')
+ outputs(cost)
+
+if is_generator:
+ noise = data_layer(name="noise", size=noise_dim)
+ outputs(generator(noise))
diff --git a/demo/gan/gan_conf_image.py b/demo/gan/gan_conf_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc5910e9f02d7aac59207fdaa0222d01ac3bf609
--- /dev/null
+++ b/demo/gan/gan_conf_image.py
@@ -0,0 +1,264 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+
+mode = get_config_arg("mode", str, "generator")
+dataSource = get_config_arg("data", str, "mnist")
+assert mode in set(["generator",
+ "discriminator",
+ "generator_training",
+ "discriminator_training"])
+
+is_generator_training = mode == "generator_training"
+is_discriminator_training = mode == "discriminator_training"
+is_generator = mode == "generator"
+is_discriminator = mode == "discriminator"
+
+# The network structure below follows the dcgan paper
+# (https://arxiv.org/abs/1511.06434)
+
+print('mode=%s' % mode)
+# the dim of the noise (z) as the input of the generator network
+noise_dim = 100
+# the number of filters in the layer in generator/discriminator that is
+# closest to the image
+gf_dim = 64
+df_dim = 64
+if dataSource == "mnist":
+ sample_dim = 28 # image dim
+ c_dim = 1 # image color
+else:
+ sample_dim = 32
+ c_dim = 3
+s2, s4 = int(sample_dim/2), int(sample_dim/4),
+s8, s16 = int(sample_dim/8), int(sample_dim/16)
+
+settings(
+ batch_size=128,
+ learning_rate=2e-4,
+ learning_method=AdamOptimizer(beta1=0.5)
+)
+
+def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name,
+ param_attr, bias_attr, param_attr_bn, bn, trans=False,
+ act=ReluActivation()):
+
+ """
+ conv_bn is a utility function that constructs a convolution/deconv layer
+ with an optional batch_norm layer
+
+ :param bn: whether to use batch_norm_layer
+ :type bn: bool
+ :param trans: whether to use conv (False) or deconv (True)
+ :type trans: bool
+ """
+
+ # calculate the filter_size and padding size based on the given
+    # imgSize and output size
+ tmp = imgSize - (output_x - 1) * stride
+ if tmp <= 1 or tmp > 5:
+ raise ValueError("conv input-output dimension does not fit")
+ elif tmp <= 3:
+ filter_size = tmp + 2
+ padding = 1
+ else:
+ filter_size = tmp
+ padding = 0
+
+ print (imgSize, output_x, stride, filter_size, padding)
+
+    if trans:
+        nameApx = "_convt"
+    else:
+        nameApx = "_conv"
+
+ if bn:
+ conv = img_conv_layer(input, filter_size=filter_size,
+ num_filters=num_filters,
+ name=name + nameApx, num_channels=channels,
+ act=LinearActivation(), groups=1, stride=stride,
+ padding=padding, bias_attr=bias_attr,
+ param_attr=param_attr, shared_biases=True, layer_attr=None,
+ filter_size_y=None, stride_y=None, padding_y=None,
+ trans=trans)
+
+ conv_bn = batch_norm_layer(conv,
+ act=act,
+ name=name + nameApx + "_bn",
+ bias_attr=bias_attr,
+ param_attr=param_attr_bn,
+ use_global_stats=False)
+
+ return conv_bn
+ else:
+ conv = img_conv_layer(input, filter_size=filter_size,
+ num_filters=num_filters,
+ name=name + nameApx, num_channels=channels,
+ act=act, groups=1, stride=stride,
+ padding=padding, bias_attr=bias_attr,
+ param_attr=param_attr, shared_biases=True, layer_attr=None,
+ filter_size_y=None, stride_y=None, padding_y=None,
+ trans=trans)
+ return conv
+
+def generator(noise):
+ """
+ generator generates a sample given noise
+ """
+ param_attr = ParamAttr(is_static=is_discriminator_training,
+ initial_mean=0.0,
+ initial_std=0.02)
+ bias_attr = ParamAttr(is_static=is_discriminator_training,
+ initial_mean=0.0,
+ initial_std=0.0)
+
+ param_attr_bn=ParamAttr(is_static=is_discriminator_training,
+ initial_mean=1.0,
+ initial_std=0.02)
+
+ h1 = fc_layer(input=noise,
+ name="gen_layer_h1",
+ size=s8 * s8 * gf_dim * 4,
+ bias_attr=bias_attr,
+ param_attr=param_attr,
+ act=LinearActivation())
+
+ h1_bn = batch_norm_layer(h1,
+ act=ReluActivation(),
+ name="gen_layer_h1_bn",
+ bias_attr=bias_attr,
+ param_attr=param_attr_bn,
+ use_global_stats=False)
+
+ h2_bn = conv_bn(h1_bn,
+ channels=gf_dim*4,
+ output_x=s8,
+ num_filters=gf_dim*2,
+ imgSize=s4,
+ stride=2,
+ name="gen_layer_h2",
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ param_attr_bn=param_attr_bn,
+ bn=True,
+ trans=True)
+
+ h3_bn = conv_bn(h2_bn,
+ channels=gf_dim*2,
+ output_x=s4,
+ num_filters=gf_dim,
+ imgSize=s2,
+ stride=2,
+ name="gen_layer_h3",
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ param_attr_bn=param_attr_bn,
+ bn=True,
+ trans=True)
+
+
+ return conv_bn(h3_bn,
+ channels=gf_dim,
+ output_x=s2,
+ num_filters=c_dim,
+ imgSize=sample_dim,
+ stride=2,
+ name="gen_layer_h4",
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ param_attr_bn=param_attr_bn,
+ bn=False,
+ trans=True,
+ act=TanhActivation())
+
+
+def discriminator(sample):
+ """
+    discriminator outputs the probability that a sample is from the generator
+    or real data.
+    The output has two dimensions: dimension 0 is the probability
+    that the sample is from the generator and dimension 1 is the probability
+    that the sample is from real data.
+ """
+ param_attr = ParamAttr(is_static=is_generator_training,
+ initial_mean=0.0,
+ initial_std=0.02)
+ bias_attr = ParamAttr(is_static=is_generator_training,
+ initial_mean=0.0,
+ initial_std=0.0)
+
+ param_attr_bn=ParamAttr(is_static=is_generator_training,
+ initial_mean=1.0,
+ initial_std=0.02)
+
+ h0 = conv_bn(sample,
+ channels=c_dim,
+ imgSize=sample_dim,
+ num_filters=df_dim,
+ output_x=s2,
+ stride=2,
+ name="dis_h0",
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ param_attr_bn=param_attr_bn,
+ bn=False)
+
+ h1_bn = conv_bn(h0,
+ channels=df_dim,
+ imgSize=s2,
+ num_filters=df_dim*2,
+ output_x=s4,
+ stride=2,
+ name="dis_h1",
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ param_attr_bn=param_attr_bn,
+ bn=True)
+
+ h2_bn = conv_bn(h1_bn,
+ channels=df_dim*2,
+ imgSize=s4,
+ num_filters=df_dim*4,
+ output_x=s8,
+ stride=2,
+ name="dis_h2",
+ param_attr=param_attr,
+ bias_attr=bias_attr,
+ param_attr_bn=param_attr_bn,
+ bn=True)
+
+ return fc_layer(input=h2_bn, name="dis_prob", size=2,
+ bias_attr=bias_attr,
+ param_attr=param_attr,
+ act=SoftmaxActivation())
+
+
+
+if is_generator_training:
+ noise = data_layer(name="noise", size=noise_dim)
+ sample = generator(noise)
+
+if is_discriminator_training:
+ sample = data_layer(name="sample", size=sample_dim * sample_dim*c_dim)
+
+if is_generator_training or is_discriminator_training:
+ label = data_layer(name="label", size=1)
+ prob = discriminator(sample)
+ cost = cross_entropy(input=prob, label=label)
+ classification_error_evaluator(input=prob, label=label, name=mode+'_error')
+ outputs(cost)
+
+if is_generator:
+ noise = data_layer(name="noise", size=noise_dim)
+ outputs(generator(noise))
diff --git a/demo/gan/gan_trainer.py b/demo/gan/gan_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..72699952b961cb5bf6ac14dd65eee1aeab5e2a7c
--- /dev/null
+++ b/demo/gan/gan_trainer.py
@@ -0,0 +1,329 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import random
+import numpy
+import cPickle
+import sys,os
+from PIL import Image
+
+from paddle.trainer.config_parser import parse_config
+from paddle.trainer.config_parser import logger
+import py_paddle.swig_paddle as api
+import matplotlib.pyplot as plt
+
+def plot2DScatter(data, outputfile):
+ '''
+ Plot the data as a 2D scatter plot and save to outputfile
+    data needs to be two dimensional
+ '''
+ x = data[:, 0]
+ y = data[:, 1]
+ logger.info("The mean vector is %s" % numpy.mean(data, 0))
+ logger.info("The std vector is %s" % numpy.std(data, 0))
+
+ heatmap, xedges, yedges = numpy.histogram2d(x, y, bins=50)
+ extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
+
+ plt.clf()
+ plt.scatter(x, y)
+ plt.savefig(outputfile, bbox_inches='tight')
+
+def CHECK_EQ(a, b):
+ assert a == b, "a=%s, b=%s" % (a, b)
+
+def copy_shared_parameters(src, dst):
+ '''
+ copy the parameters from src to dst
+ :param src: the source of the parameters
+ :type src: GradientMachine
+ :param dst: the destination of the parameters
+ :type dst: GradientMachine
+ '''
+ src_params = [src.getParameter(i)
+ for i in xrange(src.getParameterSize())]
+ src_params = dict([(p.getName(), p) for p in src_params])
+
+
+ for i in xrange(dst.getParameterSize()):
+ dst_param = dst.getParameter(i)
+ src_param = src_params.get(dst_param.getName(), None)
+ if src_param is None:
+ continue
+ src_value = src_param.getBuf(api.PARAMETER_VALUE)
+ dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
+ CHECK_EQ(len(src_value), len(dst_value))
+ dst_value.copyFrom(src_value)
+ dst_param.setValueUpdated()
+
+def print_parameters(src):
+ src_params = [src.getParameter(i)
+ for i in xrange(src.getParameterSize())]
+
+ print "***************"
+ for p in src_params:
+ print "Name is %s" % p.getName()
+ print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray()
+
+def load_mnist_data(imageFile):
+ f = open(imageFile, "rb")
+ f.read(16)
+
+ # Define number of samples for train/test
+ if "train" in imageFile:
+ n = 60000
+ else:
+ n = 10000
+
+ data = numpy.fromfile(f, 'ubyte', count=n*28*28).reshape((n, 28*28))
+ data = data / 255.0 * 2.0 - 1.0
+
+ f.close()
+ return data.astype('float32')
+
+def load_cifar_data(cifar_path):
+ batch_size = 10000
+ data = numpy.zeros((5*batch_size, 32*32*3), dtype = "float32")
+ for i in range(1, 6):
+ file = cifar_path + "/data_batch_" + str(i)
+ fo = open(file, 'rb')
+ dict = cPickle.load(fo)
+ fo.close()
+ data[(i - 1)*batch_size:(i*batch_size), :] = dict["data"]
+
+ data = data / 255.0 * 2.0 - 1.0
+ return data
+
+# synthesize 2-D uniform data
+def load_uniform_data():
+ data = numpy.random.rand(1000000, 2).astype('float32')
+ return data
+
+def merge(images, size):
+ if images.shape[1] == 28*28:
+ h, w, c = 28, 28, 1
+ else:
+ h, w, c = 32, 32, 3
+ img = numpy.zeros((h * size[0], w * size[1], c))
+ for idx in xrange(size[0] * size[1]):
+ i = idx % size[1]
+ j = idx // size[1]
+ img[j*h:j*h+h, i*w:i*w+w, :] = \
+ ((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0)
+ return img.astype('uint8')
+
+def save_images(images, path):
+ merged_img = merge(images, [8, 8])
+ if merged_img.shape[2] == 1:
+ im = Image.fromarray(numpy.squeeze(merged_img)).convert('RGB')
+ else:
+ im = Image.fromarray(merged_img, mode="RGB")
+ im.save(path)
+
+def get_real_samples(batch_size, data_np):
+ return data_np[numpy.random.choice(data_np.shape[0], batch_size,
+ replace=False),:]
+
+def get_noise(batch_size, noise_dim):
+ return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
+
+def get_fake_samples(generator_machine, batch_size, noise):
+ gen_inputs = api.Arguments.createArguments(1)
+ gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
+ gen_outputs = api.Arguments.createArguments(0)
+ generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
+ fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
+ return fake_samples
+
+def get_training_loss(training_machine, inputs):
+ outputs = api.Arguments.createArguments(0)
+ training_machine.forward(inputs, outputs, api.PASS_TEST)
+ loss = outputs.getSlotValue(0).copyToNumpyMat()
+ return numpy.mean(loss)
+
+def prepare_discriminator_data_batch_pos(batch_size, data_np):
+ real_samples = get_real_samples(batch_size, data_np)
+ labels = numpy.ones(batch_size, dtype='int32')
+ inputs = api.Arguments.createArguments(2)
+ inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(real_samples))
+ inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
+ return inputs
+
+def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
+ fake_samples = get_fake_samples(generator_machine, batch_size, noise)
+ labels = numpy.zeros(batch_size, dtype='int32')
+ inputs = api.Arguments.createArguments(2)
+ inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(fake_samples))
+ inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
+ return inputs
+
+def prepare_generator_data_batch(batch_size, noise):
+ label = numpy.ones(batch_size, dtype='int32')
+ inputs = api.Arguments.createArguments(2)
+ inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
+ inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(label))
+ return inputs
+
+
+def find(iterable, cond):
+ for item in iterable:
+ if cond(item):
+ return item
+ return None
+
+
+def get_layer_size(model_conf, layer_name):
+ layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
+ assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
+ return layer_conf.size
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform")
+ parser.add_argument("--use_gpu", default="1",
+ help="1 means use gpu for training")
+ parser.add_argument("--gpu_id", default="0",
+ help="the gpu_id parameter")
+ args = parser.parse_args()
+ data_source = args.data_source
+ use_gpu = args.use_gpu
+ assert data_source in ["mnist", "cifar", "uniform"]
+ assert use_gpu in ["0", "1"]
+
+ if not os.path.exists("./%s_samples/" % data_source):
+ os.makedirs("./%s_samples/" % data_source)
+
+ if not os.path.exists("./%s_params/" % data_source):
+ os.makedirs("./%s_params/" % data_source)
+
+ api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10', '--log_period=100',
+ '--gpu_id=' + args.gpu_id, '--save_dir=' + "./%s_params/" % data_source)
+
+ if data_source == "uniform":
+ conf = "gan_conf.py"
+ num_iter = 10000
+ else:
+ conf = "gan_conf_image.py"
+ num_iter = 1000
+
+ gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
+ dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source)
+ generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
+ batch_size = dis_conf.opt_config.batch_size
+ noise_dim = get_layer_size(gen_conf.model_config, "noise")
+
+ if data_source == "mnist":
+ data_np = load_mnist_data("./data/mnist_data/train-images-idx3-ubyte")
+ elif data_source == "cifar":
+ data_np = load_cifar_data("./data/cifar-10-batches-py/")
+ else:
+ data_np = load_uniform_data()
+
+ # this creates a gradient machine for discriminator
+ dis_training_machine = api.GradientMachine.createFromConfigProto(
+ dis_conf.model_config)
+ # this create a gradient machine for generator
+ gen_training_machine = api.GradientMachine.createFromConfigProto(
+ gen_conf.model_config)
+
+ # generator_machine is used to generate data only, which is used for
+ # training discriminator
+ logger.info(str(generator_conf.model_config))
+ generator_machine = api.GradientMachine.createFromConfigProto(
+ generator_conf.model_config)
+
+ dis_trainer = api.Trainer.create(
+ dis_conf, dis_training_machine)
+
+ gen_trainer = api.Trainer.create(
+ gen_conf, gen_training_machine)
+
+ dis_trainer.startTrain()
+ gen_trainer.startTrain()
+
+ # Sync parameters between networks (GradientMachine) at the beginning
+ copy_shared_parameters(gen_training_machine, dis_training_machine)
+ copy_shared_parameters(gen_training_machine, generator_machine)
+
+ # constrain that either discriminator or generator can not be trained
+ # consecutively more than MAX_strike times
+ curr_train = "dis"
+ curr_strike = 0
+ MAX_strike = 5
+
+ for train_pass in xrange(100):
+ dis_trainer.startTrainPass()
+ gen_trainer.startTrainPass()
+ for i in xrange(num_iter):
+ # Do forward pass in discriminator to get the dis_loss
+ noise = get_noise(batch_size, noise_dim)
+ data_batch_dis_pos = prepare_discriminator_data_batch_pos(
+ batch_size, data_np)
+ dis_loss_pos = get_training_loss(dis_training_machine, data_batch_dis_pos)
+
+ data_batch_dis_neg = prepare_discriminator_data_batch_neg(
+ generator_machine, batch_size, noise)
+ dis_loss_neg = get_training_loss(dis_training_machine, data_batch_dis_neg)
+
+ dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0
+
+ # Do forward pass in generator to get the gen_loss
+ data_batch_gen = prepare_generator_data_batch(
+ batch_size, noise)
+ gen_loss = get_training_loss(gen_training_machine, data_batch_gen)
+
+ if i % 100 == 0:
+ print "d_pos_loss is %s d_neg_loss is %s" % (dis_loss_pos, dis_loss_neg)
+ print "d_loss is %s g_loss is %s" % (dis_loss, gen_loss)
+
+ # Decide which network to train based on the training history
+ # And the relative size of the loss
+ if (not (curr_train == "dis" and curr_strike == MAX_strike)) and \
+ ((curr_train == "gen" and curr_strike == MAX_strike) or dis_loss > gen_loss):
+ if curr_train == "dis":
+ curr_strike += 1
+ else:
+ curr_train = "dis"
+ curr_strike = 1
+ dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg)
+ dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)
+ copy_shared_parameters(dis_training_machine, gen_training_machine)
+
+ else:
+ if curr_train == "gen":
+ curr_strike += 1
+ else:
+ curr_train = "gen"
+ curr_strike = 1
+ gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
+ # TODO: add API for paddle to allow true parameter sharing between different GradientMachines
+ # so that we do not need to copy shared parameters.
+ copy_shared_parameters(gen_training_machine, dis_training_machine)
+ copy_shared_parameters(gen_training_machine, generator_machine)
+
+ dis_trainer.finishTrainPass()
+ gen_trainer.finishTrainPass()
+ # At the end of each pass, save the generated samples/images
+ fake_samples = get_fake_samples(generator_machine, batch_size, noise)
+ if data_source == "uniform":
+ plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
+ else:
+ save_images(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
+ dis_trainer.finishTrain()
+ gen_trainer.finishTrain()
+
+if __name__ == '__main__':
+ main()
diff --git a/demo/image_classification/predict.sh b/demo/image_classification/predict.sh
old mode 100644
new mode 100755
diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh
index ed9b5220fff6a434cd332f0972d39c4149b3ebfe..db0a057bf35b4ad04a08a1e3f1fad3bd6a486350 100755
--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@@ -24,7 +24,7 @@ paddle train \
--test_all_data_in_one_period=1 \
--use_gpu=1 \
--trainer_count=1 \
---num_passes=200 \
+--num_passes=300 \
--save_dir=$output \
2>&1 | tee $log
diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/demo/model_zoo/embedding/pre_DictAndModel.sh
index 7821850fb25cc5b87aa305c2113efbf50b093ed1..6d647f5dd9368eaf81c19386511c7d231e4799e3 100755
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -18,7 +18,5 @@ set -x
# download the dictionary and pretrained model
for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
do
- # following is the google drive address
- # you can also directly download from https://pan.baidu.com/s/1o8q577s
- wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
+ wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
done
diff --git a/demo/model_zoo/resnet/get_model.sh b/demo/model_zoo/resnet/get_model.sh
index 89312d43edf8e4e7d639be73d5b3983ea916b902..133d08fca431540f2ed5cd6e63b51d9ce3a1b344 100755
--- a/demo/model_zoo/resnet/get_model.sh
+++ b/demo/model_zoo/resnet/get_model.sh
@@ -24,9 +24,7 @@ echo "Downloading ResNet models..."
for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz
do
- # following is the google drive address
- # you can also directly download from https://pan.baidu.com/s/1o8q577s
- wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
+ wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
tar -xvf $file
rm $file
done
diff --git a/demo/quick_start/data/README.md b/demo/quick_start/data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..63abcf7ebf31903213e44cf492b93e09f61db14e
--- /dev/null
+++ b/demo/quick_start/data/README.md
@@ -0,0 +1,9 @@
+This dataset consists of electronics product reviews associated with
+binary labels (positive/negative) for sentiment classification.
+
+The preprocessed data can be downloaded by script `get_data.sh`.
+The data was derived from reviews_Electronics_5.json.gz at
+
+http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+
+If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
diff --git a/demo/quick_start/data/get_data.sh b/demo/quick_start/data/get_data.sh
index f355d63225b28ab495b34e72dd3be8d237ae08f4..952de3f3c8f52a7a6f84412f9b38f16ac2503ac2 100755
--- a/demo/quick_start/data/get_data.sh
+++ b/demo/quick_start/data/get_data.sh
@@ -17,14 +17,11 @@ set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR
-echo "Downloading Amazon Electronics reviews data..."
-# http://jmcauley.ucsd.edu/data/amazon/
-wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+# Download the preprocessed data
+wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
-echo "Downloading mosesdecoder..."
-#https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+# Extract package
+tar zxvf preprocessed_data.tar.gz
-unzip master.zip
-rm master.zip
-echo "Done."
+# Remove compressed package
+rm preprocessed_data.tar.gz
diff --git a/demo/quick_start/data/pred.list b/demo/quick_start/data/pred.list
deleted file mode 100644
index d88b2b63851101a8b40e706b32d8c17b5fabb201..0000000000000000000000000000000000000000
--- a/demo/quick_start/data/pred.list
+++ /dev/null
@@ -1 +0,0 @@
-./data/pred.txt
diff --git a/demo/quick_start/data/pred.txt b/demo/quick_start/data/pred.txt
deleted file mode 100644
index 6ed5f738ddaff6645448d5e606dcef1baf01b282..0000000000000000000000000000000000000000
--- a/demo/quick_start/data/pred.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-the device is cute , but that 's just about all that 's good. the specs are what you 'd expect : it 's a wifi mic , with some noise filter options. the app has the option to upload your baby 's name and photo , which is a cutesy touch. but the app is otherwise unstable and useless unless you upgrade for $ 60 / year.set up involves downloading the app , turning on the mic , switching your phone to the wifi network of the mic , telling the app your wifi settings , switching your wifi back to your home router. the app is then directly connected to your mic.the app is adware ! the main screen says " cry notifications on / off : upgrade to evoz premium and receive a text message of email when your baby is crying " .but the adware points out an important limitation , this monitor is only intended to be used from your home network. if you want to access it remotely , get a webcam. this app would make a lot more sense of the premium features were included with the hardware .
-don 't be fooled by my one star rating. if there was a zero , i would have selected it. this product was a waste of my money.it has never worked like the company said it supposed to. i only have one device , an iphone 4gs. after charging the the iphone mid way , the i.sound portable power max 16,000 mah is completely drained. the led light no longer lit up. when plugging the isound portable power max into a wall outlet to charge , it would charge for about 20-30 minutes and then all four battery led indicator lit up showing a full charge. i would leave it on to charge for the full 8 hours or more but each time with the same result upon using. don 't buy this thing. put your money to good use elsewhere .
diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/data/proc_from_raw_data/get_data.sh
similarity index 65%
rename from demo/quick_start/preprocess.sh
rename to demo/quick_start/data/proc_from_raw_data/get_data.sh
index c9190e2dd2ef754bf3c7287006322b52493dc3a0..cd85e26842dfccea78e4f26bdfee938887021f03 100755
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/data/proc_from_raw_data/get_data.sh
@@ -16,10 +16,26 @@
# 1. size of pos : neg = 1:1.
# 2. size of testing set = min(25k, len(all_data) * 0.1), others is traning set.
# 3. distinct train set and test set.
-# 4. build dict
set -e
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd $DIR
+
+# Download data
+echo "Downloading Amazon Electronics reviews data..."
+# http://jmcauley.ucsd.edu/data/amazon/
+wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+echo "Downloading mosesdecoder..."
+# https://github.com/moses-smt/mosesdecoder
+wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+
+unzip master.zip
+rm master.zip
+
+##################
+# Preprocess data
+echo "Preprocess data..."
export LC_ALL=C
UNAME_STR=`uname`
@@ -29,11 +45,11 @@ else
SHUF_PROG='gshuf'
fi
-mkdir -p data/tmp
-python preprocess.py -i data/reviews_Electronics_5.json.gz
+mkdir -p tmp
+python preprocess.py -i reviews_Electronics_5.json.gz
# uniq and shuffle
-cd data/tmp
-echo 'uniq and shuffle...'
+cd tmp
+echo 'Uniq and shuffle...'
cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
@@ -53,11 +69,11 @@ cat train.pos train.neg | ${SHUF_PROG} >../train.txt
cat test.pos test.neg | ${SHUF_PROG} >../test.txt
cd -
-echo 'data/train.txt' > data/train.list
-echo 'data/test.txt' > data/test.list
+echo 'train.txt' > train.list
+echo 'test.txt' > test.list
# use 30k dict
-rm -rf data/tmp
-mv data/dict.txt data/dict_all.txt
-cat data/dict_all.txt | head -n 30001 > data/dict.txt
-echo 'preprocess finished'
+rm -rf tmp
+mv dict.txt dict_all.txt
+cat dict_all.txt | head -n 30001 > dict.txt
+echo 'Done.'
diff --git a/demo/quick_start/preprocess.py b/demo/quick_start/data/proc_from_raw_data/preprocess.py
similarity index 95%
rename from demo/quick_start/preprocess.py
rename to demo/quick_start/data/proc_from_raw_data/preprocess.py
index d87fad632a7429f7d9682badabe4c72ca127354f..56c2c5f16ceb63ff88fa51ed78c2e77ea5b64592 100755
--- a/demo/quick_start/preprocess.py
+++ b/demo/quick_start/data/proc_from_raw_data/preprocess.py
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
-1. (remove HTML before or not)tokensizing
+1. Tokenize the words and punctuation
2. pos sample : rating score 5; neg sample: rating score 1-2.
Usage:
@@ -76,7 +76,11 @@ def tokenize(sentences):
sentences : a list of input sentences.
return: a list of processed text.
"""
- dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+ dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
+ if not os.path.exists(dir):
+ sys.exit(
+ "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
+ )
tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
assert isinstance(sentences, list)
text = "\n".join(sentences)
@@ -104,7 +108,7 @@ def tokenize_batch(id):
num_batch, instance, pre_fix = parse_queue.get()
if num_batch == -1: ### parse_queue finished
tokenize_queue.put((-1, None, None))
- sys.stderr.write("tokenize theread %s finish\n" % (id))
+ sys.stderr.write("Thread %s finish\n" % (id))
break
tokenize_instance = tokenize(instance)
tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py
index 2982e54c665b41400aab0a893ff3c76335404988..daca5f01cf2b3bd231bf530f17ec760272ce93e0 100644
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -17,24 +17,15 @@ import os
from optparse import OptionParser
-def extract_dict_features(pair_file, feature_file, src_dict_file,
- tgt_dict_file):
- src_dict = set()
- tgt_dict = set()
-
- with open(pair_file) as fin, open(feature_file, 'w') as feature_out, open(
- src_dict_file, 'w') as src_dict_out, open(tgt_dict_file,
- 'w') as tgt_dict_out:
+def extract_dict_features(pair_file, feature_file):
+
+ with open(pair_file) as fin, open(feature_file, 'w') as feature_out:
for line in fin:
- sentence, labels = line.strip().split('\t')
+ sentence, predicate, labels = line.strip().split('\t')
sentence_list = sentence.split()
labels_list = labels.split()
- src_dict.update(sentence_list)
- tgt_dict.update(labels_list)
-
verb_index = labels_list.index('B-V')
- verb_feature = sentence_list[verb_index]
mark = [0] * len(labels_list)
if verb_index > 0:
@@ -42,47 +33,50 @@ def extract_dict_features(pair_file, feature_file, src_dict_file,
ctx_n1 = sentence_list[verb_index - 1]
else:
ctx_n1 = 'bos'
- ctx_n1_feature = ctx_n1
+
+ if verb_index > 1:
+ mark[verb_index - 2] = 1
+ ctx_n2 = sentence_list[verb_index - 2]
+ else:
+ ctx_n2 = 'bos'
mark[verb_index] = 1
- ctx_0_feature = sentence_list[verb_index]
+ ctx_0 = sentence_list[verb_index]
if verb_index < len(labels_list) - 2:
mark[verb_index + 1] = 1
ctx_p1 = sentence_list[verb_index + 1]
else:
ctx_p1 = 'eos'
- ctx_p1_feature = ctx_p1
+
+ if verb_index < len(labels_list) - 3:
+ mark[verb_index + 2] = 1
+ ctx_p2 = sentence_list[verb_index + 2]
+ else:
+ ctx_p2 = 'eos'
+
feature_str = sentence + '\t' \
- + verb_feature + '\t' \
- + ctx_n1_feature + '\t' \
- + ctx_0_feature + '\t' \
- + ctx_p1_feature + '\t' \
+ + predicate + '\t' \
+ + ctx_n2 + '\t' \
+ + ctx_n1 + '\t' \
+ + ctx_0 + '\t' \
+ + ctx_p1 + '\t' \
+ + ctx_p2 + '\t' \
+ ' '.join([str(i) for i in mark]) + '\t' \
+ labels
feature_out.write(feature_str + '\n')
- src_dict_out.write('\n')
- src_dict_out.write('\n'.join(list(src_dict)))
-
- tgt_dict_out.write('\n'.join(list(tgt_dict)))
if __name__ == '__main__':
- usage = '-p pair_file -f feature_file -s source dictionary -t target dictionary '
+ usage = '-p pair_file -f feature_file'
parser = OptionParser(usage)
parser.add_option('-p', dest='pair_file', help='the pair file')
- parser.add_option(
- '-f', dest='feature_file', help='the file to store feature')
- parser.add_option(
- '-s', dest='src_dict', help='the file to store source dictionary')
- parser.add_option(
- '-t', dest='tgt_dict', help='the file to store target dictionary')
+ parser.add_option('-f', dest='feature_file', help='the feature file')
(options, args) = parser.parse_args()
- extract_dict_features(options.pair_file, options.feature_file,
- options.src_dict, options.tgt_dict)
+ extract_dict_features(options.pair_file, options.feature_file)
diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py
index 4d1bef8f958a62be9941d474a0b67542dcc5cfab..86ab00ce41723169de035a841d9e129a1b9e82a3 100644
--- a/demo/semantic_role_labeling/data/extract_pairs.py
+++ b/demo/semantic_role_labeling/data/extract_pairs.py
@@ -51,7 +51,7 @@ def read_sentences(words_file):
for line in fin:
line = line.strip()
if line == '':
- sentences.append(s.lower())
+ sentences.append(s)
s = ''
else:
s += line + ' '
@@ -64,6 +64,11 @@ def transform_labels(sentences, labels):
if len(labels[i]) == 1:
continue
else:
+ verb_list = []
+ for x in labels[i][0]:
+ if x !='-':
+ verb_list.append(x)
+
for j in xrange(1, len(labels[i])):
label_list = labels[i][j]
current_tag = 'O'
@@ -88,8 +93,7 @@ def transform_labels(sentences, labels):
is_in_bracket = True
else:
print 'error:', ll
-
- sen_lab_pair.append((sentences[i], label_seq))
+ sen_lab_pair.append((sentences[i], verb_list[j-1], label_seq))
return sen_lab_pair
@@ -97,9 +101,9 @@ def write_file(sen_lab_pair, output_file):
with open(output_file, 'w') as fout:
for x in sen_lab_pair:
sentence = x[0]
- label_seq = ' '.join(x[1])
- assert len(sentence.split()) == len(x[1])
- fout.write(sentence + '\t' + label_seq + '\n')
+ label_seq = ' '.join(x[2])
+ assert len(sentence.split()) == len(x[2])
+ fout.write(sentence + '\t' + x[1]+'\t' +label_seq + '\n')
if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
index 268c0995e27006ec62f38bdda9b0a0994dab096c..99487e0d9a8c31d884c4a338386ad0ff8e5d9dc7 100644
--- a/demo/semantic_role_labeling/data/get_data.sh
+++ b/demo/semantic_role_labeling/data/get_data.sh
@@ -14,6 +14,10 @@
# limitations under the License.
set -e
wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt
+wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb
tar -xzvf conll05st-tests.tar.gz
rm conll05st-tests.tar.gz
cp ./conll05st-release/test.wsj/words/test.wsj.words.gz .
@@ -22,4 +26,4 @@ gunzip test.wsj.words.gz
gunzip test.wsj.props.gz
python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair
-python extract_dict_feature.py -p test.wsj.seq_pair -f feature -s src.dict -t tgt.dict
+python extract_dict_feature.py -p test.wsj.seq_pair -f feature
diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py
index 5c003584a52d459f13b7942ebe3a7147ac58a42f..2c8e13462730a2e980fa1c3fe342ef0e062ab5d7 100644
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -17,41 +17,52 @@ from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 0
-def hook(settings, word_dict, label_dict, **kwargs):
+def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
settings.word_dict = word_dict
settings.label_dict = label_dict
+ settings.predicate_dict = predicate_dict
+
#all inputs are integral and sequential type
settings.slots = [
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
- integer_value_sequence(len(word_dict)), integer_value_sequence(2),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(word_dict)),
+ integer_value_sequence(len(predicate_dict)),
+ integer_value_sequence(2),
integer_value_sequence(len(label_dict))
]
-@provider(init_hook=hook)
-def process(obj, file_name):
+def get_batch_size(yeild_data):
+ return len(yeild_data[0])
+
+
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+ can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
with open(file_name, 'r') as fdata:
for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = \
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
line.strip().split('\t')
-
+
words = sentence.split()
sen_len = len(words)
- word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
+ word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
- predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
- ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
- ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX)] * sen_len
- ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+ ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+ ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+ ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+ ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
label_list = label.split()
- label_slot = [obj.label_dict.get(w) for w in label_list]
-
- yield word_slot, predicate_slot, ctx_n1_slot, \
- ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
+ label_slot = [settings.label_dict.get(w) for w in label_list]
+ yield word_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot
diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py
index e3f6edad6972112ed04e173a9b714e3fec13d402..54ceff0e724220cc9ea96b9e0ec6844947a8343e 100644
--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
@@ -18,8 +18,9 @@ import sys
from paddle.trainer_config_helpers import *
#file paths
-word_dict_file = './data/src.dict'
-label_dict_file = './data/tgt.dict'
+word_dict_file = './data/wordDict.txt'
+label_dict_file = './data/targetDict.txt'
+predicate_file= './data/verbDict.txt'
train_list_file = './data/train.list'
test_list_file = './data/test.list'
@@ -30,8 +31,10 @@ if not is_predict:
#load dictionaries
word_dict = dict()
label_dict = dict()
+ predicate_dict = dict()
with open(word_dict_file, 'r') as f_word, \
- open(label_dict_file, 'r') as f_label:
+ open(label_dict_file, 'r') as f_label, \
+ open(predicate_file, 'r') as f_pre:
for i, line in enumerate(f_word):
w = line.strip()
word_dict[w] = i
@@ -40,6 +43,11 @@ if not is_predict:
w = line.strip()
label_dict[w] = i
+ for i, line in enumerate(f_pre):
+ w = line.strip()
+ predicate_dict[w] = i
+
+
if is_test:
train_list_file = None
@@ -50,91 +58,157 @@ if not is_predict:
module='dataprovider',
obj='process',
args={'word_dict': word_dict,
- 'label_dict': label_dict})
+ 'label_dict': label_dict,
+ 'predicate_dict': predicate_dict })
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
+ pred_len = len(predicate_dict)
else:
word_dict_len = get_config_arg('dict_len', int)
label_dict_len = get_config_arg('label_len', int)
+ pred_len = get_config_arg('pred_len', int)
+############################## Hyper-parameters ##################################
mark_dict_len = 2
word_dim = 32
mark_dim = 5
-hidden_dim = 128
+hidden_dim = 512
depth = 8
-emb_lr = 1e-2
-fc_lr = 1e-2
-lstm_lr = 2e-2
+
+
+
+########################### Optimizer #######################################
+
settings(
batch_size=150,
- learning_method=AdamOptimizer(),
- learning_rate=1e-3,
+ learning_method=MomentumOptimizer(momentum=0),
+ learning_rate=2e-2,
regularization=L2Regularization(8e-4),
- gradient_clipping_threshold=25)
+ is_async=False,
+ model_average=ModelAverage(average_window=0.5,
+ max_average_window=10000),
+
+)
-#6 features
+
+
+
+####################################### network ##############################
+#8 features and 1 target
word = data_layer(name='word_data', size=word_dict_len)
-predicate = data_layer(name='verb_data', size=word_dict_len)
+predicate = data_layer(name='verb_data', size=pred_len)
+
+ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len)
ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len)
ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len)
ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
+ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
mark = data_layer(name='mark_data', size=mark_dict_len)
+
if not is_predict:
target = data_layer(name='target', size=label_dict_len)
-ptt = ParameterAttribute(name='src_emb', learning_rate=emb_lr)
-layer_attr = ExtraLayerAttribute(drop_rate=0.5)
-fc_para_attr = ParameterAttribute(learning_rate=fc_lr)
-lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=lstm_lr)
-para_attr = [fc_para_attr, lstm_para_attr]
-word_embedding = embedding_layer(size=word_dim, input=word, param_attr=ptt)
-predicate_embedding = embedding_layer(
- size=word_dim, input=predicate, param_attr=ptt)
-ctx_n1_embedding = embedding_layer(size=word_dim, input=ctx_n1, param_attr=ptt)
-ctx_0_embedding = embedding_layer(size=word_dim, input=ctx_0, param_attr=ptt)
-ctx_p1_embedding = embedding_layer(size=word_dim, input=ctx_p1, param_attr=ptt)
-mark_embedding = embedding_layer(size=mark_dim, input=mark)
+default_std=1/math.sqrt(hidden_dim)/3.0
+
+emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
+std_0 = ParameterAttribute(initial_std=0.)
+std_default = ParameterAttribute(initial_std=default_std)
+
+predicate_embedding = embedding_layer(size=word_dim, input=predicate, param_attr=ParameterAttribute(name='vemb',initial_std=default_std))
+mark_embedding = embedding_layer(name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
+
+word_input=[word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para) for x in word_input]
+emb_layers.append(predicate_embedding)
+emb_layers.append(mark_embedding)
hidden_0 = mixed_layer(
+ name='hidden0',
size=hidden_dim,
- input=[
- full_matrix_projection(input=word_embedding),
- full_matrix_projection(input=predicate_embedding),
- full_matrix_projection(input=ctx_n1_embedding),
- full_matrix_projection(input=ctx_0_embedding),
- full_matrix_projection(input=ctx_p1_embedding),
- full_matrix_projection(input=mark_embedding),
- ])
+ bias_attr=std_default,
+ input=[ full_matrix_projection(input=emb, param_attr=std_default ) for emb in emb_layers ])
+
-lstm_0 = lstmemory(input=hidden_0, layer_attr=layer_attr)
+mix_hidden_lr = 1e-3
+lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
+hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr)
+
+lstm_0 = lstmemory(name='lstm0',
+ input=hidden_0,
+ act=ReluActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=SigmoidActivation(),
+ bias_attr=std_0,
+ param_attr=lstm_para_attr)
#stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
+
for i in range(1, depth):
- fc = fc_layer(input=input_tmp, size=hidden_dim, param_attr=para_attr)
+ mix_hidden = mixed_layer(name='hidden'+str(i),
+ size=hidden_dim,
+ bias_attr=std_default,
+ input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
+ full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
+ ]
+ )
+
+ lstm = lstmemory(name='lstm'+str(i),
+ input=mix_hidden,
+ act=ReluActivation(),
+ gate_act=SigmoidActivation(),
+ state_act=SigmoidActivation(),
+ reverse=((i % 2)==1),
+ bias_attr=std_0,
+ param_attr=lstm_para_attr)
+
+ input_tmp = [mix_hidden, lstm]
+
+feature_out = mixed_layer(name='output',
+ size=label_dict_len,
+ bias_attr=std_default,
+ input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
+ full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
+ ],
+ )
- lstm = lstmemory(
- input=fc,
- act=ReluActivation(),
- reverse=(i % 2) == 1,
- layer_attr=layer_attr)
- input_tmp = [fc, lstm]
-prob = fc_layer(
- input=input_tmp,
- size=label_dict_len,
- act=SoftmaxActivation(),
- param_attr=para_attr)
if not is_predict:
- cls = classification_cost(input=prob, label=target)
- outputs(cls)
+ crf_l = crf_layer( name = 'crf',
+ size = label_dict_len,
+ input = feature_out,
+ label = target,
+ param_attr=ParameterAttribute(name='crfw',initial_std=default_std, learning_rate=mix_hidden_lr)
+
+ )
+
+
+ crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
+ size = label_dict_len,
+ input = feature_out,
+ label = target,
+ param_attr=ParameterAttribute(name='crfw')
+ )
+
+
+ eval = sum_evaluator(input=crf_dec_l)
+
+ outputs(crf_l)
+
else:
- outputs(prob)
+ crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
+ size = label_dict_len,
+ input = feature_out,
+ param_attr=ParameterAttribute(name='crfw')
+ )
+
+ outputs(crf_dec_l)
+
diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py
index f051d4175cf6fff43bd7f84b457ab9dd12405a15..a7f1e8f81f59f6fe95fd29593ef1a826e652e570 100644
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
@@ -26,7 +26,7 @@ UNK_IDX = 0
class Prediction():
- def __init__(self, train_conf, dict_file, model_dir, label_file):
+ def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file):
"""
train_conf: trainer configure.
dict_file: word dictionary file name.
@@ -35,26 +35,37 @@ class Prediction():
self.dict = {}
self.labels = {}
+ self.predicate_dict={}
self.labels_reverse = {}
- self.load_dict_label(dict_file, label_file)
+ self.load_dict_label(dict_file, label_file, predicate_dict_file)
len_dict = len(self.dict)
len_label = len(self.labels)
-
- conf = parse_config(train_conf, 'dict_len=' + str(len_dict) +
- ',label_len=' + str(len_label) + ',is_predict=True')
+ len_pred = len(self.predicate_dict)
+
+ conf = parse_config(
+ train_conf,
+ 'dict_len=' + str(len_dict) +
+ ',label_len=' + str(len_label) +
+ ',pred_len=' + str(len_pred) +
+ ',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
self.network.loadParameters(model_dir)
slots = [
- integer_value_sequence(len_dict), integer_value_sequence(len_dict),
- integer_value_sequence(len_dict), integer_value_sequence(len_dict),
- integer_value_sequence(len_dict), integer_value_sequence(2)
- ]
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_dict),
+ integer_value_sequence(len_pred),
+ integer_value_sequence(2)
+ ]
self.converter = DataProviderConverter(slots)
- def load_dict_label(self, dict_file, label_file):
+ def load_dict_label(self, dict_file, label_file, predicate_dict_file):
"""
Load dictionary from self.dict_file.
"""
@@ -65,39 +76,42 @@ class Prediction():
self.labels[line.strip()] = line_count
self.labels_reverse[line_count] = line.strip()
+ for line_count, line in enumerate(open(predicate_dict_file, 'r')):
+ self.predicate_dict[line.strip()] = line_count
def get_data(self, data_file):
"""
Get input data of paddle format.
"""
with open(data_file, 'r') as fdata:
for line in fdata:
- sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip(
+ sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip(
).split('\t')
words = sentence.split()
sen_len = len(words)
-
+
word_slot = [self.dict.get(w, UNK_IDX) for w in words]
- predicate_slot = [self.dict.get(predicate, UNK_IDX)] * sen_len
+ predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len
+ ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len
+ ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len
marks = mark.split()
mark_slot = [int(w) for w in marks]
+
+ yield word_slot, ctx_n2_slot, ctx_n1_slot, \
+ ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot
- yield word_slot, predicate_slot, ctx_n1_slot, \
- ctx_0_slot, ctx_p1_slot, mark_slot
-
- def predict(self, data_file):
+ def predict(self, data_file, output_file):
"""
data_file: file name of input data.
"""
input = self.converter(self.get_data(data_file))
output = self.network.forwardTest(input)
- prob = output[0]["value"]
- lab = list(np.argsort(-prob)[:, 0])
+ lab = output[0]["id"].tolist()
- with open(data_file, 'r') as fin, open('predict.res', 'w') as fout:
+ with open(data_file, 'r') as fin, open(output_file, 'w') as fout:
index = 0
for line in fin:
sen = line.split('\t')[0]
@@ -109,8 +123,8 @@ class Prediction():
def option_parser():
- usage = ("python predict.py -c config -w model_dir "
- "-d word dictionary -l label_file -i input_file")
+ usage = ("python predict.py -c config -w model_dir "
+ "-d word dictionary -l label_file -i input_file -p pred_dict_file")
parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option(
"-c",
@@ -131,6 +145,13 @@ def option_parser():
dest="label_file",
default=None,
help="label file")
+ parser.add_option(
+ "-p",
+ "--predict_dict_file",
+ action="store",
+ dest="predict_dict_file",
+ default=None,
+ help="predict_dict_file")
parser.add_option(
"-i",
"--data",
@@ -144,6 +165,14 @@ def option_parser():
dest="model_path",
default=None,
help="model path")
+
+ parser.add_option(
+ "-o",
+ "--output_file",
+ action="store",
+ dest="output_file",
+ default=None,
+ help="output file")
return parser.parse_args()
@@ -154,10 +183,12 @@ def main():
dict_file = options.dict_file
model_path = options.model_path
label_file = options.label_file
+ predict_dict_file = options.predict_dict_file
+ output_file = options.output_file
swig_paddle.initPaddle("--use_gpu=0")
- predict = Prediction(train_conf, dict_file, model_path, label_file)
- predict.predict(data_file)
+ predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file)
+ predict.predict(data_file,output_file)
if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh
old mode 100644
new mode 100755
index a545b9a5d591b41bdbd54905cbbffc410abc8fb0..88ab5898f7d41056f4fe549b3145760783b27bf9
--- a/demo/semantic_role_labeling/predict.sh
+++ b/demo/semantic_role_labeling/predict.sh
@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
- sort | head -n 1
+ sort -n | head -n 1
}
log=train.log
@@ -26,15 +26,18 @@ LOG=`get_best_pass $log`
LOG=(${LOG})
best_model_path="output/pass-${LOG[1]}"
-
config_file=db_lstm.py
-dict_file=./data/src.dict
-label_file=./data/tgt.dict
+dict_file=./data/wordDict.txt
+label_file=./data/targetDict.txt
+predicate_dict_file=./data/verbDict.txt
input_file=./data/feature
+output_file=predict.res
python predict.py \
-c $config_file \
-w $best_model_path \
-l $label_file \
+ -p $predicate_dict_file \
-d $dict_file \
- -i $input_file
+ -i $input_file \
+ -o $output_file
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
old mode 100644
new mode 100755
index 844649e8c0f6867dc0766e4ec6f250c5a4a004d9..f9e1bdcd4c752474329d36c4de3378f7d58e7b4b
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
- sort | head -n 1
+ sort -n | head -n 1
}
log=train.log
@@ -36,4 +36,5 @@ paddle train \
--job=test \
--use_gpu=false \
--config_args=is_test=1 \
+ --test_all_data_in_one_period=1 \
2>&1 | tee 'test.log'
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
old mode 100644
new mode 100755
index c3a22b644be0ca08a2af73a57c09657014e49bfc..420768bb2b4ebed7b135a49c5eee5e5538426ae1
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -16,11 +16,14 @@
set -e
paddle train \
--config=./db_lstm.py \
+ --use_gpu=0 \
+ --log_period=5000 \
+ --trainer_count=1 \
+ --show_parameter_stats_period=5000 \
--save_dir=./output \
- --trainer_count=4 \
- --log_period=10 \
- --num_passes=500 \
- --use_gpu=false \
- --show_parameter_stats_period=10 \
+ --num_passes=10000 \
+ --average_test_period=10000000 \
+ --init_model_path=./data \
+ --load_missing_parameter_strategy=rand \
--test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
+ 2>&1 | tee 'train.log'
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
index 098fbb91389b89c8b69ccf2f5d308e4e715ac950..c8b12a0e89dbddea56b4ee069ebf66f8d8630615 100755
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -17,7 +17,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
- sort | head -n 1
+ sort -n | head -n 1
}
log=train.log
diff --git a/demo/seqToseq/data/paraphrase_data.sh b/demo/seqToseq/data/paraphrase_data.sh
index ea1f8dbcfad35699189f6cd4efc81d97e8c89148..1b3f1d45e11fbd5e600e58f583e503a603e484ff 100755
--- a/demo/seqToseq/data/paraphrase_data.sh
+++ b/demo/seqToseq/data/paraphrase_data.sh
@@ -16,9 +16,7 @@ set -e
set -x
# download the in-house paraphrase dataset
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz
# untar the dataset
tar -zxvf paraphrase.tar.gz
diff --git a/demo/seqToseq/data/wmt14_model.sh b/demo/seqToseq/data/wmt14_model.sh
index 2cec30688d27a57902cdf64d7be5712d12c69bdd..d6e7a732644dc188a165215ddf3f69e1514425eb 100755
--- a/demo/seqToseq/data/wmt14_model.sh
+++ b/demo/seqToseq/data/wmt14_model.sh
@@ -16,9 +16,7 @@ set -e
set -x
# download the pretrained model
-# following is the google drive address
-# you can also directly download from https://pan.baidu.com/s/1o8q577s
-wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
+wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz
# untar the model
tar -zxvf wmt14_model.tar.gz
diff --git a/demo/seqToseq/dataprovider.py b/demo/seqToseq/dataprovider.py
index c5da1b7685f47fda337921c7c60ac1497b9e48bb..127c3672c774d43d5b2afab3be79558842eb9e8f 100755
--- a/demo/seqToseq/dataprovider.py
+++ b/demo/seqToseq/dataprovider.py
@@ -19,27 +19,43 @@ START = ""
END = ""
-def hook(settings, src_dict, trg_dict, file_list, **kwargs):
+def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
+ **kwargs):
# job_mode = 1: training mode
# job_mode = 0: generating mode
- settings.job_mode = trg_dict is not None
- settings.src_dict = src_dict
+ settings.job_mode = not is_generating
+
+ def fun(dict_path):
+ out_dict = dict()
+ with open(dict_path, "r") as fin:
+ out_dict = {
+ line.strip(): line_count
+ for line_count, line in enumerate(fin)
+ }
+ return out_dict
+
+ settings.src_dict = fun(src_dict_path)
+ settings.trg_dict = fun(trg_dict_path)
+
settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
- settings.sample_count = 0
if settings.job_mode:
- settings.trg_dict = trg_dict
- settings.slots = [
+ settings.slots = {
+ 'source_language_word':
integer_value_sequence(len(settings.src_dict)),
+ 'target_language_word':
integer_value_sequence(len(settings.trg_dict)),
+ 'target_language_next_word':
integer_value_sequence(len(settings.trg_dict))
- ]
+ }
settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
else:
- settings.slots = [
+ settings.slots = {
+ 'source_language_word':
integer_value_sequence(len(settings.src_dict)),
+ 'sent_id':
integer_value_sequence(len(open(file_list[0], "r").readlines()))
- ]
+ }
def _get_ids(s, dictionary):
@@ -69,6 +85,10 @@ def process(settings, file_name):
continue
trg_ids_next = trg_ids + [settings.trg_dict[END]]
trg_ids = [settings.trg_dict[START]] + trg_ids
- yield src_ids, trg_ids, trg_ids_next
+ yield {
+ 'source_language_word': src_ids,
+ 'target_language_word': trg_ids,
+ 'target_language_next_word': trg_ids_next
+ }
else:
- yield src_ids, [line_count]
+ yield {'source_language_word': src_ids, 'sent_id': [line_count]}
diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index ad5e3339c1461de06732eb62aca9e8323eea707b..fc9db05ba706ee6eff6eb0ce0885a645ebd76340 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -37,17 +37,10 @@ def seq_to_seq_data(data_dir,
"""
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')
- src_dict = dict()
- for line_count, line in enumerate(open(src_lang_dict, "r")):
- src_dict[line.strip()] = line_count
- trg_dict = dict()
- for line_count, line in enumerate(open(trg_lang_dict, "r")):
- trg_dict[line.strip()] = line_count
if is_generating:
train_list = None
test_list = os.path.join(data_dir, gen_list)
- trg_dict = None
else:
train_list = os.path.join(data_dir, train_list)
test_list = os.path.join(data_dir, test_list)
@@ -57,8 +50,11 @@ def seq_to_seq_data(data_dir,
test_list,
module="dataprovider",
obj="process",
- args={"src_dict": src_dict,
- "trg_dict": trg_dict})
+ args={
+ "src_dict_path": src_lang_dict,
+ "trg_dict_path": trg_lang_dict,
+ "is_generating": is_generating
+ })
return {
"src_dict_path": src_lang_dict,
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index ef4e9d102d35fc95e96711175a57f7e181a946c6..efcf8b0ad3d6f2f831fe71f3c09163015cc1ac96 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -15,25 +15,11 @@ set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
# HTML output directory
set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
-
-set(PADDLE_DOXYGEN_OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/doxygen_xml")
-
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
"${BINARY_BUILD_DIR}/conf.py"
@ONLY)
-configure_file(
- "${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in"
- "${CMAKE_CURRENT_BINARY_DIR}/Doxyfile"
- @ONLY
- )
-
-add_custom_target(paddle_doxygen_docs ALL
- ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile
- WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-)
-
sphinx_add_target(paddle_docs
html
${BINARY_BUILD_DIR}
@@ -41,6 +27,5 @@ sphinx_add_target(paddle_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR})
-add_dependencies(paddle_docs
- gen_proto_py
- paddle_doxygen_docs)
+add_dependencies(paddle_docs
+ gen_proto_py)
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
deleted file mode 100644
index a1fc3801925dd340709ac77c9aa77c82051ee111..0000000000000000000000000000000000000000
--- a/doc/Doxyfile.in
+++ /dev/null
@@ -1,2384 +0,0 @@
-# Doxyfile 1.8.10
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a double hash (##) is considered a comment and is placed in
-# front of the TAG it is preceding.
-#
-# All text after a single hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists, items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (\" \").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
-# The default value is: UTF-8.
-
-DOXYFILE_ENCODING = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
-# double-quotes, unless you are using Doxywizard) that should identify the
-# project for which the documentation is generated. This name is used in the
-# title of most generated pages and in a few other places.
-# The default value is: My Project.
-
-PROJECT_NAME = "paddle"
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
-# could be handy for archiving the generated documentation or if some version
-# control system is used.
-
-PROJECT_NUMBER = 1.0.0
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description
-# for a project that appears at the top of each page and should give viewer a
-# quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF =
-
-# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
-# in the documentation. The maximum height of the logo should not exceed 55
-# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
-# the logo to the output directory.
-
-PROJECT_LOGO =
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
-# into which the generated documentation will be written. If a relative path is
-# entered, it will be relative to the location where doxygen was started. If
-# left blank the current directory will be used.
-
-OUTPUT_DIRECTORY = @PADDLE_DOXYGEN_OUTPUT@
-
-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
-# option can be useful when feeding doxygen a huge amount of source files, where
-# putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
-# The default value is: NO.
-
-CREATE_SUBDIRS = NO
-
-# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
-# characters to appear in the names of generated files. If set to NO, non-ASCII
-# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
-# U+3044.
-# The default value is: NO.
-
-ALLOW_UNICODE_NAMES = NO
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
-# The default value is: English.
-
-OUTPUT_LANGUAGE = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
-# descriptions after the members that are listed in the file and class
-# documentation (similar to Javadoc). Set to NO to disable this.
-# The default value is: YES.
-
-BRIEF_MEMBER_DESC = YES
-
-# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
-# description of a member or function before the detailed description
-#
-# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-# The default value is: YES.
-
-REPEAT_BRIEF = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator that is
-# used to form the text in various listings. Each string in this list, if found
-# as the leading text of the brief description, will be stripped from the text
-# and the result, after processing the whole list, is used as the annotated
-# text. Otherwise, the brief description is used as-is. If left blank, the
-# following values are used ($name is automatically replaced with the name of
-# the entity):The $name class, The $name widget, The $name file, is, provides,
-# specifies, contains, represents, a, an and the.
-
-ABBREVIATE_BRIEF =
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# doxygen will generate a detailed section even if there is only a brief
-# description.
-# The default value is: NO.
-
-ALWAYS_DETAILED_SEC = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-# The default value is: NO.
-
-INLINE_INHERITED_MEMB = NO
-
-# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
-# before files name in the file list and in the header files. If set to NO the
-# shortest path that makes the file name unique will be used
-# The default value is: YES.
-
-FULL_PATH_NAMES = YES
-
-# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
-# Stripping is only done if one of the specified strings matches the left-hand
-# part of the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the path to
-# strip.
-#
-# Note that you can specify absolute paths here, but also relative paths, which
-# will be relative from the directory where doxygen is started.
-# This tag requires that the tag FULL_PATH_NAMES is set to YES.
-
-STRIP_FROM_PATH =
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
-# path mentioned in the documentation of a class, which tells the reader which
-# header file to include in order to use a class. If left blank only the name of
-# the header file containing the class definition is used. Otherwise one should
-# specify the list of include paths that are normally passed to the compiler
-# using the -I flag.
-
-STRIP_FROM_INC_PATH =
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
-# less readable) file names. This can be useful is your file systems doesn't
-# support long names like on DOS, Mac, or CD-ROM.
-# The default value is: NO.
-
-SHORT_NAMES = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
-# first line (until the first dot) of a Javadoc-style comment as the brief
-# description. If set to NO, the Javadoc-style will behave just like regular Qt-
-# style comments (thus requiring an explicit @brief command for a brief
-# description.)
-# The default value is: NO.
-
-JAVADOC_AUTOBRIEF = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
-# line (until the first dot) of a Qt-style comment as the brief description. If
-# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
-# requiring an explicit \brief command for a brief description.)
-# The default value is: NO.
-
-QT_AUTOBRIEF = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
-# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
-# a brief description. This used to be the default behavior. The new default is
-# to treat a multi-line C++ comment block as a detailed description. Set this
-# tag to YES if you prefer the old behavior instead.
-#
-# Note that setting this tag to YES also means that rational rose comments are
-# not recognized any more.
-# The default value is: NO.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
-# documentation from any documented member that it re-implements.
-# The default value is: YES.
-
-INHERIT_DOCS = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
-# page for each member. If set to NO, the documentation of a member will be part
-# of the file/class/namespace that contains it.
-# The default value is: NO.
-
-SEPARATE_MEMBER_PAGES = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
-# uses this value to replace tabs by spaces in code fragments.
-# Minimum value: 1, maximum value: 16, default value: 4.
-
-TAB_SIZE = 2
-
-# This tag can be used to specify a number of aliases that act as commands in
-# the documentation. An alias has the form:
-# name=value
-# For example adding
-# "sideeffect=@par Side Effects:\n"
-# will allow you to put the command \sideeffect (or @sideeffect) in the
-# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
-
-ALIASES =
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST =
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
-# only. Doxygen will then generate output that is more tailored for C. For
-# instance, some of the names that are used will be different. The list of all
-# members will be omitted, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_FOR_C = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
-# Python sources only. Doxygen will then generate output that is more tailored
-# for that language. For instance, namespaces will be presented as packages,
-# qualified scopes will look different, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_JAVA = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources. Doxygen will then generate output that is tailored for Fortran.
-# The default value is: NO.
-
-OPTIMIZE_FOR_FORTRAN = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for VHDL.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_VHDL = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given
-# extension. Doxygen has a built-in mapping, but you can override or extend it
-# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
-#
-# Note: For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING =
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibilities issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT = YES
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by putting a % sign in front of the word or
-# globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also make the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT = YES
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen to replace the get and set methods by a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC = NO
-
-# If one adds a struct or class to a group and this option is enabled, then also
-# any nested class or struct is added to the same group. By default this option
-# is disabled and one has to add nested compounds explicitly via \ingroup.
-# The default value is: NO.
-
-GROUP_NESTED_COMPOUNDS = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS = NO
-
-# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT = NO
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE = NO
-
-# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE = NO
-
-# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC = NO
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO,
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES = YES
-
-# This flag is only useful for Objective-C code. If set to YES, local methods,
-# which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO, only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous namespace
-# are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO, these classes will be included in the various overviews. This option
-# has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO, these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES, the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES = NO
-
-# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
-# append additional text to a page's title, such as Class Reference. If set to
-# YES the compound reference will be hidden.
-# The default value is: NO.
-
-HIDE_COMPOUND_REFERENCE= NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES = NO
-
-# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
-# grouped member an include statement to the documentation, telling the reader
-# which file to include in order to use the member.
-# The default value is: NO.
-
-SHOW_GROUPED_MEMB_INC = NO
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
-
-FORCE_LOCAL_INCLUDES = NO
-
-# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
-# documentation for inline members.
-# The default value is: YES.
-
-INLINE_INFO = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
-# (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order.
-# The default value is: YES.
-
-SORT_MEMBER_DOCS = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
-# descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order. Note that
-# this will also influence the order of the classes in the class list.
-# The default value is: NO.
-
-SORT_BRIEF_DOCS = NO
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
-# (brief and detailed) documentation of class members so that constructors and
-# destructors are listed first. If set to NO the constructors will appear in the
-# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
-# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
-# member documentation.
-# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
-# detailed member documentation.
-# The default value is: NO.
-
-SORT_MEMBERS_CTORS_1ST = NO
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
-# of group names into alphabetical order. If set to NO the group names will
-# appear in their defined order.
-# The default value is: NO.
-
-SORT_GROUP_NAMES = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
-# fully-qualified names, including namespaces. If set to NO, the class list will
-# be sorted only by class name, not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the alphabetical
-# list.
-# The default value is: NO.
-
-SORT_BY_SCOPE_NAME = NO
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-# The default value is: NO.
-
-STRICT_PROTO_MATCHING = NO
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
-# list. This list is created by putting \todo commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TODOLIST = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
-# list. This list is created by putting \test commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TESTLIST = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
-# list. This list is created by putting \bug commands in the documentation.
-# The default value is: YES.
-
-GENERATE_BUGLIST = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
-# the deprecated list. This list is created by putting \deprecated commands in
-# the documentation.
-# The default value is: YES.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if ... \endif and \cond
-# ... \endcond blocks.
-
-ENABLED_SECTIONS =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES, the
-# list will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command command input-file, where command is the value of the
-# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
-# by doxygen. Whatever the program writes to standard output is used as the file
-# version. For an example see the documentation.
-
-FILE_VERSION_FILTER =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. See also \cite for info how to create references.
-
-CITE_BIB_FILES =
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED = NO
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR = YES
-
-# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER)
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
-# Note: If this tag is empty the current directory is searched.
-
-INPUT = @PROJ_ROOT@/paddle
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# read by doxygen.
-#
-# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
-# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf, *.as and *.js.
-
-FILE_PATTERNS = *.c *.cc *.cpp *.cu *.h *.hpp *.cuh *.ph
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE =
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS = */x86_64-scm-linux-gnu/* */internals/* */mkl/* */test/* */tests/* */platform/*
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-#
-#
-# where is the value of the INPUT_FILTER tag, and is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-
-INPUT_FILTER =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS =
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE =
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS = NO
-
-# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot of
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX = YES
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
-# The default value is: YES.
-
-GENERATE_HTML = NO
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML the header file that includes any scripts and style sheets
-# that doxygen needs, which is dependent on the configuration options used (e.g.
-# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
-# default header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER =
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET =
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# cascading style sheets that are included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefore more robust against future updates.
-# Doxygen will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list). For an example see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET =
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES =
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP = NO
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS = NO
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries 1 will produce a full collapsed tree by default. 0 is a special value
-# representing an infinite number of entries and will result in a full expanded
-# tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 100
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler (hhc.exe). If non-empty,
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION =
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated
-# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
-# enables the Previous and Next buttons.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW = NO
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE = 4
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH = 250
-
-# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE = 10
-
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want to formulas look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# , /