diff --git a/.clang-format b/.clang-format index 6bbd46d0ff956517991d4faad3f2c026487f412b..9ba433b17362424973626470d930356c2173dd84 100644 --- a/.clang-format +++ b/.clang-format @@ -13,8 +13,6 @@ # The document of clang-format is # http://clang.llvm.org/docs/ClangFormat.html # http://clang.llvm.org/docs/ClangFormatStyleOptions.html -# -# TODO(yuyang18): Add python and other language code style --- Language: Cpp BasedOnStyle: Google @@ -22,8 +20,9 @@ IndentWidth: 2 TabWidth: 2 ContinuationIndentWidth: 4 AccessModifierOffset: -2 # The private/protected/public has no indent in class -PointerAlignment: Left # int* p/int& p, not int *p/int &p Standard: Cpp11 AllowAllParametersOfDeclarationOnNextLine: true +BinPackParameters: false +BinPackArguments: false ... diff --git a/.gitignore b/.gitignore index 65ba217de37c82287829eef105066aba86d69651..ee8489c1d71bd050b9a1d9358a664d2294165292 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ build/ .vscode .idea .project +.cproject .pydevproject +Makefile diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90c25e435083d78ad4c123999a588aaf9092f719 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,20 @@ +- repo: https://github.com/Lucas-C/pre-commit-hooks.git + sha: c25201a00e6b0514370501050cf2a8538ac12270 + hooks: + - id: remove-crlf +- repo: https://github.com/reyoung/mirrors-yapf.git + sha: v0.13.2 + hooks: + - id: yapf +- repo: https://github.com/pre-commit/pre-commit-hooks + sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469 + hooks: + - id: check-added-large-files + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + - id: end-of-file-fixer +- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git + sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 + hooks: + - id: clang-formater diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 0000000000000000000000000000000000000000..4741fb4f3bbc6681088cf9e960321e7b857a93a8 --- /dev/null +++ b/.style.yapf @@ -0,0 +1,3 @@ +[style] +based_on_style = pep8 +column_limit = 80 diff --git a/.travis.yml b/.travis.yml index bf0e0b7bbddd4c1f69e287e0f5ad471a54a75600..ffe3bc193b49eb3b3318cbbc7f1c3d86dc205c14 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,11 +35,22 @@ addons: - libgoogle-glog-dev - libgflags-dev - libgtest-dev + - curl + - lcov - graphviz + - swig before_install: + - | + if [ ${JOB} == "BUILD_AND_TEST" ]; then + if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)' + then + echo "Only markdown docs were updated, stopping build process." 
+ exit + fi + fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - - pip install wheel protobuf sphinx breathe recommonmark + - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy script: - paddle/scripts/travis/main.sh notifications: diff --git a/CMakeLists.txt b/CMakeLists.txt index 4613155f7700b25b2a8d7c250832722085b332fa..090ac9e188422099cc4270b87064b5590e7b620c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,14 +2,14 @@ cmake_minimum_required(VERSION 2.8) project(paddle CXX C) set(PADDLE_MAJOR_VERSION 0) -set(PADDLE_MINOR_VERSION 8) -set(PADDLE_PATCH_VERSION 0b2) +set(PADDLE_MINOR_VERSION 9) +set(PADDLE_PATCH_VERSION 0a0) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") set(PROJ_ROOT ${CMAKE_SOURCE_DIR}) include(package) -include(swig) +find_package(SWIG 2.0) find_package(CUDA QUIET) find_package(Protobuf REQUIRED) find_package(PythonLibs 2.7 REQUIRED) @@ -40,6 +40,9 @@ option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND}) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND}) option(ON_TRAVIS "Running test on travis-ci or not." OFF) +option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF) +option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON) + if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" @@ -49,11 +52,16 @@ endif() include(enableCXX11) include(cpplint) include(ccache) +if(WITH_RDMA) + include(rdma) +endif() include(util) include(flags) include(cudnn) include(FindPythonModule) include(check_packages) +include(swig) +include(coveralls) # add PaddlePaddle version if(DEFINED ENV{PADDLE_VERSION}) @@ -87,11 +95,24 @@ if(NOT WITH_GPU) add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() + if(${CUDA_VERSION_MAJOR} GREATER 6) + if(COMPILER_SUPPORT_CXX11) + LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) + endif() + endif() + # TODO(yuyang18): Change it to remove std=c++11 in cuda compile. 
set(CUDA_PROPAGATE_HOST_FLAGS OFF) if(NOT CUDNN_FOUND) message(FATAL_ERROR "Paddle need cudnn to compile") endif() + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-g -O3 --use_fast_math") + + if(WITH_AVX) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}") + else(WITH_AVX) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}") + endif(WITH_AVX) if(WITH_DSO) set(CUDA_LIBRARIES "") @@ -115,11 +136,11 @@ if(NOT WITH_TIMER) endif(NOT WITH_TIMER) if(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}") else(WITH_AVX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}") endif(WITH_AVX) if(WITH_PYTHON) @@ -129,12 +150,15 @@ else(WITH_PYTHON) add_definitions(-DPADDLE_NO_PYTHON) endif(WITH_PYTHON) -if(NOT WITH_RDMA) - add_definitions(-DPADDLE_DISABLE_RDMA) -endif() +if(WITH_RDMA) + include_directories("${RDMA_INC_DIR}") +else(WITH_RDMA) + add_definitions(-DPADDLE_DISABLE_RDMA) +endif(WITH_RDMA) if(WITH_GLOG) add_definitions(-DPADDLE_USE_GLOG) + include_directories(${LIBGLOG_INCLUDE_DIR}) endif() if(WITH_GFLAGS) diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md new file mode 100644 index 0000000000000000000000000000000000000000..6b2614b1011081a5e0e03a53fec2012bc7b81333 --- /dev/null +++ b/ISSUE_TEMPLATE.md @@ -0,0 +1,14 @@ +Thank you for contributing to PaddlePaddle. Submitting an issue is a great help to us. +Both Chinese and English issues are welcome. + +It's hard to solve a problem when important details are missing. +Before submitting an issue, please go over the following checklist. + +- [ ] Has a similar issue been submitted or resolved before? You can search existing issues on GitHub. +- [ ] Have you looked for your issue on widespread search engines? +- [ ] Is my description of the issue clear enough to reproduce the problem? + * If some errors occurred, we need details such as `How do you run your code?`, `What system do you use?`, `Are you using GPU or not?`, etc. + * If you attach an [asciinema](https://asciinema.org/) recording that shows how the problem happens, that's awesome! It helps us solve the problem more quickly. + * Is my description of the issue use the github markdown correctly? wait
diff --git a/README.md b/README.md index 1cc0444c0617af3da0ec1d9beaf2ae73e31bd7b2..8a8e15841586ae6a01bb93e94f6074189f556f5a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,13 @@ # PaddlePaddle -| **`Linux`** | **`License`** | **`Chat Room`** | -|----------------|---------------|-----------------| -|[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)|[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)|[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)| + +[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html) +[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) +[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) +[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) + Welcome to the PaddlePaddle GitHub. @@ -12,7 +17,7 @@ developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu. Our vision is to enable deep learning for everyone via PaddlePaddle. -Please refer to our [release log](https://github.com/baidu/Paddle/releases) to track the latest feature of PaddlePaddle. +Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. ## Features @@ -24,15 +29,15 @@ Please refer to our [release log](https://github.com/baidu/Paddle/releases) to t connection. - **Efficiency** - + In order to unleash the power of heterogeneous computing resource, optimization occurs at different levels of PaddlePaddle, including computing, memory, architecture and communication. The following are some examples: - Optimized math operations through SSE/AVX intrinsics, BLAS libraries - (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels. - - Highly optimized recurrent networks which can handle **variable-length** + (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels. + - Highly optimized recurrent networks which can handle **variable-length** sequence without padding. - Optimized local and distributed training for models with high dimensional sparse data. @@ -55,41 +60,39 @@ Please refer to our [release log](https://github.com/baidu/Paddle/releases) to t ## Installation Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from -pre-built packages (**docker image**, **deb package**) or +pre-built packages (**docker image**, **deb package**) or directly build on **Linux** and **Mac OS X** from the source code. - + ## Documentation Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers. - [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en)
You can follow the quick start tutorial to learn how use PaddlePaddle step-by-step. - + - [Example and Demo](http://paddlepaddle.org/doc/demo/)
We provide five demos, including: image classification, sentiment analysis, - sequence to sequence model, recommendation, semantic role labeling. - + sequence to sequence model, recommendation, semantic role labeling. + - [Distributed Training](http://paddlepaddle.org/doc/cluster)
This system supports training deep learning models on multiple machines with data parallelism. - + - [Python API](http://paddlepaddle.org/doc/ui/)
PaddlePaddle supports using either Python interface or C++ to build your system. We also use SWIG to wrap C++ source code to create a user friendly interface for Python. You can also use SWIG to create interface for your favorite programming language. - + - [How to Contribute](http://paddlepaddle.org/doc/build/contribute_to_paddle.html)
We sincerely appreciate your interest and contributions. If you would like to - contribute, please read the contribution guide. + contribute, please read the contribution guide. - [Source Code Documents](http://paddlepaddle.org/doc/source/)
## Ask Questions -Please join the [**gitter chat**](https://gitter.im/PaddlePaddle/Deep_Learning) or send email to -**paddle-dev@baidu.com** to ask questions and talk about methods and models. -Framework development discussions and -bug reports are collected on [Issues](https://github.com/baidu/paddle/issues). + +You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). diff --git a/benchmark/README.md b/benchmark/README.md index 8b453a7b59e9f19c7b96da1160cd348c74250bb7..29c7155a0f33ecf29a04d7c276f4531ae64e07a6 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -5,11 +5,11 @@ Machine: - CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz - GPU: Tesla K40m - cuDNN: v5.1 -- system: Docker 1.12.1, all platform are tested in docker environment. +- system: Docker 1.12.1, all platforms are tested in docker environment. Platforms: -- PaddlePaddle: +- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0 - Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu - Caffe: kaixhin/cuda-caffe @@ -28,7 +28,7 @@ AlexNet, GoogleNet and a small network used in Caffe. - [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt) -### Singe-GPU +### Single-GPU - AlexNet: input - 3 * 227 * 227, Time: ms/batch @@ -61,7 +61,7 @@ All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, be **Notation** -All the experiments in caffe use `caffe time` to execute, which does not include the time of parameter updating. The time in PaddlePaddle and TensorFlow contains it. But, compared with the total time, the time of parameter updating is relatively little. +All the experiments in caffe use `caffe time` to execute, which does not include the time of parameter updating. While PaddlePaddle and TensorFlow contains this time. But, compared with the total time, the time of parameter updating is relatively little on single machine. In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN. @@ -106,7 +106,7 @@ We use lstm network for text classfication to test benchmark. - Dictionary size=30000 - Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow. 
-### Single GPU +### Single-GPU #### LSTM in Text Classification diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py index 8b832473231f9894f99830149b96a14a923197f4..3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0 100644 --- a/benchmark/paddle/image/alexnet.py +++ b/benchmark/paddle/image/alexnet.py @@ -2,56 +2,63 @@ from paddle.trainer_config_helpers import * -height=227 -width=227 +height = 227 +width = 227 num_class = 1000 -batch_size = get_config_arg('batch_size', int, 128) - -args={'height':height, 'width':width, 'color':True, 'num_class':num_class} -define_py_data_sources2("train.list", - None, - module="provider", - obj="process", - args=args) +batch_size = get_config_arg('batch_size', int, 128) +args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +define_py_data_sources2( + "train.list", None, module="provider", obj="process", args=args) settings( - batch_size = batch_size, - learning_rate = 0.01 / batch_size, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * batch_size) -) - + batch_size=batch_size, + learning_rate=0.01 / batch_size, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * batch_size)) # conv1 net = data_layer('data', size=height * width * 3) -net = img_conv_layer(input=net, filter_size=11, num_channels=3, - num_filters=96, stride=4, padding=1) +net = img_conv_layer( + input=net, + filter_size=11, + num_channels=3, + num_filters=96, + stride=4, + padding=1) net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75) -net = img_pool_layer(input=net, pool_size=3, stride=2) +net = img_pool_layer(input=net, pool_size=3, stride=2) # conv2 -net = img_conv_layer(input=net, filter_size=5, num_filters=256, - stride=1, padding=2, groups=1) +net = img_conv_layer( + input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1) net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75) net = img_pool_layer(input=net, pool_size=3, stride=2) # conv3 -net = img_conv_layer(input=net, filter_size=3, num_filters=384, - stride=1, padding=1) +net = img_conv_layer( + input=net, filter_size=3, num_filters=384, stride=1, padding=1) # conv4 -net = img_conv_layer(input=net, filter_size=3, num_filters=384, - stride=1, padding=1, groups=1) +net = img_conv_layer( + input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1) # conv5 -net = img_conv_layer(input=net, filter_size=3, num_filters=256, - stride=1, padding=1, groups=1) +net = img_conv_layer( + input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1) net = img_pool_layer(input=net, pool_size=3, stride=2) -net = fc_layer(input=net, size=4096, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5)) -net = fc_layer(input=net, size=4096, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5)) +net = fc_layer( + input=net, + size=4096, + act=ReluActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) +net = fc_layer( + input=net, + size=4096, + act=ReluActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) net = fc_layer(input=net, size=1000, act=SoftmaxActivation()) lab = data_layer('label', num_class) -loss = cross_entropy(input=net, label=lab) +loss = cross_entropy(input=net, label=lab) outputs(loss) diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index 1078136a2b40b69c7e4b361487d22c414af7501f..bc893bab98c4d2e07c62fbd012d51a0939db4766 100644 --- a/benchmark/paddle/image/googlenet.py +++ 
b/benchmark/paddle/image/googlenet.py @@ -1,24 +1,20 @@ #!/usr/bin/env python from paddle.trainer_config_helpers import * -height=224 -width=224 +height = 224 +width = 224 num_class = 1000 -batch_size = get_config_arg('batch_size', int, 128) +batch_size = get_config_arg('batch_size', int, 128) -args={'height':height, 'width':width, 'color':True, 'num_class':num_class} -define_py_data_sources2("train.list", - None, - module="provider", - obj="process", - args=args) +args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +define_py_data_sources2( + "train.list", None, module="provider", obj="process", args=args) settings( - batch_size = batch_size, - learning_rate = 0.01 / batch_size, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * batch_size) -) + batch_size=batch_size, + learning_rate=0.01 / batch_size, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * batch_size)) def inception2(name, input, channels, \ filter1, @@ -34,26 +30,61 @@ def inception2(name, input, channels, \ maxpool = name + '_max' convproj = name + '_proj' - cov1 = img_conv_layer(name=conv1, input=input, filter_size=1, - num_channels=channels, num_filters=filter1, - stride=1, padding=0) - - cov3r = img_conv_layer(name=conv3r, input=input, filter_size=1, - num_channels=channels, num_filters=filter3R, - stride=1, padding=0) - cov3 = img_conv_layer(name=conv3, input=cov3r, filter_size=3, - num_filters=filter3, stride=1, padding=1) - - cov5r = img_conv_layer(name=conv5r, input=input, filter_size=1, - num_channels=channels, num_filters=filter5R, - stride=1, padding=0) - cov5 = img_conv_layer(name=conv5, input=cov5r, filter_size=5, - num_filters=filter5, stride=1, padding=2) - - pool1 = img_pool_layer(name=maxpool, input=input, pool_size=3, - num_channels=channels, stride=1, padding=1) - covprj = img_conv_layer(name=convproj, input=pool1, filter_size=1, - num_filters=proj, stride=1, padding=0) + cov1 = img_conv_layer( + name=conv1, + input=input, + filter_size=1, + num_channels=channels, + num_filters=filter1, + stride=1, + padding=0) + + cov3r = img_conv_layer( + name=conv3r, + input=input, + filter_size=1, + num_channels=channels, + num_filters=filter3R, + stride=1, + padding=0) + cov3 = img_conv_layer( + name=conv3, + input=cov3r, + filter_size=3, + num_filters=filter3, + stride=1, + padding=1) + + cov5r = img_conv_layer( + name=conv5r, + input=input, + filter_size=1, + num_channels=channels, + num_filters=filter5R, + stride=1, + padding=0) + cov5 = img_conv_layer( + name=conv5, + input=cov5r, + filter_size=5, + num_filters=filter5, + stride=1, + padding=2) + + pool1 = img_pool_layer( + name=maxpool, + input=input, + pool_size=3, + num_channels=channels, + stride=1, + padding=1) + covprj = img_conv_layer( + name=convproj, + input=pool1, + filter_size=1, + num_filters=proj, + stride=1, + padding=0) cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj]) return cat @@ -64,28 +95,51 @@ def inception(name, input, channels, \ filter5R, filter5, proj): - cov1 = conv_projection(input=input, filter_size=1, num_channels=channels, - num_filters=filter1, stride=1, padding=0) - - cov3r = img_conv_layer(name=name + '_3r', input=input, filter_size=1, - num_channels=channels, num_filters=filter3R, - stride=1, padding=0) - cov3 = conv_projection(input=cov3r, filter_size=3, num_filters=filter3, - stride=1, padding=1) - - cov5r = img_conv_layer(name=name + '_5r', input=input, filter_size=1, - num_channels=channels, num_filters=filter5R, 
- stride=1, padding=0) - cov5 = conv_projection(input=cov5r, filter_size=5, num_filters=filter5, - stride=1, padding=2) - - pool1 = img_pool_layer(name=name + '_max', input=input, pool_size=3, - num_channels=channels, stride=1, padding=1) - covprj = conv_projection(input=pool1, filter_size=1, num_filters=proj, - stride=1, padding=0) - - cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj], - bias_attr=True, act=ReluActivation()) + cov1 = conv_projection( + input=input, + filter_size=1, + num_channels=channels, + num_filters=filter1, + stride=1, + padding=0) + + cov3r = img_conv_layer( + name=name + '_3r', + input=input, + filter_size=1, + num_channels=channels, + num_filters=filter3R, + stride=1, + padding=0) + cov3 = conv_projection( + input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1) + + cov5r = img_conv_layer( + name=name + '_5r', + input=input, + filter_size=1, + num_channels=channels, + num_filters=filter5R, + stride=1, + padding=0) + cov5 = conv_projection( + input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2) + + pool1 = img_pool_layer( + name=name + '_max', + input=input, + pool_size=3, + num_channels=channels, + stride=1, + padding=1) + covprj = conv_projection( + input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0) + + cat = concat_layer( + name=name, + input=[cov1, cov3, cov5, covprj], + bias_attr=True, + act=ReluActivation()) return cat @@ -93,36 +147,60 @@ lab = data_layer(name="label", size=1000) data = data_layer(name="input", size=3 * height * width) # stage 1 -conv1 = img_conv_layer(name="conv1", input=data, filter_size=7, - num_channels=3, num_filters=64, stride=2, padding=3) -pool1 = img_pool_layer(name="pool1", input=conv1, pool_size=3, - num_channels=64, stride=2) +conv1 = img_conv_layer( + name="conv1", + input=data, + filter_size=7, + num_channels=3, + num_filters=64, + stride=2, + padding=3) +pool1 = img_pool_layer( + name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2) # stage 2 -conv2_1 = img_conv_layer(name="conv2_1", input=pool1, filter_size=1, - num_filters=64, stride=1, padding=0) -conv2_2 = img_conv_layer(name="conv2_2", input=conv2_1, filter_size=3, - num_filters=192, stride=1, padding=1) -pool2 = img_pool_layer(name="pool2", input=conv2_2, pool_size=3, - num_channels=192, stride=2) +conv2_1 = img_conv_layer( + name="conv2_1", + input=pool1, + filter_size=1, + num_filters=64, + stride=1, + padding=0) +conv2_2 = img_conv_layer( + name="conv2_2", + input=conv2_1, + filter_size=3, + num_filters=192, + stride=1, + padding=1) +pool2 = img_pool_layer( + name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2) # stage 3 -ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32) -ince3b = inception("ince3b", ince3a, 256, 128, 128,192, 32, 96, 64) -pool3 = img_pool_layer(name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2) +ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32) +ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64) +pool3 = img_pool_layer( + name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2) # stage 4 -ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64) -ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64) +ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64) +ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64) ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64) -ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 
32, 64, 64) -ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128) -pool4 = img_pool_layer(name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2) +ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64) +ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128) +pool4 = img_pool_layer( + name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2) # stage 5 -ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128) +ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128) ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128) -pool5 = img_pool_layer(name="pool5", input=ince5b, num_channels=1024, pool_size=7, stride=7, pool_type=AvgPooling()) +pool5 = img_pool_layer( + name="pool5", + input=ince5b, + num_channels=1024, + pool_size=7, + stride=7, + pool_type=AvgPooling()) # We remove loss1 and loss2 for all system when testing benchmark # output 1 @@ -141,7 +219,8 @@ pool5 = img_pool_layer(name="pool5", input=ince5b, num_channels=1024, pool_size= # output 3 dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4) -out3 = fc_layer(name="output3", input=dropout, size=1000, act=SoftmaxActivation()) -loss3 = cross_entropy(name='loss3', input=out3, label=lab) +out3 = fc_layer( + name="output3", input=dropout, size=1000, act=SoftmaxActivation()) +loss3 = cross_entropy(name='loss3', input=out3, label=lab) outputs(loss3) diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 0d45268aa3f4900349e176a56acc9a9eb6eb120b..b6bc0e9aa21a1083ddc2c8f3ada4acf637425a62 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -1,13 +1,14 @@ -import io,os +import io, os import random import numpy as np from paddle.trainer.PyDataProvider2 import * + def initHook(settings, height, width, color, num_class, **kwargs): - settings.height = height - settings.width = width - settings.color = color - settings.num_class = num_class + settings.height = height + settings.width = width + settings.color = color + settings.num_class = num_class if settings.color: settings.data_size = settings.height * settings.width * 3 else: @@ -15,7 +16,9 @@ def initHook(settings, height, width, color, num_class, **kwargs): settings.slots = [dense_vector(settings.data_size), integer_value(1)] -@provider(init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) + +@provider( + init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_list): with open(file_list, 'r') as fdata: for line in fdata: diff --git a/benchmark/paddle/image/smallnet_mnist_cifar.py b/benchmark/paddle/image/smallnet_mnist_cifar.py index 78dba880d29250158326b23834a60273407eb111..58879c454f37991405d83bbb593bb5d1e977ff53 100644 --- a/benchmark/paddle/image/smallnet_mnist_cifar.py +++ b/benchmark/paddle/image/smallnet_mnist_cifar.py @@ -2,42 +2,44 @@ from paddle.trainer_config_helpers import * -height=32 -width=32 +height = 32 +width = 32 num_class = 10 -batch_size = get_config_arg('batch_size', int, 128) +batch_size = get_config_arg('batch_size', int, 128) -args={'height':height, 'width':width, 'color':True, 'num_class':num_class} -define_py_data_sources2("train.list", - None, - module="provider", - obj="process", - args=args) +args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +define_py_data_sources2( + "train.list", None, module="provider", obj="process", args=args) settings( - batch_size = 
batch_size, - learning_rate = 0.01 / batch_size, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * batch_size) -) - + batch_size=batch_size, + learning_rate=0.01 / batch_size, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * batch_size)) # conv1 net = data_layer('data', size=height * width * 3) -net = img_conv_layer(input=net, filter_size=5, num_channels=3, - num_filters=32, stride=1, padding=2) +net = img_conv_layer( + input=net, + filter_size=5, + num_channels=3, + num_filters=32, + stride=1, + padding=2) net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1) # conv2 -net = img_conv_layer(input=net, filter_size=5, num_filters=32, - stride=1, padding=2) -net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling()) +net = img_conv_layer( + input=net, filter_size=5, num_filters=32, stride=1, padding=2) +net = img_pool_layer( + input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling()) # conv3 -net = img_conv_layer(input=net, filter_size=3, num_filters=64, - stride=1, padding=1) -net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling()) +net = img_conv_layer( + input=net, filter_size=3, num_filters=64, stride=1, padding=1) +net = img_pool_layer( + input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling()) net = fc_layer(input=net, size=64, act=ReluActivation()) net = fc_layer(input=net, size=10, act=SoftmaxActivation()) diff --git a/benchmark/paddle/rnn/imdb.py b/benchmark/paddle/rnn/imdb.py index 93e1686854b447c4248ae1809fb5289a36e3e0f7..fc4ed4025f9ed2e0a32a1709ff8df4af53521196 100755 --- a/benchmark/paddle/rnn/imdb.py +++ b/benchmark/paddle/rnn/imdb.py @@ -4,6 +4,7 @@ import gzip import os import numpy + def get_dataset_file(dataset, default_dataset, origin): data_dir, data_file = os.path.split(dataset) if (not os.path.isfile(dataset)) and data_file == default_dataset: @@ -13,13 +14,14 @@ def get_dataset_file(dataset, default_dataset, origin): return dataset + def create_data(path="imdb.pkl"): if (not os.path.isfile('imdb.train.pkl')): path = get_dataset_file( path, "imdb.pkl", "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") - + if path.endswith(".gz"): f = gzip.open(path, 'rb') else: @@ -35,8 +37,10 @@ def create_data(path="imdb.pkl"): if (not os.path.isfile('train.list')): file('train.list', 'w').write('imdb.train.pkl\n') + def main(): create_data('imdb.pkl') + if __name__ == "__main__": main() diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py index 90d3fee67601604b236b27fb2e5492e92095cb72..928ca75daf84ccebb775364b0be0d8b3d5eebff9 100644 --- a/benchmark/paddle/rnn/provider.py +++ b/benchmark/paddle/rnn/provider.py @@ -1,19 +1,25 @@ -import io,os +import io, os import random import numpy as np import six.moves.cPickle as pickle from paddle.trainer.PyDataProvider2 import * + def remove_unk(x, n_words): return [[1 if w >= n_words else w for w in sen] for sen in x] + # ============================================================== # tensorflow uses fixed length, but PaddlePaddle can process # variable-length. Padding is used in benchmark in order to # compare with other platform. 
# ============================================================== -def pad_sequences(sequences, maxlen=None, dtype='int32', padding='post', - truncating='post', value=0.): +def pad_sequences(sequences, + maxlen=None, + dtype='int32', + padding='post', + truncating='post', + value=0.): lengths = [len(s) for s in sequences] nb_samples = len(sequences) @@ -43,12 +49,14 @@ def pad_sequences(sequences, maxlen=None, dtype='int32', padding='post', def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs): settings.vocab_size = vocab_size settings.pad_seq = pad_seq - settings.maxlen = maxlen + settings.maxlen = maxlen settings.input_types = [ - integer_value_sequence(vocab_size), - integer_value(2)] + integer_value_sequence(vocab_size), integer_value(2) + ] + -@provider(init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) +@provider( + init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file): f = open(file, 'rb') train_set = pickle.load(f) @@ -57,8 +65,8 @@ def process(settings, file): # remove unk, namely remove the words out of dictionary x = remove_unk(x, settings.vocab_size) - if settings.pad_seq: + if settings.pad_seq: x = pad_sequences(x, maxlen=settings.maxlen, value=0.) for i in range(len(y)): - yield map(int,x[i]), int(y[i]) + yield map(int, x[i]), int(y[i]) diff --git a/benchmark/paddle/rnn/rnn.py b/benchmark/paddle/rnn/rnn.py index fc8221b1126649d3d1b6a2a8743d25fe4a8d4aec..83eb3e565473f7e7e91cddeaa3cd2aafb7e3df2c 100755 --- a/benchmark/paddle/rnn/rnn.py +++ b/benchmark/paddle/rnn/rnn.py @@ -6,33 +6,29 @@ import imdb num_class = 2 vocab_size = 30000 fixedlen = 100 -batch_size = get_config_arg('batch_size', int, 128) -lstm_num = get_config_arg('lstm_num', int, 1) -hidden_size = get_config_arg('hidden_size', int, 128) +batch_size = get_config_arg('batch_size', int, 128) +lstm_num = get_config_arg('lstm_num', int, 1) +hidden_size = get_config_arg('hidden_size', int, 128) # whether to pad sequence into fixed length pad_seq = get_config_arg('pad_seq', bool, True) imdb.create_data('imdb.pkl') -args={'vocab_size':vocab_size, 'pad_seq':pad_seq, 'maxlen':fixedlen} -define_py_data_sources2("train.list", - None, - module="provider", - obj="process", - args=args) +args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen} +define_py_data_sources2( + "train.list", None, module="provider", obj="process", args=args) settings( batch_size=batch_size, learning_rate=2e-3, learning_method=AdamOptimizer(), regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25 -) + gradient_clipping_threshold=25) net = data_layer('data', size=vocab_size) net = embedding_layer(input=net, size=128) for i in xrange(lstm_num): - net = simple_lstm(input=net, size=hidden_size) + net = simple_lstm(input=net, size=hidden_size) net = last_seq(input=net) net = fc_layer(input=net, size=2, act=SoftmaxActivation()) diff --git a/benchmark/tensorflow/image/alexnet.py b/benchmark/tensorflow/image/alexnet.py index 57b7ef6c323243c8e03324533d0022ab00bb8516..f6a39ef778e21bee7374718a1b1ddf43392825a8 100644 --- a/benchmark/tensorflow/image/alexnet.py +++ b/benchmark/tensorflow/image/alexnet.py @@ -8,10 +8,8 @@ import tensorflow as tf FLAGS = tf.app.flags.FLAGS -tf.app.flags.DEFINE_integer('batch_size', 128, - """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, - """Number of batches to run.""") +tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to 
run.""") tf.app.flags.DEFINE_boolean('forward_only', False, """Only run the forward pass.""") tf.app.flags.DEFINE_boolean('forward_backward_only', False, @@ -23,47 +21,64 @@ tf.app.flags.DEFINE_string('data_format', 'NCHW', tf.app.flags.DEFINE_boolean('log_device_placement', False, """Whether to log device placement.""") + def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005): with tf.name_scope(name) as scope: - kernel = tf.get_variable(name + '_w',[kH, kW, nIn, nOut], - initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), - dtype=tf.float32) + kernel = tf.get_variable( + name + '_w', [kH, kW, nIn, nOut], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), + dtype=tf.float32) if wd is not None and wd > 0: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] + strides = [1, 1, dH, dW] else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, - data_format=FLAGS.data_format) - - biases = tf.get_variable(name=name + '_b', shape=[nOut], - initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), + strides = [1, dH, dW, 1] + conv = tf.nn.conv2d( + inpOp, + kernel, + strides, + padding=padType, + data_format=FLAGS.data_format) + + biases = tf.get_variable( + name=name + '_b', + shape=[nOut], + initializer=tf.constant_initializer( + value=0.0, dtype=tf.float32), dtype=tf.float32) bias = tf.reshape( - tf.nn.bias_add(conv, biases, data_format=FLAGS.data_format), + tf.nn.bias_add( + conv, biases, data_format=FLAGS.data_format), conv.get_shape()) conv1 = tf.nn.relu(bias, name=scope) return conv1 + def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None): with tf.name_scope(name) as scope: - kernel = tf.get_variable(name + '_w', [nIn, nOut], - initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), + kernel = tf.get_variable( + name + '_w', [nIn, nOut], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), dtype=tf.float32) if wd is not None and wd > 0: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) - biases = tf.get_variable(name + '_b', [nOut], - initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), - dtype=tf.float32,trainable=True) + biases = tf.get_variable( + name + '_b', [nOut], + initializer=tf.constant_initializer( + value=0.0, dtype=tf.float32), + dtype=tf.float32, + trainable=True) affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ tf.matmul(inpOp, kernel) + biases @@ -72,31 +87,36 @@ def _affine(name, inpOp, nIn, nOut, wd=0.0005, act=True, drop=None): return output + def _mpool(name, inpOp, kH, kW, dH, dW): if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool(inpOp, - ksize=ksize, - strides=strides, - padding='VALID', - data_format=FLAGS.data_format, - name=name) + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.max_pool( + inpOp, + ksize=ksize, + strides=strides, + padding='VALID', + data_format=FLAGS.data_format, + name=name) + def _norm(name, l_input, lsize=4): - return tf.nn.lrn(l_input, lsize, bias=1.0, + return tf.nn.lrn(l_input, + lsize, + bias=1.0, alpha=0.001 / 9.0, - beta=0.75, name=name) - + beta=0.75, + 
name=name) def loss(logits, labels): labels = tf.cast(labels, tf.int64) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits, labels, name='cross_entropy_per_example') + logits, labels, name='cross_entropy_per_example') cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') tf.add_to_collection('losses', cross_entropy_mean) @@ -104,6 +124,7 @@ def loss(logits, labels): # decay terms (L2 loss). return tf.add_n(tf.get_collection('losses'), name='total_loss') + def get_incoming_shape(incoming): """ Returns the incoming data shape """ if isinstance(incoming, tf.Tensor): @@ -113,50 +134,52 @@ def get_incoming_shape(incoming): else: raise Exception("Invalid incoming layer.") + def inference(images): - conv1 = _conv ('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID') - pool1 = _mpool('pool1', conv1, 3, 3, 2, 2) - norm1 = _norm ('norm1', pool1, lsize=5) - conv2 = _conv ('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME') - pool2 = _mpool('pool2', conv2, 3, 3, 2, 2) - norm2 = _norm ('norm2', pool2, lsize=5) - conv3 = _conv ('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME') - conv4 = _conv ('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME') - conv5 = _conv ('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME') - pool5 = _mpool('pool5', conv5, 3, 3, 2, 2) + conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID') + pool1 = _mpool('pool1', conv1, 3, 3, 2, 2) + norm1 = _norm('norm1', pool1, lsize=5) + conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME') + pool2 = _mpool('pool2', conv2, 3, 3, 2, 2) + norm2 = _norm('norm2', pool2, lsize=5) + conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME') + conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME') + conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME') + pool5 = _mpool('pool5', conv5, 3, 3, 2, 2) resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6]) affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096, 0.5) affn2 = _affine('fc7', affn1, 4096, 4096, 0.5) - affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc + affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc return affn3 def time_tensorflow_run(session, target, info_string): - num_steps_burn_in = 10 - total_duration = 0.0 - total_duration_squared = 0.0 - if not isinstance(target, list): - target = [target] - target_op = tf.group(*target) - for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _ = session.run(target_op) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - print ('%s: step %d, duration = %.3f' % - (datetime.now(), i - num_steps_burn_in, duration)) - total_duration += duration - total_duration_squared += duration * duration - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) + num_steps_burn_in = 10 + total_duration = 0.0 + total_duration_squared = 0.0 + if not isinstance(target, list): + target = [target] + target_op = tf.group(*target) + for i in xrange(FLAGS.num_batches + num_steps_burn_in): + start_time = time.time() + _ = session.run(target_op) + duration = time.time() - start_time + if i > num_steps_burn_in: + if not i % 10: + print('%s: step %d, duration = %.3f' % + (datetime.now(), i - num_steps_burn_in, duration)) + total_duration += duration + total_duration_squared += duration * duration + mn = total_duration / FLAGS.num_batches + vr = total_duration_squared / 
FLAGS.num_batches - mn * mn + sd = math.sqrt(vr) + print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % + (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) + def _add_loss_summaries(total_loss): - """ + """ Generates moving average for all losses and associated summaries for visualizing the performance of the network. @@ -165,96 +188,111 @@ def _add_loss_summaries(total_loss): Returns: loss_averages_op: op for generating moving averages of losses. """ - # Compute the moving average of all individual losses and the total loss. - loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') - losses = tf.get_collection('losses') - loss_averages_op = loss_averages.apply(losses + [total_loss]) - - # Attach a scalar summary to all individual losses and the total loss; do the - # same for the averaged version of the losses. - for l in losses + [total_loss]: - # Name each loss as '(raw)' and name the moving average version of the loss - # as the original loss name. - tf.scalar_summary(l.op.name +' (raw)', l) - tf.scalar_summary(l.op.name, loss_averages.average(l)) + # Compute the moving average of all individual losses and the total loss. + loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg') + losses = tf.get_collection('losses') + loss_averages_op = loss_averages.apply(losses + [total_loss]) - return loss_averages_op + # Attach a scalar summary to all individual losses and the total loss; do the + # same for the averaged version of the losses. + for l in losses + [total_loss]: + # Name each loss as '(raw)' and name the moving average version of the loss + # as the original loss name. + tf.scalar_summary(l.op.name + ' (raw)', l) + tf.scalar_summary(l.op.name, loss_averages.average(l)) + return loss_averages_op def run_benchmark(): - with tf.Graph().as_default(): - with tf.device('/gpu:0'): - # Generate some dummy images. - image_size = 224 - # Note that our padding definition is slightly different the cuda-convnet. - # In order to force the model to start with the same activations sizes, - # we add 3 to the image_size and employ VALID padding above. - if FLAGS.data_format == 'NCHW': - image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3] - else: - image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3] - images = tf.get_variable('image', image_shape, - initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable('label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) - - # Build a Graph that computes the logits predictions from the - # inference model. - last_layer = inference(images) - - objective = loss(last_layer, labels) - # Compute the gradient with respect to all the parameters. - - # Compute gradients. - # opt = tf.train.GradientDescentOptimizer(0.001) - opt = tf.train.MomentumOptimizer(0.001, 0.9) - grads = opt.compute_gradients(objective) - global_step = tf.get_variable('global_step', [], - initializer=tf.constant_initializer(0.0, dtype=tf.float32), - trainable=False, dtype=tf.float32) - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Track the moving averages of all trainable variables. - variable_averages = tf.train.ExponentialMovingAverage( - 0.9, global_step) - variables_averages_op = variable_averages.apply(tf.trainable_variables()) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. 
- sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - - run_forward = True - run_forward_backward = True - if FLAGS.forward_only and FLAGS.forward_backward_only: - raise ValueError("Cannot specify --forward_only and " - "--forward_backward_only at the same time.") - if FLAGS.forward_only: - run_forward_backward = False - elif FLAGS.forward_backward_only: - run_forward = False - - if run_forward: - time_tensorflow_run(sess, last_layer, "Forward") - - if run_forward_backward: - with tf.control_dependencies([apply_gradient_op, variables_averages_op]): - train_op = tf.no_op(name='train') - time_tensorflow_run(sess, [train_op, objective], "Forward-backward") + with tf.Graph().as_default(): + with tf.device('/gpu:0'): + # Generate some dummy images. + image_size = 224 + # Note that our padding definition is slightly different the cuda-convnet. + # In order to force the model to start with the same activations sizes, + # we add 3 to the image_size and employ VALID padding above. + if FLAGS.data_format == 'NCHW': + image_shape = [ + FLAGS.batch_size, 3, image_size + 3, image_size + 3 + ] + else: + image_shape = [ + FLAGS.batch_size, image_size + 3, image_size + 3, 3 + ] + images = tf.get_variable( + 'image', + image_shape, + initializer=tf.truncated_normal_initializer( + stddev=0.1, dtype=tf.float32), + dtype=tf.float32, + trainable=False) + + labels = tf.get_variable( + 'label', [FLAGS.batch_size], + initializer=tf.constant_initializer(1), + dtype=tf.int32, + trainable=False) + + # Build a Graph that computes the logits predictions from the + # inference model. + last_layer = inference(images) + + objective = loss(last_layer, labels) + # Compute the gradient with respect to all the parameters. + + # Compute gradients. + # opt = tf.train.GradientDescentOptimizer(0.001) + opt = tf.train.MomentumOptimizer(0.001, 0.9) + grads = opt.compute_gradients(objective) + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer( + 0.0, dtype=tf.float32), + trainable=False, + dtype=tf.float32) + apply_gradient_op = opt.apply_gradients( + grads, global_step=global_step) + + # Track the moving averages of all trainable variables. + variable_averages = tf.train.ExponentialMovingAverage(0.9, + global_step) + variables_averages_op = variable_averages.apply( + tf.trainable_variables()) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. 
+ sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + + run_forward = True + run_forward_backward = True + if FLAGS.forward_only and FLAGS.forward_backward_only: + raise ValueError("Cannot specify --forward_only and " + "--forward_backward_only at the same time.") + if FLAGS.forward_only: + run_forward_backward = False + elif FLAGS.forward_backward_only: + run_forward = False + + if run_forward: + time_tensorflow_run(sess, last_layer, "Forward") + + if run_forward_backward: + with tf.control_dependencies( + [apply_gradient_op, variables_averages_op]): + train_op = tf.no_op(name='train') + time_tensorflow_run(sess, [train_op, objective], + "Forward-backward") + def main(_): - run_benchmark() + run_benchmark() if __name__ == '__main__': - tf.app.run() + tf.app.run() diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py index f006fb56af7bcdfd2912976fff3ec6c3fcb18fdb..7b5ee78f4dd5429abd85d75c092a6e3a2a39f922 100644 --- a/benchmark/tensorflow/image/alexnet_multi_gpu.py +++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py @@ -9,10 +9,8 @@ import tensorflow as tf FLAGS = tf.app.flags.FLAGS -tf.app.flags.DEFINE_integer('batch_size', 64, - """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, - """Number of batches to run.""") +tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") tf.app.flags.DEFINE_string('data_format', 'NCHW', """The data format for Convnet operations. Can be either NHWC or NCHW. @@ -21,88 +19,110 @@ tf.app.flags.DEFINE_string('data_format', 'NCHW', tf.app.flags.DEFINE_string('train_dir', '/train_model', """Directory where to write event logs """ """and checkpoint.""") -tf.app.flags.DEFINE_integer('num_gpus', 4, - """How many GPUs to use.""") +tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""") tf.app.flags.DEFINE_boolean('log_device_placement', False, """Whether to log device placement.""") -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN=50000 -NUM_EPOCHS_PER_DECAY=50 -INITIAL_LEARNING_RATE = 0.1 +NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 +NUM_EPOCHS_PER_DECAY = 50 +INITIAL_LEARNING_RATE = 0.1 LEARNING_RATE_DECAY_FACTOR = 0.1 TOWER_NAME = 'tower' def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005): with tf.name_scope(name) as scope: - kernel = tf.get_variable(name + '_w',[kH, kW, nIn, nOut], - initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), - dtype=tf.float32) + kernel = tf.get_variable( + name + '_w', [kH, kW, nIn, nOut], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), + dtype=tf.float32) if wd is not None: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] + strides = [1, 1, dH, dW] else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, - data_format=FLAGS.data_format) - - biases = tf.get_variable(name=name + '_b', shape=[nOut], - initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), + strides = [1, dH, dW, 1] + conv = tf.nn.conv2d( + inpOp, + kernel, + strides, + padding=padType, + data_format=FLAGS.data_format) + + biases = tf.get_variable( + name=name + '_b', + shape=[nOut], + initializer=tf.constant_initializer( + value=0.0, dtype=tf.float32), dtype=tf.float32) bias = 
tf.reshape( - tf.nn.bias_add(conv, biases, data_format=FLAGS.data_format), + tf.nn.bias_add( + conv, biases, data_format=FLAGS.data_format), conv.get_shape()) conv1 = tf.nn.relu(bias, name=scope) return conv1 + def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True): with tf.name_scope(name) as scope: - kernel = tf.get_variable(name + '_w', [nIn, nOut], - initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), + kernel = tf.get_variable( + name + '_w', [nIn, nOut], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), dtype=tf.float32) if wd is not None: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) - biases = tf.get_variable(name + '_b', [nOut], - initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), - dtype=tf.float32,trainable=True) + biases = tf.get_variable( + name + '_b', [nOut], + initializer=tf.constant_initializer( + value=0.0, dtype=tf.float32), + dtype=tf.float32, + trainable=True) affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ tf.matmul(inpOp, kernel) + biases return affine1 + def _mpool(name, inpOp, kH, kW, dH, dW): if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool(inpOp, - ksize=ksize, - strides=strides, - padding='VALID', - data_format=FLAGS.data_format, - name=name) + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.max_pool( + inpOp, + ksize=ksize, + strides=strides, + padding='VALID', + data_format=FLAGS.data_format, + name=name) + def _norm(name, l_input, lsize=4): - return tf.nn.lrn(l_input, lsize, bias=1.0, + return tf.nn.lrn(l_input, + lsize, + bias=1.0, alpha=0.001 / 9.0, - beta=0.75, name=name) + beta=0.75, + name=name) + def loss(logits, labels): labels = tf.cast(labels, tf.int64) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits, labels, name='cross_entropy_per_example') + logits, labels, name='cross_entropy_per_example') cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') tf.add_to_collection('losses', cross_entropy_mean) @@ -120,24 +140,26 @@ def get_incoming_shape(incoming): else: raise Exception("Invalid incoming layer.") + def inference(images): - conv1 = _conv ('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID') - pool1 = _mpool('pool1', conv1, 3, 3, 2, 2) - norm1 = _norm ('norm1', pool1, lsize=5) - conv2 = _conv ('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME') - pool2 = _mpool('pool2', conv2, 3, 3, 2, 2) - norm2 = _norm ('norm2', pool2, lsize=5) - conv3 = _conv ('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME') - conv4 = _conv ('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME') - conv5 = _conv ('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME') - pool5 = _mpool('pool5', conv5, 3, 3, 2, 2) + conv1 = _conv('conv1', images, 3, 96, 11, 11, 4, 4, 'VALID') + pool1 = _mpool('pool1', conv1, 3, 3, 2, 2) + norm1 = _norm('norm1', pool1, lsize=5) + conv2 = _conv('conv2', norm1, 96, 256, 5, 5, 1, 1, 'SAME') + pool2 = _mpool('pool2', conv2, 3, 3, 2, 2) + norm2 = _norm('norm2', pool2, lsize=5) + conv3 = _conv('conv3', norm2, 256, 384, 3, 3, 1, 1, 'SAME') + conv4 = _conv('conv4', conv3, 384, 384, 3, 3, 1, 1, 'SAME') + conv5 = _conv('conv5', conv4, 384, 256, 3, 3, 1, 1, 'SAME') + pool5 = _mpool('pool5', conv5, 3, 3, 2, 2) resh1 = tf.reshape(pool5, [-1, 256 * 6 * 6]) affn1 = _affine('fc6', resh1, 256 * 6 * 6, 4096) affn2 
= _affine('fc7', affn1, 4096, 4096) - affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc + affn3 = _affine('fc8', affn2, 4096, 1000, wd=None, act=False) # last fc return affn3 + def tower_loss(scope): """Calculate the total loss on a single tower running the model. Args: @@ -150,15 +172,19 @@ def tower_loss(scope): image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3] else: image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3] - images = tf.get_variable('image', image_shape, - initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable('label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) + images = tf.get_variable( + 'image', + image_shape, + initializer=tf.truncated_normal_initializer( + stddev=0.1, dtype=tf.float32), + dtype=tf.float32, + trainable=False) + + labels = tf.get_variable( + 'label', [FLAGS.batch_size], + initializer=tf.constant_initializer(1), + dtype=tf.int32, + trainable=False) # Build a Graph that computes the logits predictions from the # inference model. @@ -167,7 +193,7 @@ def tower_loss(scope): # Build the portion of the Graph calculating the losses. Note that we will # assemble the total_loss using a custom function below. _ = loss(last_layer, labels) - + # Assemble all of the losses for the current tower only. losses = tf.get_collection('losses', scope) @@ -186,7 +212,7 @@ def tower_loss(scope): loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name) # Name each loss as '(raw)' and name the moving average version of the loss # as the original loss name. - tf.scalar_summary(loss_name +' (raw)', l) + tf.scalar_summary(loss_name + ' (raw)', l) tf.scalar_summary(loss_name, loss_averages.average(l)) with tf.control_dependencies([loss_averages_op]): @@ -195,7 +221,7 @@ def tower_loss(scope): def average_gradients(tower_grads): - """Calculate the average gradient for each shared variable across all towers. + """Calculate the average gradient for each shared variable across all towers. Note that this function provides a synchronization point across all towers. Args: tower_grads: List of lists of (gradient, variable) tuples. The outer list @@ -205,130 +231,135 @@ def average_gradients(tower_grads): List of pairs of (gradient, variable) where the gradient has been averaged across all towers. """ - average_grads = [] - for grad_and_vars in zip(*tower_grads): - # Note that each grad_and_vars looks like the following: - # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) - grads = [] - for g, _ in grad_and_vars: - # Add 0 dimension to the gradients to represent the tower. - expanded_g = tf.expand_dims(g, 0) - - # Append on a 'tower' dimension which we will average over below. - grads.append(expanded_g) - - # Average over the 'tower' dimension. - grad = tf.concat(0, grads) - grad = tf.reduce_mean(grad, 0) - - # Keep in mind that the Variables are redundant because they are shared - # across towers. So .. we will just return the first tower's pointer to - # the Variable. - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads + average_grads = [] + for grad_and_vars in zip(*tower_grads): + # Note that each grad_and_vars looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + grads = [] + for g, _ in grad_and_vars: + # Add 0 dimension to the gradients to represent the tower. 
+ expanded_g = tf.expand_dims(g, 0) + + # Append on a 'tower' dimension which we will average over below. + grads.append(expanded_g) + + # Average over the 'tower' dimension. + grad = tf.concat(0, grads) + grad = tf.reduce_mean(grad, 0) + + # Keep in mind that the Variables are redundant because they are shared + # across towers. So .. we will just return the first tower's pointer to + # the Variable. + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads + def time_tensorflow_run(session, target): num_steps_burn_in = 50 total_duration = 0.0 total_duration_squared = 0.0 for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _, loss_value = session.run(target) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus - examples_per_sec = num_examples_per_step / duration - sec_per_batch = duration - - format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' - 'sec/batch batch_size = %d)') - print (format_str % - (datetime.now(), i - num_steps_burn_in, - loss_value, duration, sec_per_batch, num_examples_per_step)) - - total_duration += duration - total_duration_squared += duration * duration + start_time = time.time() + _, loss_value = session.run(target) + duration = time.time() - start_time + if i > num_steps_burn_in: + if not i % 10: + num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus + examples_per_sec = num_examples_per_step / duration + sec_per_batch = duration + + format_str = ( + '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' + 'sec/batch batch_size = %d)') + print(format_str % + (datetime.now(), i - num_steps_burn_in, loss_value, + duration, sec_per_batch, num_examples_per_step)) + + total_duration += duration + total_duration_squared += duration * duration mn = total_duration / FLAGS.num_batches vr = total_duration_squared / FLAGS.num_batches - mn * mn sd = math.sqrt(vr) - print ('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' % + print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' % (datetime.now(), FLAGS.num_batches, mn, sd)) + def run_benchmark(): - with tf.Graph().as_default(), tf.device('/cpu:0'): - # Create a variable to count the number of train() calls. This equals the - # number of batches processed * FLAGS.num_gpus. - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(0), trainable=False) - - # Calculate the learning rate schedule. - num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / - FLAGS.batch_size) - decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) - - # Decay the learning rate exponentially based on the number of steps. - lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE, - global_step, - decay_steps, - LEARNING_RATE_DECAY_FACTOR, - staircase=True) - - # Create an optimizer that performs gradient descent. - opt = tf.train.MomentumOptimizer(lr, 0.9) - - # Calculate the gradients for each model tower. - tower_grads = [] - for i in xrange(FLAGS.num_gpus): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: - # Calculate the loss for one tower of the model. This function - # constructs the entire model but shares the variables across - # all towers. - loss = tower_loss(scope) - - # Reuse variables for the next tower. - tf.get_variable_scope().reuse_variables() - - # Retain the summaries from the final tower. 
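A note on the `average_gradients` helper reformatted above: it groups the per-tower `(gradient, variable)` pairs by variable, stacks each variable's gradients along a new leading "tower" axis, averages over that axis, and reuses the first tower's variable handle, since the variables are shared across towers. A minimal NumPy sketch of the same reduction, with hypothetical two-tower data:

```python
import numpy as np

def average_gradients(tower_grads):
    """Average (gradient, variable) pairs across towers.

    tower_grads: list over towers; each entry is a list of
    (gradient, variable_name) tuples in the same variable order.
    """
    averaged = []
    for grad_and_vars in zip(*tower_grads):      # group by variable
        grads = np.stack([g for g, _ in grad_and_vars], axis=0)
        mean_grad = grads.mean(axis=0)           # average over the tower axis
        # Variables are shared, so keep the first tower's handle.
        averaged.append((mean_grad, grad_and_vars[0][1]))
    return averaged

# Two hypothetical towers, one weight matrix and one bias vector each.
tower0 = [(np.ones((2, 2)), 'w'), (np.array([1.0, 3.0]), 'b')]
tower1 = [(np.zeros((2, 2)), 'w'), (np.array([3.0, 1.0]), 'b')]
for grad, name in average_gradients([tower0, tower1]):
    print(name, grad)    # w -> all 0.5, b -> [2. 2.]
```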
- summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) - - # Calculate the gradients for the batch of data on this tower. - grads = opt.compute_gradients(loss) - - # Keep track of the gradients across all towers. - tower_grads.append(grads) - - # We must calculate the mean of each gradient. Note that this is the - # synchronization point across all towers. - grads = average_gradients(tower_grads) - - # Apply the gradients to adjust the shared variables. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Group all updates to into a single train op. - train_op = tf.group(apply_gradient_op) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. allow_soft_placement must be set to - # True to build towers on GPU, as some of the ops do not have GPU - # implementations. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - time_tensorflow_run(sess, [train_op, loss]) + with tf.Graph().as_default(), tf.device('/cpu:0'): + # Create a variable to count the number of train() calls. This equals the + # number of batches processed * FLAGS.num_gpus. + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), + trainable=False) + + # Calculate the learning rate schedule. + num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / + FLAGS.batch_size) + decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) + + # Decay the learning rate exponentially based on the number of steps. + lr = tf.train.exponential_decay( + INITIAL_LEARNING_RATE, + global_step, + decay_steps, + LEARNING_RATE_DECAY_FACTOR, + staircase=True) + + # Create an optimizer that performs gradient descent. + opt = tf.train.MomentumOptimizer(lr, 0.9) + + # Calculate the gradients for each model tower. + tower_grads = [] + for i in xrange(FLAGS.num_gpus): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: + # Calculate the loss for one tower of the model. This function + # constructs the entire model but shares the variables across + # all towers. + loss = tower_loss(scope) + + # Reuse variables for the next tower. + tf.get_variable_scope().reuse_variables() + + # Retain the summaries from the final tower. + summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) + + # Calculate the gradients for the batch of data on this tower. + grads = opt.compute_gradients(loss) + + # Keep track of the gradients across all towers. + tower_grads.append(grads) + + # We must calculate the mean of each gradient. Note that this is the + # synchronization point across all towers. + grads = average_gradients(tower_grads) + + # Apply the gradients to adjust the shared variables. + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Group all updates to into a single train op. + train_op = tf.group(apply_gradient_op) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. allow_soft_placement must be set to + # True to build towers on GPU, as some of the ops do not have GPU + # implementations. 
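The learning-rate schedule built in `run_benchmark` above decays by `LEARNING_RATE_DECAY_FACTOR` once every `decay_steps` batches, because `staircase=True` truncates the exponent to an integer. A plain-Python sketch of the schedule; the constants used here (50000 examples per epoch, 50 epochs per decay, initial rate 0.1, factor 0.1, batch size 64) are taken from the GoogLeNet multi-GPU script later in this patch and assumed to match this file:

```python
def exponential_decay(initial_rate, global_step, decay_steps,
                      decay_factor, staircase=True):
    """Mirrors tf.train.exponential_decay for a scalar step."""
    exponent = global_step / float(decay_steps)
    if staircase:
        exponent = int(exponent)          # decay in discrete jumps
    return initial_rate * decay_factor ** exponent

# Hypothetical values matching the constants in these benchmarks.
num_batches_per_epoch = 50000 // 64       # NUM_EXAMPLES_PER_EPOCH / batch_size
decay_steps = num_batches_per_epoch * 50  # NUM_EPOCHS_PER_DECAY
for step in (0, decay_steps - 1, decay_steps, 3 * decay_steps):
    print(step, exponential_decay(0.1, step, decay_steps, 0.1))
```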
+ sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + time_tensorflow_run(sess, [train_op, loss]) def main(_): - run_benchmark() + run_benchmark() if __name__ == '__main__': - tf.app.run() + tf.app.run() diff --git a/benchmark/tensorflow/image/googlenet.py b/benchmark/tensorflow/image/googlenet.py index 097a8997b78ff55813897b7f32c4d7d931e8288d..decf855b54451efba5f6a7868fbcf631789f3572 100644 --- a/benchmark/tensorflow/image/googlenet.py +++ b/benchmark/tensorflow/image/googlenet.py @@ -8,10 +8,8 @@ import tensorflow as tf FLAGS = tf.app.flags.FLAGS -tf.app.flags.DEFINE_integer('batch_size', 128, - """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, - """Number of batches to run.""") +tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") tf.app.flags.DEFINE_boolean('forward_only', False, """Only run the forward pass.""") tf.app.flags.DEFINE_boolean('forward_backward_only', False, @@ -29,72 +27,92 @@ conv_counter = 1 pool_counter = 1 affine_counter = 1 -def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd = 0.0005): + +def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.0005): global conv_counter global parameters name = 'conv' + str(conv_counter) conv_counter += 1 with tf.name_scope(name) as scope: - kernel = tf.Variable(tf.truncated_normal([kH, kW, nIn, nOut], - dtype=tf.float32, - stddev=1e-1), name='weights') + kernel = tf.Variable( + tf.truncated_normal( + [kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1), + name='weights') if wd is not None and wd > 0: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] + strides = [1, 1, dH, dW] else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, - data_format=FLAGS.data_format) - biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), - trainable=True, name='biases') - bias = tf.reshape(tf.nn.bias_add(conv, biases, - data_format=FLAGS.data_format), - conv.get_shape()) + strides = [1, dH, dW, 1] + conv = tf.nn.conv2d( + inpOp, + kernel, + strides, + padding=padType, + data_format=FLAGS.data_format) + biases = tf.Variable( + tf.constant( + 0.0, shape=[nOut], dtype=tf.float32), + trainable=True, + name='biases') + bias = tf.reshape( + tf.nn.bias_add( + conv, biases, data_format=FLAGS.data_format), + conv.get_shape()) conv1 = tf.nn.relu(bias, name=scope) parameters += [kernel, biases] return conv1 -def _affine(inpOp, nIn, nOut, act=True, wd = 0.0005): + +def _affine(inpOp, nIn, nOut, act=True, wd=0.0005): global affine_counter global parameters name = 'affine' + str(affine_counter) affine_counter += 1 with tf.name_scope(name) as scope: - kernel = tf.Variable(tf.truncated_normal([nIn, nOut], - dtype=tf.float32, - stddev=1e-1), name='weights') + kernel = tf.Variable( + tf.truncated_normal( + [nIn, nOut], dtype=tf.float32, stddev=1e-1), + name='weights') if wd is not None and wd > 0: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) - biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), - trainable=True, name='biases') - affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else tf.matmul(inpOp, kernel) + biases + biases = tf.Variable( + tf.constant( + 0.0, shape=[nOut], 
dtype=tf.float32), + trainable=True, + name='biases') + affine1 = tf.nn.relu_layer( + inpOp, kernel, biases, + name=name) if act else tf.matmul(inpOp, kernel) + biases parameters += [kernel, biases] return affine1 + def _mpool(inpOp, kH, kW, dH, dW, padding): global pool_counter global parameters name = 'pool' + str(pool_counter) pool_counter += 1 if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool(inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.max_pool( + inpOp, + ksize=ksize, + strides=strides, + padding=padding, + data_format=FLAGS.data_format, + name=name) + def _apool(inpOp, kH, kW, dH, dW, padding): global pool_counter @@ -102,17 +120,19 @@ def _apool(inpOp, kH, kW, dH, dW, padding): name = 'pool' + str(pool_counter) pool_counter += 1 if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.avg_pool(inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.avg_pool( + inpOp, + ksize=ksize, + strides=strides, + padding=padding, + data_format=FLAGS.data_format, + name=name) + def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2): conv1 = _conv(inp, inSize, o1s, 1, 1, 1, 1, 'VALID') @@ -127,9 +147,9 @@ def _inception(inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2): pool = _conv(pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID') if FLAGS.data_format == 'NCHW': - channel_dim = 1 + channel_dim = 1 else: - channel_dim = 3 + channel_dim = 3 incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool]) return incept @@ -139,40 +159,40 @@ def loss(logits, labels): labels = tf.expand_dims(labels, 1) indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) concated = tf.concat(1, [indices, labels]) - onehot_labels = tf.sparse_to_dense( - concated, tf.pack([batch_size, 1000]), 1.0, 0.0) - cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, - onehot_labels, - name='xentropy') + onehot_labels = tf.sparse_to_dense(concated, + tf.pack([batch_size, 1000]), 1.0, 0.0) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits( + logits, onehot_labels, name='xentropy') loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') return loss + def inference(images): # stage 1 - conv1 = _conv (images, 3, 64, 7, 7, 2, 2, 'SAME') - pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME') + conv1 = _conv(images, 3, 64, 7, 7, 2, 2, 'SAME') + pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME') # stage 2 - conv2 = _conv (pool1, 64, 64, 1, 1, 1, 1, 'VALID') - conv3 = _conv (conv2, 64, 192, 3, 3, 1, 1, 'SAME') - pool3 = _mpool(conv3, 3, 3, 2, 2, 'SAME') + conv2 = _conv(pool1, 64, 64, 1, 1, 1, 1, 'VALID') + conv3 = _conv(conv2, 64, 192, 3, 3, 1, 1, 'SAME') + pool3 = _mpool(conv3, 3, 3, 2, 2, 'SAME') # stage 3 - incept3a = _inception(pool3, 192, 64, 96, 128, 16, 32, 3, 32) + incept3a = _inception(pool3, 192, 64, 96, 128, 16, 32, 3, 32) incept3b = _inception(incept3a, 256, 128, 128, 192, 32, 96, 3, 64) - pool4 = _mpool(incept3b, 3, 3, 2, 2, 'SAME') + pool4 = _mpool(incept3b, 3, 3, 2, 2, 'SAME') # stage 4 - incept4a = _inception(pool4, 480, 192, 96, 208, 16, 48, 3, 64) + 
incept4a = _inception(pool4, 480, 192, 96, 208, 16, 48, 3, 64) incept4b = _inception(incept4a, 512, 160, 112, 224, 24, 64, 3, 64) incept4c = _inception(incept4b, 512, 128, 128, 256, 24, 64, 3, 64) incept4d = _inception(incept4c, 512, 112, 144, 288, 32, 64, 3, 64) incept4e = _inception(incept4d, 528, 256, 160, 320, 32, 128, 3, 128) - pool5 = _mpool(incept4e, 3, 3, 2, 2, 'SAME') + pool5 = _mpool(incept4e, 3, 3, 2, 2, 'SAME') # stage 5 - incept5a = _inception(pool5, 832, 256, 160, 320, 32, 128, 3, 128) + incept5a = _inception(pool5, 832, 256, 160, 320, 32, 128, 3, 128) incept5b = _inception(incept5a, 832, 384, 192, 384, 48, 128, 3, 128) - pool6 = _apool(incept5b, 7, 7, 1, 1, 'VALID') + pool6 = _apool(incept5b, 7, 7, 1, 1, 'VALID') # output 1 resh1 = tf.reshape(pool6, [-1, 1024]) @@ -183,100 +203,109 @@ def inference(images): def time_tensorflow_run(session, target, info_string): - num_steps_burn_in = 10 - total_duration = 0.0 - total_duration_squared = 0.0 - if not isinstance(target, list): - target = [target] - target_op = tf.group(*target) - for i in range(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _ = session.run(target_op) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - print ('%s: step %d, duration = %.3f' % - (datetime.now(), i - num_steps_burn_in, duration)) - total_duration += duration - total_duration_squared += duration * duration - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) + num_steps_burn_in = 10 + total_duration = 0.0 + total_duration_squared = 0.0 + if not isinstance(target, list): + target = [target] + target_op = tf.group(*target) + for i in range(FLAGS.num_batches + num_steps_burn_in): + start_time = time.time() + _ = session.run(target_op) + duration = time.time() - start_time + if i > num_steps_burn_in: + if not i % 10: + print('%s: step %d, duration = %.3f' % + (datetime.now(), i - num_steps_burn_in, duration)) + total_duration += duration + total_duration_squared += duration * duration + mn = total_duration / FLAGS.num_batches + vr = total_duration_squared / FLAGS.num_batches - mn * mn + sd = math.sqrt(vr) + print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % + (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) + def run_benchmark(): - global parameters - with tf.Graph().as_default(): - # Generate some dummy images. - image_size = 224 - if FLAGS.data_format == 'NCHW': - image_shape = [FLAGS.batch_size, 3, image_size, image_size] - else: - image_shape = [FLAGS.batch_size, image_size, image_size, 3] - - images = tf.get_variable('image', image_shape, - initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable('label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) - - # Build a Graph that computes the logits predictions from the - # inference model. - last_layer = inference(images) - - objective = loss(last_layer, labels) - - # Compute gradients. 
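`time_tensorflow_run` above follows the usual micro-benchmark pattern: skip a burn-in period, then keep a running sum and sum of squares of per-batch wall time so that mean and standard deviation fall out in a single pass. One quirk the reformat preserves: the `i > num_steps_burn_in` test records only `num_batches - 1` samples while the statistics divide by `num_batches`. A framework-free sketch of the same bookkeeping, with that off-by-one corrected (`>=`):

```python
import math
import time

def time_run(step_fn, num_batches, num_steps_burn_in=10):
    """Time step_fn per call, ignoring the first burn-in iterations."""
    total, total_sq, n = 0.0, 0.0, 0
    for i in range(num_batches + num_steps_burn_in):
        start = time.time()
        step_fn()
        duration = time.time() - start
        if i >= num_steps_burn_in:   # '>=' records exactly num_batches samples
            total += duration
            total_sq += duration * duration
            n += 1
    mean = total / n
    std = math.sqrt(max(total_sq / n - mean * mean, 0.0))
    return mean, std

print(time_run(lambda: sum(range(10000)), num_batches=100))
```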
- # opt = tf.train.GradientDescentOptimizer(0.001) - opt = tf.train.MomentumOptimizer(0.001, 0.9) - grads = opt.compute_gradients(objective) - global_step = tf.get_variable('global_step', [], - initializer=tf.constant_initializer(0.0, dtype=tf.float32), - trainable=False, dtype=tf.float32) - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Track the moving averages of all trainable variables. - variable_averages = tf.train.ExponentialMovingAverage( - 0.9, global_step) - variables_averages_op = variable_averages.apply(tf.trainable_variables()) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - - run_forward = True - run_forward_backward = True - if FLAGS.forward_only and FLAGS.forward_backward_only: - raise ValueError("Cannot specify --forward_only and " - "--forward_backward_only at the same time.") - if FLAGS.forward_only: - run_forward_backward = False - elif FLAGS.forward_backward_only: - run_forward = False - - if run_forward: - # Run the forward benchmark. - time_tensorflow_run(sess, last_layer, "Forward") - - if run_forward_backward: - with tf.control_dependencies([apply_gradient_op, variables_averages_op]): - train_op = tf.no_op(name='train') - time_tensorflow_run(sess, [train_op, objective], "Forward-backward") + global parameters + with tf.Graph().as_default(): + # Generate some dummy images. + image_size = 224 + if FLAGS.data_format == 'NCHW': + image_shape = [FLAGS.batch_size, 3, image_size, image_size] + else: + image_shape = [FLAGS.batch_size, image_size, image_size, 3] + + images = tf.get_variable( + 'image', + image_shape, + initializer=tf.truncated_normal_initializer( + stddev=0.1, dtype=tf.float32), + dtype=tf.float32, + trainable=False) + + labels = tf.get_variable( + 'label', [FLAGS.batch_size], + initializer=tf.constant_initializer(1), + dtype=tf.int32, + trainable=False) + + # Build a Graph that computes the logits predictions from the + # inference model. + last_layer = inference(images) + + objective = loss(last_layer, labels) + + # Compute gradients. + # opt = tf.train.GradientDescentOptimizer(0.001) + opt = tf.train.MomentumOptimizer(0.001, 0.9) + grads = opt.compute_gradients(objective) + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer( + 0.0, dtype=tf.float32), + trainable=False, + dtype=tf.float32) + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Track the moving averages of all trainable variables. + variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step) + variables_averages_op = variable_averages.apply(tf.trainable_variables( + )) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. + sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + + run_forward = True + run_forward_backward = True + if FLAGS.forward_only and FLAGS.forward_backward_only: + raise ValueError("Cannot specify --forward_only and " + "--forward_backward_only at the same time.") + if FLAGS.forward_only: + run_forward_backward = False + elif FLAGS.forward_backward_only: + run_forward = False + + if run_forward: + # Run the forward benchmark. 
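The `ExponentialMovingAverage(0.9, global_step)` applied above maintains a shadow copy of each trainable variable, updated as `shadow = decay * shadow + (1 - decay) * value`; when a step counter is supplied, TensorFlow additionally caps the decay at `(1 + step) / (10 + step)` so that early shadows track the raw values more closely. A scalar sketch of that update rule, with a hypothetical fixed weight:

```python
def ema_decay(base_decay, step):
    # With a step counter, TF caps the decay early in training.
    return min(base_decay, (1.0 + step) / (10.0 + step))

shadow, value = 0.0, 1.0   # hypothetical shadow copy and live weight
for step in range(5):
    d = ema_decay(0.9, step)
    shadow = d * shadow + (1.0 - d) * value
    print(step, round(d, 3), round(shadow, 4))
```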
+ time_tensorflow_run(sess, last_layer, "Forward") + + if run_forward_backward: + with tf.control_dependencies( + [apply_gradient_op, variables_averages_op]): + train_op = tf.no_op(name='train') + time_tensorflow_run(sess, [train_op, objective], "Forward-backward") def main(_): - run_benchmark() + run_benchmark() if __name__ == '__main__': - tf.app.run() + tf.app.run() diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py index e22a6b6253eedcbc2680309a29de10c9dd2bf4ff..31466faa37c47c66e4fe4628e28c867875e89f2e 100644 --- a/benchmark/tensorflow/image/googlenet_multi_gpu.py +++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py @@ -9,10 +9,8 @@ import tensorflow as tf FLAGS = tf.app.flags.FLAGS -tf.app.flags.DEFINE_integer('batch_size', 64, - """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, - """Number of batches to run.""") +tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") tf.app.flags.DEFINE_string('data_format', 'NCHW', """The data format for Convnet operations. Can be either NHWC or NCHW. @@ -21,97 +19,117 @@ tf.app.flags.DEFINE_string('data_format', 'NCHW', tf.app.flags.DEFINE_string('train_dir', '/train_model', """Directory where to write event logs """ """and checkpoint.""") -tf.app.flags.DEFINE_integer('num_gpus', 4, - """How many GPUs to use.""") +tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""") tf.app.flags.DEFINE_boolean('log_device_placement', False, """Whether to log device placement.""") -NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN=50000 -NUM_EPOCHS_PER_DECAY=50 -INITIAL_LEARNING_RATE = 0.1 +NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000 +NUM_EPOCHS_PER_DECAY = 50 +INITIAL_LEARNING_RATE = 0.1 LEARNING_RATE_DECAY_FACTOR = 0.1 TOWER_NAME = 'tower' def _conv(name, inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005): with tf.name_scope(name) as scope: - kernel = tf.get_variable(name + '_w',[kH, kW, nIn, nOut], - initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), - dtype=tf.float32) + kernel = tf.get_variable( + name + '_w', [kH, kW, nIn, nOut], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), + dtype=tf.float32) if wd is not None: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] + strides = [1, 1, dH, dW] else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, - data_format=FLAGS.data_format) - - biases = tf.get_variable(name=name + '_b', shape=[nOut], - initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), + strides = [1, dH, dW, 1] + conv = tf.nn.conv2d( + inpOp, + kernel, + strides, + padding=padType, + data_format=FLAGS.data_format) + + biases = tf.get_variable( + name=name + '_b', + shape=[nOut], + initializer=tf.constant_initializer( + value=0.0, dtype=tf.float32), dtype=tf.float32) bias = tf.reshape( - tf.nn.bias_add(conv, biases, data_format=FLAGS.data_format), + tf.nn.bias_add( + conv, biases, data_format=FLAGS.data_format), conv.get_shape()) conv1 = tf.nn.relu(bias, name=scope) return conv1 + def _affine(name, inpOp, nIn, nOut, wd=0.005, act=True): with tf.name_scope(name) as scope: - kernel = tf.get_variable(name + '_w', [nIn, nOut], - initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32), + kernel = tf.get_variable( + name + '_w', [nIn, 
nOut], + initializer=tf.truncated_normal_initializer( + stddev=0.01, dtype=tf.float32), dtype=tf.float32) if wd is not None: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) - biases = tf.get_variable(name + '_b', [nOut], - initializer=tf.constant_initializer(value=0.0, dtype=tf.float32), - dtype=tf.float32,trainable=True) + biases = tf.get_variable( + name + '_b', [nOut], + initializer=tf.constant_initializer( + value=0.0, dtype=tf.float32), + dtype=tf.float32, + trainable=True) affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \ tf.matmul(inpOp, kernel) + biases return affine1 + def _mpool(name, inpOp, kH, kW, dH, dW, padding): if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool(inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.max_pool( + inpOp, + ksize=ksize, + strides=strides, + padding=padding, + data_format=FLAGS.data_format, + name=name) + def _apool(name, inpOp, kH, kW, dH, dW, padding): if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.avg_pool(inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.avg_pool( + inpOp, + ksize=ksize, + strides=strides, + padding=padding, + data_format=FLAGS.data_format, + name=name) + def loss(logits, labels): labels = tf.cast(labels, tf.int64) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( - logits, labels, name='cross_entropy_per_example') + logits, labels, name='cross_entropy_per_example') cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') tf.add_to_collection('losses', cross_entropy_mean) @@ -131,7 +149,7 @@ def get_incoming_shape(incoming): def _inception(name, inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2): - conv1 = _conv(name + '_1' , inp, inSize, o1s, 1, 1, 1, 1, 'VALID') + conv1 = _conv(name + '_1', inp, inSize, o1s, 1, 1, 1, 1, 'VALID') conv3_ = _conv(name + '_3r', inp, inSize, o2s1, 1, 1, 1, 1, 'VALID') conv3 = _conv(name + '_3', conv3_, o2s1, o2s2, 3, 3, 1, 1, 'SAME') @@ -143,40 +161,42 @@ def _inception(name, inp, inSize, o1s, o2s1, o2s2, o3s1, o3s2, o4s1, o4s2): pool = _conv(name + 'proj', pool_, inSize, o4s2, 1, 1, 1, 1, 'VALID') if FLAGS.data_format == 'NCHW': - channel_dim = 1 + channel_dim = 1 else: - channel_dim = 3 + channel_dim = 3 incept = tf.concat(channel_dim, [conv1, conv3, conv5, pool]) return incept def inference(images): # stage 1 - conv1 = _conv ('conv1', images, 3, 64, 7, 7, 2, 2, 'SAME') - pool1 = _mpool('pool1', conv1, 3, 3, 2, 2, 'SAME') + conv1 = _conv('conv1', images, 3, 64, 7, 7, 2, 2, 'SAME') + pool1 = _mpool('pool1', conv1, 3, 3, 2, 2, 'SAME') # stage 2 - conv2 = _conv ('conv2', pool1, 64, 64, 1, 1, 1, 1, 'VALID') - conv3 = _conv ('conv3', conv2, 64, 192, 3, 3, 1, 1, 'SAME') - pool3 = _mpool('pool3', conv3, 3, 3, 2, 2, 'SAME') + conv2 = _conv('conv2', pool1, 64, 64, 1, 1, 1, 1, 'VALID') + conv3 = _conv('conv3', conv2, 64, 192, 3, 3, 1, 1, 'SAME') + pool3 = _mpool('pool3', conv3, 3, 3, 2, 2, 'SAME') # stage 3 - incept3a = 
_inception('ince3a', pool3, 192, 64, 96, 128, 16, 32, 3, 32) + incept3a = _inception('ince3a', pool3, 192, 64, 96, 128, 16, 32, 3, 32) incept3b = _inception('ince3b', incept3a, 256, 128, 128, 192, 32, 96, 3, 64) - pool4 = _mpool('pool4', incept3b, 3, 3, 2, 2, 'SAME') + pool4 = _mpool('pool4', incept3b, 3, 3, 2, 2, 'SAME') # stage 4 - incept4a = _inception('ince4a', pool4, 480, 192, 96, 208, 16, 48, 3, 64) + incept4a = _inception('ince4a', pool4, 480, 192, 96, 208, 16, 48, 3, 64) incept4b = _inception('ince4b', incept4a, 512, 160, 112, 224, 24, 64, 3, 64) incept4c = _inception('ince4c', incept4b, 512, 128, 128, 256, 24, 64, 3, 64) incept4d = _inception('ince4d', incept4c, 512, 112, 144, 288, 32, 64, 3, 64) - incept4e = _inception('ince4e', incept4d, 528, 256, 160, 320, 32, 128, 3, 128) - pool5 = _mpool('pool5', incept4e, 3, 3, 2, 2, 'SAME') + incept4e = _inception('ince4e', incept4d, 528, 256, 160, 320, 32, 128, 3, + 128) + pool5 = _mpool('pool5', incept4e, 3, 3, 2, 2, 'SAME') # stage 5 - incept5a = _inception('ince5a', pool5, 832, 256, 160, 320, 32, 128, 3, 128) - incept5b = _inception('ince5b', incept5a, 832, 384, 192, 384, 48, 128, 3, 128) - pool6 = _apool('pool6', incept5b, 7, 7, 1, 1, 'VALID') + incept5a = _inception('ince5a', pool5, 832, 256, 160, 320, 32, 128, 3, 128) + incept5b = _inception('ince5b', incept5a, 832, 384, 192, 384, 48, 128, 3, + 128) + pool6 = _apool('pool6', incept5b, 7, 7, 1, 1, 'VALID') # output 1 resh1 = tf.reshape(pool6, [-1, 1024]) @@ -185,6 +205,7 @@ def inference(images): return affn1 + def tower_loss(scope): """Calculate the total loss on a single tower running the model. Args: @@ -197,15 +218,19 @@ def tower_loss(scope): image_shape = [FLAGS.batch_size, 3, image_size, image_size] else: image_shape = [FLAGS.batch_size, image_size, image_size, 3] - images = tf.get_variable('image', image_shape, - initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable('label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) + images = tf.get_variable( + 'image', + image_shape, + initializer=tf.truncated_normal_initializer( + stddev=0.1, dtype=tf.float32), + dtype=tf.float32, + trainable=False) + + labels = tf.get_variable( + 'label', [FLAGS.batch_size], + initializer=tf.constant_initializer(1), + dtype=tf.int32, + trainable=False) # Build a Graph that computes the logits predictions from the # inference model. @@ -214,7 +239,7 @@ def tower_loss(scope): # Build the portion of the Graph calculating the losses. Note that we will # assemble the total_loss using a custom function below. _ = loss(last_layer, labels) - + # Assemble all of the losses for the current tower only. losses = tf.get_collection('losses', scope) @@ -233,7 +258,7 @@ def tower_loss(scope): loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name) # Name each loss as '(raw)' and name the moving average version of the loss # as the original loss name. - tf.scalar_summary(loss_name +' (raw)', l) + tf.scalar_summary(loss_name + ' (raw)', l) tf.scalar_summary(loss_name, loss_averages.average(l)) with tf.control_dependencies([loss_averages_op]): @@ -242,7 +267,7 @@ def tower_loss(scope): def average_gradients(tower_grads): - """Calculate the average gradient for each shared variable across all towers. + """Calculate the average gradient for each shared variable across all towers. Note that this function provides a synchronization point across all towers. 
Args: tower_grads: List of lists of (gradient, variable) tuples. The outer list @@ -252,130 +277,135 @@ def average_gradients(tower_grads): List of pairs of (gradient, variable) where the gradient has been averaged across all towers. """ - average_grads = [] - for grad_and_vars in zip(*tower_grads): - # Note that each grad_and_vars looks like the following: - # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) - grads = [] - for g, _ in grad_and_vars: - # Add 0 dimension to the gradients to represent the tower. - expanded_g = tf.expand_dims(g, 0) - - # Append on a 'tower' dimension which we will average over below. - grads.append(expanded_g) - - # Average over the 'tower' dimension. - grad = tf.concat(0, grads) - grad = tf.reduce_mean(grad, 0) - - # Keep in mind that the Variables are redundant because they are shared - # across towers. So .. we will just return the first tower's pointer to - # the Variable. - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads + average_grads = [] + for grad_and_vars in zip(*tower_grads): + # Note that each grad_and_vars looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + grads = [] + for g, _ in grad_and_vars: + # Add 0 dimension to the gradients to represent the tower. + expanded_g = tf.expand_dims(g, 0) + + # Append on a 'tower' dimension which we will average over below. + grads.append(expanded_g) + + # Average over the 'tower' dimension. + grad = tf.concat(0, grads) + grad = tf.reduce_mean(grad, 0) + + # Keep in mind that the Variables are redundant because they are shared + # across towers. So .. we will just return the first tower's pointer to + # the Variable. + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads.append(grad_and_var) + return average_grads + def time_tensorflow_run(session, target): num_steps_burn_in = 50 total_duration = 0.0 total_duration_squared = 0.0 for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _, loss_value = session.run(target) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus - examples_per_sec = num_examples_per_step / duration - sec_per_batch = duration - - format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' - 'sec/batch batch_size = %d)') - print (format_str % - (datetime.now(), i - num_steps_burn_in, - loss_value, duration, sec_per_batch, num_examples_per_step)) - - total_duration += duration - total_duration_squared += duration * duration + start_time = time.time() + _, loss_value = session.run(target) + duration = time.time() - start_time + if i > num_steps_burn_in: + if not i % 10: + num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus + examples_per_sec = num_examples_per_step / duration + sec_per_batch = duration + + format_str = ( + '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' + 'sec/batch batch_size = %d)') + print(format_str % + (datetime.now(), i - num_steps_burn_in, loss_value, + duration, sec_per_batch, num_examples_per_step)) + + total_duration += duration + total_duration_squared += duration * duration mn = total_duration / FLAGS.num_batches vr = total_duration_squared / FLAGS.num_batches - mn * mn sd = math.sqrt(vr) - print ('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' % + print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' % (datetime.now(), FLAGS.num_batches, mn, sd)) + def run_benchmark(): - with 
tf.Graph().as_default(), tf.device('/cpu:0'): - # Create a variable to count the number of train() calls. This equals the - # number of batches processed * FLAGS.num_gpus. - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(0), trainable=False) - - # Calculate the learning rate schedule. - num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / - FLAGS.batch_size) - decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) - - # Decay the learning rate exponentially based on the number of steps. - lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE, - global_step, - decay_steps, - LEARNING_RATE_DECAY_FACTOR, - staircase=True) - - # Create an optimizer that performs gradient descent. - opt = tf.train.MomentumOptimizer(lr, 0.9) - - # Calculate the gradients for each model tower. - tower_grads = [] - for i in xrange(FLAGS.num_gpus): - with tf.device('/gpu:%d' % i): - with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: - # Calculate the loss for one tower of the model. This function - # constructs the entire model but shares the variables across - # all towers. - loss = tower_loss(scope) - - # Reuse variables for the next tower. - tf.get_variable_scope().reuse_variables() - - # Retain the summaries from the final tower. - summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) - - # Calculate the gradients for the batch of data on this tower. - grads = opt.compute_gradients(loss) - - # Keep track of the gradients across all towers. - tower_grads.append(grads) - - # We must calculate the mean of each gradient. Note that this is the - # synchronization point across all towers. - grads = average_gradients(tower_grads) - - # Apply the gradients to adjust the shared variables. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Group all updates to into a single train op. - train_op = tf.group(apply_gradient_op) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. allow_soft_placement must be set to - # True to build towers on GPU, as some of the ops do not have GPU - # implementations. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - time_tensorflow_run(sess, [train_op, loss]) + with tf.Graph().as_default(), tf.device('/cpu:0'): + # Create a variable to count the number of train() calls. This equals the + # number of batches processed * FLAGS.num_gpus. + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), + trainable=False) + + # Calculate the learning rate schedule. + num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / + FLAGS.batch_size) + decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) + + # Decay the learning rate exponentially based on the number of steps. + lr = tf.train.exponential_decay( + INITIAL_LEARNING_RATE, + global_step, + decay_steps, + LEARNING_RATE_DECAY_FACTOR, + staircase=True) + + # Create an optimizer that performs gradient descent. + opt = tf.train.MomentumOptimizer(lr, 0.9) + + # Calculate the gradients for each model tower. + tower_grads = [] + for i in xrange(FLAGS.num_gpus): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: + # Calculate the loss for one tower of the model. This function + # constructs the entire model but shares the variables across + # all towers. 
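One pre-existing wart that the reformat carries over in both multi-GPU `time_tensorflow_run` variants: `examples_per_sec` is computed but never printed; the `%.1f examples/sec` slot in the format string receives `duration` instead. If throughput reporting is the intent, the logging call inside the loop would presumably read:

```python
# Corrected logging call (uses the function's own local variables).
format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
              'sec/batch batch_size = %d)')
print(format_str % (datetime.now(), i - num_steps_burn_in, loss_value,
                    examples_per_sec, sec_per_batch, num_examples_per_step))
```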
+ loss = tower_loss(scope) + + # Reuse variables for the next tower. + tf.get_variable_scope().reuse_variables() + + # Retain the summaries from the final tower. + summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) + + # Calculate the gradients for the batch of data on this tower. + grads = opt.compute_gradients(loss) + + # Keep track of the gradients across all towers. + tower_grads.append(grads) + + # We must calculate the mean of each gradient. Note that this is the + # synchronization point across all towers. + grads = average_gradients(tower_grads) + + # Apply the gradients to adjust the shared variables. + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Group all updates to into a single train op. + train_op = tf.group(apply_gradient_op) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. allow_soft_placement must be set to + # True to build towers on GPU, as some of the ops do not have GPU + # implementations. + sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + time_tensorflow_run(sess, [train_op, loss]) def main(_): - run_benchmark() + run_benchmark() if __name__ == '__main__': - tf.app.run() + tf.app.run() diff --git a/benchmark/tensorflow/image/smallnet_mnist_cifar.py b/benchmark/tensorflow/image/smallnet_mnist_cifar.py index 679dd1ab32293f73481dfcc03f6491af95519f94..1a625134a6c58586b29190ede9c66253f484d2cf 100644 --- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py +++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py @@ -8,10 +8,8 @@ import tensorflow as tf FLAGS = tf.app.flags.FLAGS -tf.app.flags.DEFINE_integer('batch_size', 128, - """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, - """Number of batches to run.""") +tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") tf.app.flags.DEFINE_boolean('forward_only', False, """Only run the forward pass.""") tf.app.flags.DEFINE_boolean('forward_backward_only', False, @@ -29,78 +27,97 @@ conv_counter = 1 pool_counter = 1 affine_counter = 1 + def _conv(inpOp, nIn, nOut, kH, kW, dH, dW, padType, wd=0.005, act=True): global conv_counter global parameters name = 'conv' + str(conv_counter) conv_counter += 1 with tf.name_scope(name) as scope: - kernel = tf.Variable(tf.truncated_normal([kH, kW, nIn, nOut], - dtype=tf.float32, - stddev=1e-1), name='weights') + kernel = tf.Variable( + tf.truncated_normal( + [kH, kW, nIn, nOut], dtype=tf.float32, stddev=1e-1), + name='weights') if wd is not None: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) if FLAGS.data_format == 'NCHW': - strides = [1, 1, dH, dW] + strides = [1, 1, dH, dW] else: - strides = [1, dH, dW, 1] - conv = tf.nn.conv2d(inpOp, kernel, strides, padding=padType, - data_format=FLAGS.data_format) - biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), - trainable=True, name='biases') - bias = tf.reshape(tf.nn.bias_add(conv, biases, - data_format=FLAGS.data_format), - conv.get_shape()) + strides = [1, dH, dW, 1] + conv = tf.nn.conv2d( + inpOp, + kernel, + strides, + padding=padType, + data_format=FLAGS.data_format) + biases = tf.Variable( + tf.constant( + 0.0, shape=[nOut], dtype=tf.float32), + trainable=True, + name='biases') + bias = tf.reshape( + tf.nn.bias_add( + conv, biases, 
data_format=FLAGS.data_format), + conv.get_shape()) conv1 = tf.nn.relu(bias, name=scope) if act else bias - + parameters += [kernel, biases] return conv1 + def _affine(inpOp, nIn, nOut, wd=None, act=True): global affine_counter global parameters name = 'affine' + str(affine_counter) affine_counter += 1 with tf.name_scope(name) as scope: - kernel = tf.Variable(tf.truncated_normal([nIn, nOut], - dtype=tf.float32, - stddev=1e-1), name='weights') + kernel = tf.Variable( + tf.truncated_normal( + [nIn, nOut], dtype=tf.float32, stddev=1e-1), + name='weights') if wd is not None: weight_decay = tf.mul(tf.nn.l2_loss(kernel), wd, name='weight_loss') tf.add_to_collection('losses', weight_decay) - biases = tf.Variable(tf.constant(0.0, shape=[nOut], dtype=tf.float32), - trainable=True, name='biases') + biases = tf.Variable( + tf.constant( + 0.0, shape=[nOut], dtype=tf.float32), + trainable=True, + name='biases') - affine1 = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else tf.matmul(inpOp, kernel) + biases + affine1 = tf.nn.relu_layer( + inpOp, kernel, biases, + name=name) if act else tf.matmul(inpOp, kernel) + biases parameters += [kernel, biases] return affine1 + def _mpool(inpOp, kH, kW, dH, dW, padding): global pool_counter global parameters name = 'pool' + str(pool_counter) pool_counter += 1 if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.max_pool(inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.max_pool( + inpOp, + ksize=ksize, + strides=strides, + padding=padding, + data_format=FLAGS.data_format, + name=name) def _apool(inpOp, kH, kW, dH, dW, padding): @@ -109,36 +126,42 @@ def _apool(inpOp, kH, kW, dH, dW, padding): name = 'pool' + str(pool_counter) pool_counter += 1 if FLAGS.data_format == 'NCHW': - ksize = [1, 1, kH, kW] - strides = [1, 1, dH, dW] + ksize = [1, 1, kH, kW] + strides = [1, 1, dH, dW] else: - ksize = [1, kH, kW, 1] - strides = [1, dH, dW, 1] - return tf.nn.avg_pool(inpOp, - ksize=ksize, - strides=strides, - padding=padding, - data_format=FLAGS.data_format, - name=name) + ksize = [1, kH, kW, 1] + strides = [1, dH, dW, 1] + return tf.nn.avg_pool( + inpOp, + ksize=ksize, + strides=strides, + padding=padding, + data_format=FLAGS.data_format, + name=name) + def _norm(name, l_input, lsize=4): - return tf.nn.lrn(l_input, lsize, bias=1.0, + return tf.nn.lrn(l_input, + lsize, + bias=1.0, alpha=0.001 / 9.0, - beta=0.75, name=name) + beta=0.75, + name=name) + def loss(logits, labels): batch_size = tf.size(labels) labels = tf.expand_dims(labels, 1) indices = tf.expand_dims(tf.range(0, batch_size, 1), 1) concated = tf.concat(1, [indices, labels]) - onehot_labels = tf.sparse_to_dense( - concated, tf.pack([batch_size, 10]), 1.0, 0.0) - cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, - onehot_labels, - name='xentropy') + onehot_labels = tf.sparse_to_dense(concated, + tf.pack([batch_size, 10]), 1.0, 0.0) + cross_entropy = tf.nn.softmax_cross_entropy_with_logits( + logits, onehot_labels, name='xentropy') loss = tf.reduce_mean(cross_entropy, name='xentropy_mean') return loss + def get_incoming_shape(incoming): """ Returns the incoming data shape """ if isinstance(incoming, tf.Tensor): @@ -148,125 +171,134 @@ def get_incoming_shape(incoming): else: raise Exception("Invalid incoming layer.") 
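The `loss` functions in these single-GPU scripts build one-hot targets the old TF 0.x way: pair each row index with its label via `tf.concat(1, [indices, labels])`, then scatter 1.0 into a dense `[batch_size, num_classes]` tensor with `tf.sparse_to_dense`. The equivalent NumPy construction, with a hypothetical batch of labels, makes the index plumbing easier to follow:

```python
import numpy as np

def one_hot(labels, num_classes):
    """Scatter 1.0 at (row, label) -- what concat + sparse_to_dense builds."""
    batch_size = labels.shape[0]
    dense = np.zeros((batch_size, num_classes), dtype=np.float32)
    dense[np.arange(batch_size), labels] = 1.0
    return dense

labels = np.array([3, 0, 9])   # hypothetical class indices, 10 classes
print(one_hot(labels, 10))
```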
+ def inference(images): - conv1 = _conv (images, 3, 32, 5, 5, 1, 1, 'SAME') - pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME') - conv2 = _conv (pool1, 32, 32, 5, 5, 1, 1, 'SAME') - pool2 = _apool(conv2, 3, 3, 2, 2, 'SAME') - conv3 = _conv (pool2, 32, 64, 5, 5, 1, 1, 'SAME') - pool3 = _apool(conv3, 3, 3, 2, 2, 'SAME') + conv1 = _conv(images, 3, 32, 5, 5, 1, 1, 'SAME') + pool1 = _mpool(conv1, 3, 3, 2, 2, 'SAME') + conv2 = _conv(pool1, 32, 32, 5, 5, 1, 1, 'SAME') + pool2 = _apool(conv2, 3, 3, 2, 2, 'SAME') + conv3 = _conv(pool2, 32, 64, 5, 5, 1, 1, 'SAME') + pool3 = _apool(conv3, 3, 3, 2, 2, 'SAME') resh1 = tf.reshape(pool3, [-1, 64 * 4 * 4]) affn1 = _affine(resh1, 64 * 4 * 4, 64) affn2 = _affine(affn1, 64, 10, act=False) - print ('conv1:', get_incoming_shape(conv1)) - print ('pool1:', get_incoming_shape(pool1)) - print ('conv2:', get_incoming_shape(conv2)) - print ('pool2:', get_incoming_shape(pool2)) - print ('conv3:', get_incoming_shape(conv3)) - print ('pool3:', get_incoming_shape(pool3)) - + print('conv1:', get_incoming_shape(conv1)) + print('pool1:', get_incoming_shape(pool1)) + print('conv2:', get_incoming_shape(conv2)) + print('pool2:', get_incoming_shape(pool2)) + print('conv3:', get_incoming_shape(conv3)) + print('pool3:', get_incoming_shape(pool3)) + return affn2 def time_tensorflow_run(session, target, info_string): - num_steps_burn_in = 10 - total_duration = 0.0 - total_duration_squared = 0.0 - if not isinstance(target, list): - target = [target] - target_op = tf.group(*target) - for i in xrange(FLAGS.num_batches + num_steps_burn_in): - start_time = time.time() - _ = session.run(target_op) - duration = time.time() - start_time - if i > num_steps_burn_in: - if not i % 10: - print ('%s: step %d, duration = %.3f' % - (datetime.now(), i - num_steps_burn_in, duration)) - total_duration += duration - total_duration_squared += duration * duration - mn = total_duration / FLAGS.num_batches - vr = total_duration_squared / FLAGS.num_batches - mn * mn - sd = math.sqrt(vr) - print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % - (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) + num_steps_burn_in = 10 + total_duration = 0.0 + total_duration_squared = 0.0 + if not isinstance(target, list): + target = [target] + target_op = tf.group(*target) + for i in xrange(FLAGS.num_batches + num_steps_burn_in): + start_time = time.time() + _ = session.run(target_op) + duration = time.time() - start_time + if i > num_steps_burn_in: + if not i % 10: + print('%s: step %d, duration = %.3f' % + (datetime.now(), i - num_steps_burn_in, duration)) + total_duration += duration + total_duration_squared += duration * duration + mn = total_duration / FLAGS.num_batches + vr = total_duration_squared / FLAGS.num_batches - mn * mn + sd = math.sqrt(vr) + print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % + (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) + def run_benchmark(): - global parameters - with tf.Graph().as_default(): - # Generate some dummy images. - image_size = 32 - # Note that our padding definition is slightly different the cuda-convnet. - # In order to force the model to start with the same activations sizes, - # we add 3 to the image_size and employ VALID padding above. 
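The hard-coded `tf.reshape(pool3, [-1, 64 * 4 * 4])` in `inference` above depends on the spatial size halving at each of the three SAME-padded, stride-2 pools: 32 -> 16 -> 8 -> 4, with 64 channels at the end. A quick check of that arithmetic:

```python
import math

size, channels = 32, 64
for pool in range(3):                 # three stride-2, SAME-padded pools
    size = math.ceil(size / 2)        # SAME pooling: ceil(size / stride)
print(size, channels * size * size)   # 4 -> 1024 == 64 * 4 * 4
```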
- if FLAGS.data_format == 'NCHW': - image_shape = [FLAGS.batch_size, 3, image_size, image_size] - else: - image_shape = [FLAGS.batch_size, image_size, image_size, 3] - - images = tf.get_variable('image', image_shape, - initializer=tf.truncated_normal_initializer(stddev=0.1, dtype=tf.float32), - dtype=tf.float32, - trainable=False) - - labels = tf.get_variable('label', [FLAGS.batch_size], - initializer=tf.constant_initializer(1), - dtype=tf.int32, - trainable=False) - - # Build a Graph that computes the logits predictions from the - # inference model. - last_layer = inference(images) - - objective = loss(last_layer, labels) - - # Compute gradients. - opt = tf.train.MomentumOptimizer(0.001, 0.9) - grads = opt.compute_gradients(objective) - global_step = tf.get_variable('global_step', [], - initializer=tf.constant_initializer(0.0, dtype=tf.float32), - trainable=False, dtype=tf.float32) - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Track the moving averages of all trainable variables. - variable_averages = tf.train.ExponentialMovingAverage( - 0.9, global_step) - variables_averages_op = variable_averages.apply(tf.trainable_variables()) - - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - - run_forward = True - run_forward_backward = True - if FLAGS.forward_only and FLAGS.forward_backward_only: - raise ValueError("Cannot specify --forward_only and " - "--forward_backward_only at the same time.") - if FLAGS.forward_only: - run_forward_backward = False - elif FLAGS.forward_backward_only: - run_forward = False - - if run_forward: - # Run the forward benchmark. - time_tensorflow_run(sess, last_layer, "Forward") - - if run_forward_backward: - with tf.control_dependencies([apply_gradient_op, variables_averages_op]): - train_op = tf.no_op(name='train') - time_tensorflow_run(sess, [train_op, objective], "Forward-backward") + global parameters + with tf.Graph().as_default(): + # Generate some dummy images. + image_size = 32 + # Note that our padding definition is slightly different the cuda-convnet. + # In order to force the model to start with the same activations sizes, + # we add 3 to the image_size and employ VALID padding above. + if FLAGS.data_format == 'NCHW': + image_shape = [FLAGS.batch_size, 3, image_size, image_size] + else: + image_shape = [FLAGS.batch_size, image_size, image_size, 3] + + images = tf.get_variable( + 'image', + image_shape, + initializer=tf.truncated_normal_initializer( + stddev=0.1, dtype=tf.float32), + dtype=tf.float32, + trainable=False) + + labels = tf.get_variable( + 'label', [FLAGS.batch_size], + initializer=tf.constant_initializer(1), + dtype=tf.int32, + trainable=False) + + # Build a Graph that computes the logits predictions from the + # inference model. + last_layer = inference(images) + + objective = loss(last_layer, labels) + + # Compute gradients. + opt = tf.train.MomentumOptimizer(0.001, 0.9) + grads = opt.compute_gradients(objective) + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer( + 0.0, dtype=tf.float32), + trainable=False, + dtype=tf.float32) + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Track the moving averages of all trainable variables. 
+ variable_averages = tf.train.ExponentialMovingAverage(0.9, global_step) + variables_averages_op = variable_averages.apply(tf.trainable_variables( + )) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. + sess = tf.Session(config=tf.ConfigProto( + allow_soft_placement=True, + log_device_placement=FLAGS.log_device_placement)) + sess.run(init) + + run_forward = True + run_forward_backward = True + if FLAGS.forward_only and FLAGS.forward_backward_only: + raise ValueError("Cannot specify --forward_only and " + "--forward_backward_only at the same time.") + if FLAGS.forward_only: + run_forward_backward = False + elif FLAGS.forward_backward_only: + run_forward = False + + if run_forward: + # Run the forward benchmark. + time_tensorflow_run(sess, last_layer, "Forward") + + if run_forward_backward: + with tf.control_dependencies( + [apply_gradient_op, variables_averages_op]): + train_op = tf.no_op(name='train') + time_tensorflow_run(sess, [train_op, objective], "Forward-backward") def main(_): - run_benchmark() + run_benchmark() if __name__ == '__main__': - tf.app.run() + tf.app.run() diff --git a/benchmark/tensorflow/rnn/README.md b/benchmark/tensorflow/rnn/README.md index b5314d544608480a732f7d0d94ec69c53b4c8049..da8e7b8b07969051cbec3ac6a713eaf7fc738a55 100644 --- a/benchmark/tensorflow/rnn/README.md +++ b/benchmark/tensorflow/rnn/README.md @@ -1,5 +1,5 @@ You also should install tflearn: ```bash -pip install tflearn +pip install -r requirements.txt ``` diff --git a/benchmark/tensorflow/rnn/reader.py b/benchmark/tensorflow/rnn/reader.py index 0d8308046ed6543b218f604480e9630e6b4b1091..f538329a15ea9ad9293c97c94340989e2c421eb2 100755 --- a/benchmark/tensorflow/rnn/reader.py +++ b/benchmark/tensorflow/rnn/reader.py @@ -8,14 +8,13 @@ import tflearn from tflearn.data_utils import to_categorical, pad_sequences from tflearn.datasets import imdb - FLAGS = tf.app.flags.FLAGS + class DataSet(object): def __init__(self, data, labels): assert data.shape[0] == labels.shape[0], ( - 'data.shape: %s labels.shape: %s' % (data.shape, - labels.shape)) + 'data.shape: %s labels.shape: %s' % (data.shape, labels.shape)) self._num_examples = data.shape[0] self._data = data @@ -64,8 +63,11 @@ class DataSet(object): def create_datasets(file_path, vocab_size=30000, val_fraction=0.0): # IMDB Dataset loading - train, test, _ = imdb.load_data(path=file_path, n_words=vocab_size, - valid_portion=val_fraction, sort_by_len=False) + train, test, _ = imdb.load_data( + path=file_path, + n_words=vocab_size, + valid_portion=val_fraction, + sort_by_len=False) trainX, trainY = train testX, testY = test diff --git a/benchmark/tensorflow/rnn/requirements.txt b/benchmark/tensorflow/rnn/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4242e7d24fbbeb18e8fb9a760d76fa6d5363b03f --- /dev/null +++ b/benchmark/tensorflow/rnn/requirements.txt @@ -0,0 +1 @@ +tflearn diff --git a/benchmark/tensorflow/rnn/rnn.py b/benchmark/tensorflow/rnn/rnn.py index 5377187f39141be6b9884d8a75c1c1772710c525..f288083e13656563b511980553245142efec4e65 100755 --- a/benchmark/tensorflow/rnn/rnn.py +++ b/benchmark/tensorflow/rnn/rnn.py @@ -11,27 +11,22 @@ from tensorflow.python.ops import rnn FLAGS = tf.app.flags.FLAGS -tf.app.flags.DEFINE_integer('batch_size', 128, - """Batch size.""") -tf.app.flags.DEFINE_integer('num_batches', 100, - """Number of batches to run.""") -tf.app.flags.DEFINE_integer('num_layers', 1, - """Number of batches to run.""") 
-tf.app.flags.DEFINE_integer('max_len', 100, - """Number of batches to run.""") +tf.app.flags.DEFINE_integer('batch_size', 128, """Batch size.""") +tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""") +tf.app.flags.DEFINE_integer('num_layers', 1, """Number of batches to run.""") +tf.app.flags.DEFINE_integer('max_len', 100, """Number of batches to run.""") tf.app.flags.DEFINE_boolean('forward_only', False, """Only run the forward pass.""") tf.app.flags.DEFINE_boolean('forward_backward_only', False, """Only run the forward-forward pass.""") -tf.app.flags.DEFINE_integer('hidden_size', 128, - """Number of batches to run.""") -tf.app.flags.DEFINE_integer('emb_size', 128, - """Number of batches to run.""") +tf.app.flags.DEFINE_integer('hidden_size', 128, """Number of batches to run.""") +tf.app.flags.DEFINE_integer('emb_size', 128, """Number of batches to run.""") tf.app.flags.DEFINE_boolean('log_device_placement', False, """Whether to log device placement.""") -VOCAB_SIZE=30000 -NUM_CLASS=2 +VOCAB_SIZE = 30000 +NUM_CLASS = 2 + def get_feed_dict(x_data, y_data=None): feed_dict = {} @@ -44,6 +39,7 @@ def get_feed_dict(x_data, y_data=None): return feed_dict + def get_incoming_shape(incoming): """ Returns the incoming data shape """ if isinstance(incoming, tf.Tensor): @@ -56,53 +52,75 @@ def get_incoming_shape(incoming): # Note input * W is done in LSTMCell, # which is different from PaddlePaddle -def single_lstm(name, incoming, n_units, use_peepholes=True, - return_seq=False, return_state=False): - with tf.name_scope(name) as scope: - cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes) - output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32) - out = output if return_seq else output[-1] - return (out, _cell_state) if return_state else out - -def lstm(name, incoming, n_units, use_peepholes=True, - return_seq=False, return_state=False, num_layers=1): - with tf.name_scope(name) as scope: - lstm_cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes) - cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers) - initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32) - if not isinstance(incoming, list): - # if the input is embeding, the Tensor shape : [None, time_step, emb_size] - incoming = [tf.squeeze(input_, [1]) - for input_ in tf.split(1, FLAGS.max_len, incoming)] - outputs, state = tf.nn.rnn(cell, incoming, initial_state=initial_state, - dtype=tf.float32) - out = outputs if return_seq else outputs[-1] - return (out, _cell_state) if return_state else out +def single_lstm(name, + incoming, + n_units, + use_peepholes=True, + return_seq=False, + return_state=False): + with tf.name_scope(name) as scope: + cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes) + output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32) + out = output if return_seq else output[-1] + return (out, _cell_state) if return_state else out + + +def lstm(name, + incoming, + n_units, + use_peepholes=True, + return_seq=False, + return_state=False, + num_layers=1): + with tf.name_scope(name) as scope: + lstm_cell = tf.nn.rnn_cell.LSTMCell( + n_units, use_peepholes=use_peepholes) + cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers) + initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32) + if not isinstance(incoming, list): + # if the input is embeding, the Tensor shape : [None, time_step, emb_size] + incoming = [ + tf.squeeze(input_, [1]) + for input_ in tf.split(1, FLAGS.max_len, incoming) + ] + outputs, 
+                                   incoming,
+                                   initial_state=initial_state,
+                                   dtype=tf.float32)
+        out = outputs if return_seq else outputs[-1]
+        return (out, state) if return_state else out


 def embedding(name, incoming, vocab_size, emb_size):
-  with tf.name_scope(name) as scope:
-    #with tf.device("/cpu:0"):
-    embedding = tf.get_variable(
-        name+'_emb', [vocab_size, emb_size], dtype=tf.float32)
-    out = tf.nn.embedding_lookup(embedding, incoming)
-    return out
+    with tf.name_scope(name) as scope:
+        #with tf.device("/cpu:0"):
+        embedding = tf.get_variable(
+            name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
+        out = tf.nn.embedding_lookup(embedding, incoming)
+        return out
+

 def fc(name, inpOp, nIn, nOut, act=True):
     with tf.name_scope(name) as scope:
-        kernel = tf.get_variable(name + '_w', [nIn, nOut],
-                 initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32),
+        kernel = tf.get_variable(
+            name + '_w', [nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
             dtype=tf.float32)
-        biases = tf.get_variable(name + '_b', [nOut],
-                 initializer=tf.constant_initializer(value=0.0, dtype=tf.float32),
-                 dtype=tf.float32,trainable=True)
+        biases = tf.get_variable(
+            name + '_b', [nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32,
+            trainable=True)

         net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
             tf.matmul(inpOp, kernel) + biases

         return net

+
 def inference(seq):
     net = embedding('emb', seq, VOCAB_SIZE, FLAGS.emb_size)
     print "emb:", get_incoming_shape(net)
@@ -111,91 +129,95 @@ def inference(seq):
     net = fc('fc1', net, FLAGS.hidden_size, 2)
     return net

+
 def loss(logits, labels):
     # one label index for one sample
     labels = tf.cast(labels, tf.float32)
     cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
-            logits, labels, name='cross_entropy_per_example')
+        logits, labels, name='cross_entropy_per_example')
     cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
     tf.add_to_collection('losses', cross_entropy_mean)
     return tf.add_n(tf.get_collection('losses'), name='total_loss')


 def time_tensorflow_run(session, target, x_input, y_input, info_string):
-  num_steps_burn_in = 50
-  total_duration = 0.0
-  total_duration_squared = 0.0
-  if not isinstance(target, list):
-    target = [target]
-  target_op = tf.group(*target)
-  train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
-  for i in xrange(FLAGS.num_batches + num_steps_burn_in):
-    start_time = time.time()
-    data, label = train_dataset.next_batch(FLAGS.batch_size)
-    _ = session.run(target_op, feed_dict={x_input:data, y_input:label})
-    duration = time.time() - start_time
-    if i > num_steps_burn_in:
-      if not i % 10:
-        print ('%s: step %d, duration = %.3f' %
-               (datetime.now(), i - num_steps_burn_in, duration))
-      total_duration += duration
-      total_duration_squared += duration * duration
-  mn = total_duration / FLAGS.num_batches
-  vr = total_duration_squared / FLAGS.num_batches - mn * mn
-  sd = math.sqrt(vr)
-  print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
-         (datetime.now(), info_string, FLAGS.num_batches, mn, sd))
+    num_steps_burn_in = 50
+    total_duration = 0.0
+    total_duration_squared = 0.0
+    if not isinstance(target, list):
+        target = [target]
+    target_op = tf.group(*target)
+    train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
+    for i in xrange(FLAGS.num_batches + num_steps_burn_in):
+        start_time = time.time()
+        data, label = train_dataset.next_batch(FLAGS.batch_size)
+        _ = session.run(target_op,
+                        feed_dict={x_input: data, y_input: label})
+        duration = time.time() - start_time
+        if i > num_steps_burn_in:
+            if not i % 10:
+                print('%s: step %d, duration = %.3f' %
+                      (datetime.now(), i - num_steps_burn_in, duration))
+            total_duration += duration
+            total_duration_squared += duration * duration
+    mn = total_duration / FLAGS.num_batches
+    vr = total_duration_squared / FLAGS.num_batches - mn * mn
+    sd = math.sqrt(vr)
+    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' %
+          (datetime.now(), info_string, FLAGS.num_batches, mn, sd))


 def run_benchmark():
-  with tf.Graph().as_default():
-    global_step=0
-    with tf.device('/cpu:0'):
-      global_step = tf.Variable(0, trainable=False)
-    with tf.device('/gpu:0'):
-      #x_input = tf.placeholder(tf.int32, [None, FLAGS.max_len], name="x_input")
-      #y_input = tf.placeholder(tf.int32, [None, NUM_CLASS], name="y_input")
-      x_input = tf.placeholder(tf.int32, [FLAGS.batch_size, FLAGS.max_len], name="x_input")
-      y_input = tf.placeholder(tf.int32, [FLAGS.batch_size, NUM_CLASS], name="y_input")
-      # Generate some dummy sequnce.
-
-
-    last_layer = inference(x_input)
-
-    objective = loss(last_layer, y_input)
-    opt = tf.train.AdamOptimizer(0.001)
-    grads = opt.compute_gradients(objective)
-    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
-
-    init = tf.initialize_all_variables()
-    sess = tf.Session(config=tf.ConfigProto(
-        allow_soft_placement=True,
-        log_device_placement=FLAGS.log_device_placement))
-    sess.run(init)
-
-    run_forward = True
-    run_forward_backward = True
-    if FLAGS.forward_only and FLAGS.forward_backward_only:
-      raise ValueError("Cannot specify --forward_only and "
-                       "--forward_backward_only at the same time.")
-    if FLAGS.forward_only:
-      run_forward_backward = False
-    elif FLAGS.forward_backward_only:
-      run_forward = False
-
-    if run_forward:
-      time_tensorflow_run(sess, last_layer, x_input, y_input, "Forward")
-
-    if run_forward_backward:
-      with tf.control_dependencies([apply_gradient_op]):
-        train_op = tf.no_op(name='train')
-      time_tensorflow_run(sess, [train_op, objective], x_input, y_input, "Forward-backward")
+    with tf.Graph().as_default():
+        global_step = 0
+        with tf.device('/cpu:0'):
+            global_step = tf.Variable(0, trainable=False)
+        with tf.device('/gpu:0'):
+            #x_input = tf.placeholder(tf.int32, [None, FLAGS.max_len], name="x_input")
+            #y_input = tf.placeholder(tf.int32, [None, NUM_CLASS], name="y_input")
+            x_input = tf.placeholder(
+                tf.int32, [FLAGS.batch_size, FLAGS.max_len], name="x_input")
+            y_input = tf.placeholder(
+                tf.int32, [FLAGS.batch_size, NUM_CLASS], name="y_input")
+            # Generate some dummy sequence.
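As a rough sketch of the dummy-sequence generation that the comment above leaves empty (not part of the patch: NumPy, the helper name, and the one-hot label layout are assumptions inferred from the two placeholder shapes):

```python
import numpy as np

def make_dummy_batch(batch_size=128, max_len=100, vocab_size=30000, num_class=2):
    # Random token ids in [0, vocab_size) for x_input ([batch_size, max_len], int32).
    x = np.random.randint(0, vocab_size, size=(batch_size, max_len)).astype('int32')
    # One-hot labels for y_input ([batch_size, num_class], int32).
    y = np.zeros((batch_size, num_class), dtype='int32')
    y[np.arange(batch_size), np.random.randint(0, num_class, batch_size)] = 1
    return x, y
```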
+
+        last_layer = inference(x_input)
+
+        objective = loss(last_layer, y_input)
+        opt = tf.train.AdamOptimizer(0.001)
+        grads = opt.compute_gradients(objective)
+        apply_gradient_op = opt.apply_gradients(
+            grads, global_step=global_step)
+
+        init = tf.initialize_all_variables()
+        sess = tf.Session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            log_device_placement=FLAGS.log_device_placement))
+        sess.run(init)
+
+        run_forward = True
+        run_forward_backward = True
+        if FLAGS.forward_only and FLAGS.forward_backward_only:
+            raise ValueError("Cannot specify --forward_only and "
+                             "--forward_backward_only at the same time.")
+        if FLAGS.forward_only:
+            run_forward_backward = False
+        elif FLAGS.forward_backward_only:
+            run_forward = False
+
+        if run_forward:
+            time_tensorflow_run(sess, last_layer, x_input, y_input,
+                                "Forward")
+
+        if run_forward_backward:
+            with tf.control_dependencies([apply_gradient_op]):
+                train_op = tf.no_op(name='train')
+            time_tensorflow_run(sess, [train_op, objective], x_input,
+                                y_input, "Forward-backward")


 def main(_):
-  run_benchmark()
+    run_benchmark()


 if __name__ == '__main__':
-  tf.app.run()
-
+    tf.app.run()
diff --git a/benchmark/tensorflow/rnn/rnn_multi_gpu.py b/benchmark/tensorflow/rnn/rnn_multi_gpu.py
index 97ba5d4c29672afe2756850430351b2abdeb20ca..eabee4fa8fe6325212ace1c11be4862cd2720b08 100755
--- a/benchmark/tensorflow/rnn/rnn_multi_gpu.py
+++ b/benchmark/tensorflow/rnn/rnn_multi_gpu.py
@@ -12,35 +12,28 @@ from tensorflow.python.ops import rnn

 FLAGS = tf.app.flags.FLAGS

-tf.app.flags.DEFINE_integer('batch_size', 64,
-                            """Batch size.""")
-tf.app.flags.DEFINE_integer('num_batches', 100,
-                            """Number of batches to run.""")
-tf.app.flags.DEFINE_integer('num_layers', 1,
-                            """Number of batches to run.""")
-tf.app.flags.DEFINE_integer('max_len', 100,
-                            """Number of batches to run.""")
-tf.app.flags.DEFINE_integer('hidden_size', 128,
-                            """Number of batches to run.""")
-tf.app.flags.DEFINE_integer('emb_size', 64,
-                            """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('batch_size', 64, """Batch size.""")
+tf.app.flags.DEFINE_integer('num_batches', 100, """Number of batches to run.""")
+tf.app.flags.DEFINE_integer('num_layers', 1, """Number of LSTM layers.""")
+tf.app.flags.DEFINE_integer('max_len', 100, """Maximum sequence length.""")
+tf.app.flags.DEFINE_integer('hidden_size', 128, """Hidden layer size.""")
+tf.app.flags.DEFINE_integer('emb_size', 64, """Embedding size.""")
 tf.app.flags.DEFINE_boolean('log_device_placement', False,
                             """Whether to log device placement.""")
-tf.app.flags.DEFINE_integer('num_gpus', 4,
-                            """How many GPUs to use.""")
+tf.app.flags.DEFINE_integer('num_gpus', 4, """How many GPUs to use.""")

-VOCAB_SIZE=30000
-NUM_CLASS=2
+VOCAB_SIZE = 30000
+NUM_CLASS = 2

-
-NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN=50000
-NUM_EPOCHS_PER_DECAY=50
-INITIAL_LEARNING_RATE = 0.1
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
+NUM_EPOCHS_PER_DECAY = 50
+INITIAL_LEARNING_RATE = 0.1
 LEARNING_RATE_DECAY_FACTOR = 0.1
 TOWER_NAME = 'tower'

 train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)

+
 def get_incoming_shape(incoming):
     """ Returns the incoming data shape """
     if isinstance(incoming, tf.Tensor):
@@ -53,49 +46,68 @@ def get_incoming_shape(incoming):

 # Note input * W is done in LSTMCell,
 # which is different from PaddlePaddle
-def single_lstm(name, incoming, n_units, use_peepholes=True,
-                return_seq=False, return_state=False):
-  with tf.name_scope(name) as scope:
-    cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
-    output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
-    out = output if return_seq else output[-1]
-    return (out, _cell_state) if return_state else out
-
-
-def lstm(name, incoming, n_units, use_peepholes=True,
-         return_seq=False, return_state=False, num_layers=1):
-  with tf.name_scope(name) as scope:
-    lstm_cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
-    cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
-    initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
-    if not isinstance(incoming, list):
-      # if the input is embeding, the Tensor shape : [None, time_step, emb_size]
-      incoming = [tf.squeeze(input_, [1])
-                  for input_ in tf.split(1, FLAGS.max_len, incoming)]
-    outputs, state = tf.nn.rnn(cell, incoming, initial_state=initial_state,
-                               dtype=tf.float32)
-    out = outputs if return_seq else outputs[-1]
-    return (out, _cell_state) if return_state else out
+def single_lstm(name,
+                incoming,
+                n_units,
+                use_peepholes=True,
+                return_seq=False,
+                return_state=False):
+    with tf.name_scope(name) as scope:
+        cell = tf.nn.rnn_cell.LSTMCell(n_units, use_peepholes=use_peepholes)
+        output, _cell_state = rnn.rnn(cell, incoming, dtype=tf.float32)
+        out = output if return_seq else output[-1]
+        return (out, _cell_state) if return_state else out
+
+
+def lstm(name,
+         incoming,
+         n_units,
+         use_peepholes=True,
+         return_seq=False,
+         return_state=False,
+         num_layers=1):
+    with tf.name_scope(name) as scope:
+        lstm_cell = tf.nn.rnn_cell.LSTMCell(
+            n_units, use_peepholes=use_peepholes)
+        cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_layers)
+        initial_state = cell.zero_state(FLAGS.batch_size, dtype=tf.float32)
+        if not isinstance(incoming, list):
+            # if the input is an embedding, the tensor shape is [None, time_step, emb_size]
+            incoming = [
+                tf.squeeze(input_, [1])
+                for input_ in tf.split(1, FLAGS.max_len, incoming)
+            ]
+        outputs, state = tf.nn.rnn(cell,
+                                   incoming,
+                                   initial_state=initial_state,
+                                   dtype=tf.float32)
+        out = outputs if return_seq else outputs[-1]
+        return (out, state) if return_state else out


 def embedding(name, incoming, vocab_size, emb_size):
-  with tf.name_scope(name) as scope:
-    #with tf.device("/cpu:0"):
-    embedding = tf.get_variable(
-        name+'_emb', [vocab_size, emb_size], dtype=tf.float32)
-    out = tf.nn.embedding_lookup(embedding, incoming)
-    return out
+    with tf.name_scope(name) as scope:
+        #with tf.device("/cpu:0"):
+        embedding = tf.get_variable(
+            name + '_emb', [vocab_size, emb_size], dtype=tf.float32)
+        out = tf.nn.embedding_lookup(embedding, incoming)
+        return out


 def fc(name, inpOp, nIn, nOut, act=True):
     with tf.name_scope(name) as scope:
-        kernel = tf.get_variable(name + '_w', [nIn, nOut],
-                 initializer=tf.truncated_normal_initializer(stddev=0.01, dtype=tf.float32),
+        kernel = tf.get_variable(
+            name + '_w', [nIn, nOut],
+            initializer=tf.truncated_normal_initializer(
+                stddev=0.01, dtype=tf.float32),
             dtype=tf.float32)
-        biases = tf.get_variable(name + '_b', [nOut],
-                 initializer=tf.constant_initializer(value=0.0, dtype=tf.float32),
-                 dtype=tf.float32,trainable=True)
+        biases = tf.get_variable(
+            name + '_b', [nOut],
+            initializer=tf.constant_initializer(
+                value=0.0, dtype=tf.float32),
+            dtype=tf.float32,
+            trainable=True)

         net = tf.nn.relu_layer(inpOp, kernel, biases, name=name) if act else \
             tf.matmul(inpOp, kernel) + biases

@@ -119,7 +131,7 @@ def loss(logits, labels):
     #         logits, labels, name='cross_entropy_per_example')
     labels = tf.cast(labels, tf.float32)
     cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
-
logits, labels, name='cross_entropy_per_example') + logits, labels, name='cross_entropy_per_example') cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy') tf.add_to_collection('losses', cross_entropy_mean) return tf.add_n(tf.get_collection('losses'), name='total_loss') @@ -142,7 +154,7 @@ def tower_loss(scope): # assemble the total_loss using a custom function below. #_ = loss(last_layer, label) _ = loss(last_layer, label) - + # Assemble all of the losses for the current tower only. losses = tf.get_collection('losses', scope) @@ -161,7 +173,7 @@ def tower_loss(scope): loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name) # Name each loss as '(raw)' and name the moving average version of the loss # as the original loss name. - tf.scalar_summary(loss_name +' (raw)', l) + tf.scalar_summary(loss_name + ' (raw)', l) #tf.scalar_summary(loss_name, loss_averages.average(l)) with tf.control_dependencies([loss_averages_op]): @@ -170,7 +182,7 @@ def tower_loss(scope): def average_gradients(tower_grads): - """Calculate the average gradient for each shared variable across all towers. + """Calculate the average gradient for each shared variable across all towers. Note that this function provides a synchronization point across all towers. Args: tower_grads: List of lists of (gradient, variable) tuples. The outer list @@ -180,127 +192,131 @@ def average_gradients(tower_grads): List of pairs of (gradient, variable) where the gradient has been averaged across all towers. """ - average_grads = [] - for grad_and_vars in zip(*tower_grads): - # Note that each grad_and_vars looks like the following: - # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) - grads = [] - for g, _ in grad_and_vars: - # Add 0 dimension to the gradients to represent the tower. - expanded_g = tf.expand_dims(g, 0) - - # Append on a 'tower' dimension which we will average over below. - grads.append(expanded_g) - - # Average over the 'tower' dimension. - grad = tf.concat(0, grads) - grad = tf.reduce_mean(grad, 0) - - # Keep in mind that the Variables are redundant because they are shared - # across towers. So .. we will just return the first tower's pointer to - # the Variable. - v = grad_and_vars[0][1] - grad_and_var = (grad, v) - average_grads.append(grad_and_var) - return average_grads + average_grads = [] + for grad_and_vars in zip(*tower_grads): + # Note that each grad_and_vars looks like the following: + # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) + grads = [] + for g, _ in grad_and_vars: + # Add 0 dimension to the gradients to represent the tower. + expanded_g = tf.expand_dims(g, 0) + + # Append on a 'tower' dimension which we will average over below. + grads.append(expanded_g) + + # Average over the 'tower' dimension. + grad = tf.concat(0, grads) + grad = tf.reduce_mean(grad, 0) + + # Keep in mind that the Variables are redundant because they are shared + # across towers. So .. we will just return the first tower's pointer to + # the Variable. 
+        v = grad_and_vars[0][1]
+        grad_and_var = (grad, v)
+        average_grads.append(grad_and_var)
+    return average_grads
+

 def time_tensorflow_run(session, target):
     num_steps_burn_in = 80
     total_duration = 0.0
     total_duration_squared = 0.0
     for i in xrange(FLAGS.num_batches + num_steps_burn_in):
-    start_time = time.time()
-    _ = session.run(target, feed_dict={x_input:data, y_input:label})
-    _, loss_value = session.run(target)
-    duration = time.time() - start_time
-    if i > num_steps_burn_in:
-      if not i % 10:
-        num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
-        examples_per_sec = num_examples_per_step / duration
-        # sec_per_batch = duration / FLAGS.num_gpus
-        sec_per_batch = duration
-
-        format_str = ('%s: step %d, loss= %.2f (%.1f examples/sec; %.3f '
-                      'sec/batch batch_size= %d)')
-        print (format_str %
-               (datetime.now(), i - num_steps_burn_in,
-                loss_value, duration, sec_per_batch, num_examples_per_step))
-
-      total_duration += duration
-      total_duration_squared += duration * duration
+        start_time = time.time()
+        _, loss_value = session.run(target)
+        duration = time.time() - start_time
+        if i > num_steps_burn_in:
+            if not i % 10:
+                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
+                examples_per_sec = num_examples_per_step / duration
+                # sec_per_batch = duration / FLAGS.num_gpus
+                sec_per_batch = duration
+
+                format_str = (
+                    '%s: step %d, loss= %.2f (%.1f examples/sec; %.3f '
+                    'sec/batch batch_size= %d)')
+                print(format_str %
+                      (datetime.now(), i - num_steps_burn_in, loss_value,
+                       examples_per_sec, sec_per_batch, num_examples_per_step))
+
+            total_duration += duration
+            total_duration_squared += duration * duration
     mn = total_duration / FLAGS.num_batches
     vr = total_duration_squared / FLAGS.num_batches - mn * mn
     sd = math.sqrt(vr)
-    print ('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
+    print('%s: FwdBwd across %d steps, %.3f +/- %.3f sec / batch' %
           (datetime.now(), FLAGS.num_batches, mn, sd))

+
 def run_benchmark():
-  with tf.Graph().as_default(), tf.device('/cpu:0'):
-    # Create a variable to count the number of train() calls. This equals the
-    # number of batches processed * FLAGS.num_gpus.
-    global_step = tf.get_variable(
-        'global_step', [],
-        initializer=tf.constant_initializer(0), trainable=False)
-
-    # Calculate the learning rate schedule.
-    num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
-                             FLAGS.batch_size)
-    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
-
-    # Create an optimizer that performs gradient descent.
-    opt = tf.train.AdamOptimizer(0.001)
-
-    #train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE)
-
-    # Calculate the gradients for each model tower.
-    tower_grads = []
-    for i in xrange(FLAGS.num_gpus):
-      with tf.device('/gpu:%d' % i):
-        with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
-          # Calculate the loss for one tower of the model. This function
-          # constructs the entire model but shares the variables across
-          # all towers.
-          loss = tower_loss(scope)
-
-          # Reuse variables for the next tower.
-          tf.get_variable_scope().reuse_variables()
-
-          # Retain the summaries from the final tower.
-          # summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
-
-          # Calculate the gradients for the batch of data on this tower.
-          grads = opt.compute_gradients(loss)
-
-          # Keep track of the gradients across all towers.
-          tower_grads.append(grads)
-
-    # We must calculate the mean of each gradient. Note that this is the
-    # synchronization point across all towers.
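A minimal NumPy sketch of the reduction that `average_gradients` above performs: each variable's per-tower gradients are stacked on a new leading axis and averaged, and the (shared) variable handle is taken from the first tower. The tower count and values below are made up for illustration:

```python
import numpy as np

# Two towers, each yielding one (gradient, variable) pair for the same variable 'w'.
tower_grads = [
    [(np.array([1.0, 2.0]), 'w')],  # tower 0
    [(np.array([3.0, 4.0]), 'w')],  # tower 1
]
for grad_and_vars in zip(*tower_grads):
    stacked = np.stack([g for g, _ in grad_and_vars])  # shape: (num_towers, ...)
    avg = stacked.mean(axis=0)                         # elementwise mean over towers
    var = grad_and_vars[0][1]                          # variable is shared across towers
    print(var, avg)                                    # prints: w [2. 3.]
```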
- grads = average_gradients(tower_grads) - - # Apply the gradients to adjust the shared variables. - apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) - - # Group all updates to into a single train op. - train_op = tf.group(apply_gradient_op) - - # Build an initialization operation. - init = tf.initialize_all_variables() - - # Start running operations on the Graph. allow_soft_placement must be set to - # True to build towers on GPU, as some of the ops do not have GPU - # implementations. - sess = tf.Session(config=tf.ConfigProto( - allow_soft_placement=True, - log_device_placement=FLAGS.log_device_placement)) - sess.run(init) - time_tensorflow_run(sess, [train_op, loss]) + with tf.Graph().as_default(), tf.device('/cpu:0'): + # Create a variable to count the number of train() calls. This equals the + # number of batches processed * FLAGS.num_gpus. + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), + trainable=False) + + # Calculate the learning rate schedule. + num_batches_per_epoch = (NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / + FLAGS.batch_size) + decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY) + + # Create an optimizer that performs gradient descent. + opt = tf.train.AdamOptimizer(0.001) + + #train_dataset = reader.create_datasets("imdb.pkl", VOCAB_SIZE) + + # Calculate the gradients for each model tower. + tower_grads = [] + for i in xrange(FLAGS.num_gpus): + with tf.device('/gpu:%d' % i): + with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope: + # Calculate the loss for one tower of the model. This function + # constructs the entire model but shares the variables across + # all towers. + loss = tower_loss(scope) + + # Reuse variables for the next tower. + tf.get_variable_scope().reuse_variables() + + # Retain the summaries from the final tower. + # summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) + + # Calculate the gradients for the batch of data on this tower. + grads = opt.compute_gradients(loss) + + # Keep track of the gradients across all towers. + tower_grads.append(grads) + + # We must calculate the mean of each gradient. Note that this is the + # synchronization point across all towers. + grads = average_gradients(tower_grads) + + # Apply the gradients to adjust the shared variables. + apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) + + # Group all updates to into a single train op. + train_op = tf.group(apply_gradient_op) + + # Build an initialization operation. + init = tf.initialize_all_variables() + + # Start running operations on the Graph. allow_soft_placement must be set to + # True to build towers on GPU, as some of the ops do not have GPU + # implementations. 
+        sess = tf.Session(config=tf.ConfigProto(
+            allow_soft_placement=True,
+            log_device_placement=FLAGS.log_device_placement))
+        sess.run(init)
+        time_tensorflow_run(sess, [train_op, loss])


 def main(_):
-  run_benchmark()
+    run_benchmark()


 if __name__ == '__main__':
-  tf.app.run()
+    tf.app.run()
diff --git a/cmake/FindAVX.cmake b/cmake/FindAVX.cmake
index f6103c6e667e8a8f6b8998d8eb467235fb49cb19..d380c996dfa95f0caa2b9cd9daa0ac9141e51fe0 100644
--- a/cmake/FindAVX.cmake
+++ b/cmake/FindAVX.cmake
@@ -3,36 +3,55 @@

 INCLUDE(CheckCXXSourceRuns)

-SET(FIND_AVX_10)
-SET(FIND_AVX_20)
-SET(AVX_FLAGS)
-SET(AVX_FOUND)
-
-# Check AVX 2
-SET(CMAKE_REQUIRED_FLAGS)
 IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-  SET(CMAKE_REQUIRED_FLAGS "-mavx2")
-ELSEIF(MSVC AND NOT CMAKE_CL_64) # reserve for WINDOWS
-  SET(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
+  set(MMX_FLAG "-mmmx")
+  set(SSE2_FLAG "-msse2")
+  set(SSE3_FLAG "-msse3")
+  SET(AVX_FLAG "-mavx")
+  SET(AVX2_FLAG "-mavx2")
+ELSEIF(MSVC)
+  set(MMX_FLAG "/arch:MMX")
+  set(SSE2_FLAG "/arch:SSE2")
+  set(SSE3_FLAG "/arch:SSE3")
+  SET(AVX_FLAG "/arch:AVX")
+  SET(AVX2_FLAG "/arch:AVX2")
 ENDIF()

+# Check MMX
+set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
 CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
+#include <mmintrin.h>
 int main()
 {
-    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-    __m256i result = _mm256_abs_epi32 (a);
+    _mm_setzero_si64();
     return 0;
-}" FIND_AVX_20)
+}" MMX_FOUND)

-# Check AVX
-SET(CMAKE_REQUIRED_FLAGS)
-IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-  SET(CMAKE_REQUIRED_FLAGS "-mavx")
-ELSEIF(MSVC AND NOT CMAKE_CL_64)
-  SET(CMAKE_REQUIRED_FLAGS "/arch:AVX")
-endif()
+# Check SSE2
+set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
+CHECK_CXX_SOURCE_RUNS("
+#include <emmintrin.h>
+int main()
+{
+    _mm_setzero_si128();
+    return 0;
+}" SSE2_FOUND)

+# Check SSE3
+set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
+CHECK_CXX_SOURCE_RUNS("
+#include <pmmintrin.h>
+int main()
+{
+    __m128d a = _mm_set1_pd(6.28);
+    __m128d b = _mm_set1_pd(3.14);
+    __m128d result = _mm_addsub_pd(a, b);
+    result = _mm_movedup_pd(result);
+    return 0;
+}" SSE3_FOUND)
+
+# Check AVX
+set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
@@ -41,25 +60,17 @@ int main()
 {
   __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
   __m256 result = _mm256_add_ps (a, b);
   return 0;
-}" FIND_AVX_10)
-
-IF(${FIND_AVX_20})
-  IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    SET(AVX_FLAGS "${AVX_FLAGS} -mavx2")
-  ELSEIF(MSVC)
-    SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX2")
-  ENDIF()
-ENDIF()
+}" AVX_FOUND)

-IF(${FIND_AVX_10})
-  IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-    SET(AVX_FLAGS "${AVX_FLAGS} -mavx")
-  ELSEIF(MSVC)
-    SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
-  ENDIF()
-ENDIF()
+# Check AVX 2
+set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+    __m256i result = _mm256_abs_epi32 (a);
+    return 0;
+}" AVX2_FOUND)

-IF(${FIND_AVX_10})
-  SET(AVX_FOUND TRUE)
-  MESSAGE(STATUS "Find CPU supports ${AVX_FLAGS}.")
-ENDIF()
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 57c32a54cd727e3acb181eeb19f811fab4dc82fd..685334c6585060c0344e552c6f3fda2c7324de03 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -1,4 +1,4 @@
-# Find the CBlas libraries
+# Find the CBlas and LAPACK libraries
 #
 # It will search MKL, atlas, OpenBlas, reference-cblas in order.
 #
@@ -19,6 +19,8 @@ set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")

 find_path(MKL_INCLUDE_DIR mkl.h PATHS
           ${MKL_ROOT}/include)
+find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
+          ${MKL_ROOT}/include)
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
              ${MKL_ROOT}/lib
              ${MKL_ROOT}/lib/intel64)
@@ -37,6 +39,7 @@ if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
       ${MKL_SEQUENTIAL_LIB}
       ${MKL_CORE_LIB})
   add_definitions(-DPADDLE_USE_MKL)
+  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
   return() # return file.
 endif()

@@ -55,15 +58,19 @@ set(ATLAS_LIB_SEARCH_PATHS
 )

 find_path(ATLAS_INC_DIR NAMES cblas.h
           PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
+find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
+          PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
 find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
              PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_LIB NAMES atlas libatlas.so.3
+find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
              PATHS ${ATLAS_LIB_SEARCH_PATHS})

 if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
   set(CBLAS_PROVIDER ATLAS)
-  set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
+  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
   set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
+  add_definitions(-DPADDLE_USE_ATLAS)
+  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
   return()
 endif()

@@ -83,6 +90,8 @@ set(OPENBLAS_LIB_SEARCH_PATHS

 find_path(OPENBLAS_INC_DIR NAMES cblas.h
           PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
+          PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
 find_library(OPENBLAS_LIB NAMES openblas
              PATHS ${OPENBLAS_LIB_SEARCH_PATHS})

@@ -90,6 +99,7 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
   set(CBLAS_PROVIDER OPENBLAS)
   set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
   set(CBLAS_LIBS ${OPENBLAS_LIB})
+  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
   return()
 endif()

diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9be7643819efdde3f42e4d39b2849ecc17e0d9fb
--- /dev/null
+++ b/cmake/coveralls.cmake
@@ -0,0 +1,103 @@
+# CMake script for code coverage.
+# If _COVERALLS_UPLOAD is ON, it will upload json files to coveralls.io automatically.
+
+# Param _COVERAGE_SRCS A list of coverage source files.
+# Param _COVERALLS_UPLOAD Upload the result to coveralls.
+# Param _CMAKE_SCRIPT_PATH CMake script path.
+function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
+  # clean previous gcov data.
+  file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
+
+  # find curl, which is used to upload the JSON later.
+  if (_COVERALLS_UPLOAD)
+    find_program(CURL_EXECUTABLE curl)
+    if (NOT CURL_EXECUTABLE)
+      message(FATAL_ERROR "Coveralls: curl not found!")
+    endif()
+  endif()
+
+  # When passing a CMake list to an external process, the list
+  # will be converted from the format "1;2;3" to "1 2 3".
+  set(COVERAGE_SRCS "")
+  foreach (SINGLE_SRC ${_COVERAGE_SRCS})
+    set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
+  endforeach()
+
+  # query number of logical cores
+  cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
+  # coveralls json file.
+  set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
+  add_custom_target(coveralls_generate
+    # Run regression tests.
+    COMMAND ${CMAKE_CTEST_COMMAND}
+            -j ${core_size}
+            --output-on-failure
+    # Generate Gcov and translate it into coveralls JSON.
+    COMMAND ${CMAKE_COMMAND}
+            -DCOVERAGE_SRCS="${COVERAGE_SRCS}"
+            -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
+            -DCOV_PATH="${PROJECT_BINARY_DIR}"
+            -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
+            -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
+    WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+    COMMENT "Coveralls: generating coveralls output..."
+  )
+
+  if (_COVERALLS_UPLOAD)
+    message("COVERALLS UPLOAD: ON")
+    # Upload the JSON to coveralls.
+    add_custom_target(coveralls_upload
+      COMMAND ${CURL_EXECUTABLE}
+              -S -F json_file=@${COVERALLS_FILE}
+              https://coveralls.io/api/v1/jobs
+      DEPENDS coveralls_generate
+      WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+      COMMENT "Coveralls: uploading coveralls output...")
+
+    add_custom_target(coveralls DEPENDS coveralls_upload)
+  else()
+    message("COVERALLS UPLOAD: OFF")
+    add_custom_target(coveralls DEPENDS coveralls_generate)
+  endif()
+endfunction()
+
+if(ON_COVERALLS)
+  set(CMAKE_BUILD_TYPE "Debug")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
+
+  set(EXCLUDE_DIRS
+    "demo/"
+    "build/"
+    "tests/"
+    ".test_env/"
+  )
+
+  if(WITH_GPU)
+    file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c" "*.cu")
+  else()
+    file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
+  endif()
+
+  # exclude trivial files in PADDLE_SOURCES
+  foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
+    foreach(TMP_PATH ${PADDLE_SOURCES})
+      string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
+      if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
+        list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
+      endif()
+    endforeach(TMP_PATH)
+  endforeach()
+
+  # convert to absolute path
+  set(PADDLE_SRCS "")
+  foreach(PADDLE_SRC ${PADDLE_SOURCES})
+    set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
+  endforeach()
+
+  code_coverage(
+    "${PADDLE_SRCS}"
+    ${COVERALLS_UPLOAD}
+    "${PROJECT_SOURCE_DIR}/cmake"
+  )
+endif()
diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ae3530c3a0eeb79ddbcbf4f2e99be75aa7968a2f
--- /dev/null
+++ b/cmake/coverallsGcovJsons.cmake
@@ -0,0 +1,403 @@
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Copyright (C) 2014 Joakim Söderberg
+#
+# This is intended to be run by a custom target in a CMake project like this.
+# 0. Compile program with coverage support.
+# 1. Clear coverage data. (Recursively delete *.gcda in build dir)
+# 2. Run the unit tests.
+# 3. Run this script specifying which source files the coverage should be performed on.
+#
+# This script will then use gcov to generate .gcov files in the directory specified
+# via the COV_PATH var. This should probably be the same as your cmake build dir.
+#
+# It then parses the .gcov files to convert them into the Coveralls JSON format:
+# https://coveralls.io/docs/api
+#
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+
+# Since it's not possible to pass a CMake list properly in the
+# "1;2;3" format to an external process, we have replaced the
+# ";" with "*", so reverse that here so we get it back into the
+# CMake list format.
+string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS})
+
+find_program(GCOV_EXECUTABLE gcov)
+if (NOT GCOV_EXECUTABLE)
+    message(FATAL_ERROR "gcov not found! Aborting...")
+endif()
+
+find_package(Git)
+
+# TODO: Add these git things to the coveralls json.
+if (GIT_FOUND)
+    # Branch.
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
+        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+        OUTPUT_VARIABLE GIT_BRANCH
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    macro (git_log_format FORMAT_CHARS VAR_NAME)
+        execute_process(
+            COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS}
+            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+            OUTPUT_VARIABLE ${VAR_NAME}
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+    endmacro()
+
+    git_log_format(an GIT_AUTHOR_NAME)
+    git_log_format(ae GIT_AUTHOR_EMAIL)
+    git_log_format(cn GIT_COMMITTER_NAME)
+    git_log_format(ce GIT_COMMITTER_EMAIL)
+    git_log_format(B GIT_COMMIT_MESSAGE)
+
+    message("Git exe: ${GIT_EXECUTABLE}")
+    message("Git branch: ${GIT_BRANCH}")
+    message("Git author: ${GIT_AUTHOR_NAME}")
+    message("Git e-mail: ${GIT_AUTHOR_EMAIL}")
+    message("Git committer name: ${GIT_COMMITTER_NAME}")
+    message("Git committer e-mail: ${GIT_COMMITTER_EMAIL}")
+    message("Git commit message: ${GIT_COMMIT_MESSAGE}")
+
+endif()
+
+############################# Macros #########################################
+
+#
+# This macro converts from the full path format gcov outputs:
+#
+#    /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
+#
+# to the original source file path the .gcov is for:
+#
+#   /path/to/project/root/subdir/the_file.c
+#
+macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME)
+
+    # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
+    # ->
+    # #path#to#project#root#subdir#the_file.c.gcov
+    get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME)
+
+    # #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c
+    string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT})
+    string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP})
+    set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}")
+endmacro()
+
+##############################################################################
+
+# Get the coverage data.
+file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
+message("GCDA files:")
+
+# Get a list of all the object directories needed by gcov
+# (The directories the .gcda files and .o files are found in)
+# and run gcov on those.
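For intuition, here is a small Python sketch (illustrative only; the paths are made-up examples) of the filename mapping that `get_source_path_from_gcov_filename` above reverses: gcov's `-p` mode encodes the source path into the .gcov filename by replacing `/` with `#`:

```python
import os
import re

def source_path_from_gcov(gcov_path):
    # '#path#to#root#subdir#the_file.c.gcov' -> '/path/to/root/subdir/the_file.c'
    name = os.path.basename(gcov_path)
    name = re.sub(r'\.gcov$', '', name)
    return name.replace('#', '/')

print(source_path_from_gcov('/build/#path#to#root#subdir#the_file.c.gcov'))
# -> /path/to/root/subdir/the_file.c
```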
+foreach(GCDA ${GCDA_FILES}) + message("Process: ${GCDA}") + message("------------------------------------------------------------------------------") + get_filename_component(GCDA_DIR ${GCDA} PATH) + + # + # The -p below refers to "Preserve path components", + # This means that the generated gcov filename of a source file will + # keep the original files entire filepath, but / is replaced with #. + # Example: + # + # /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda + # ------------------------------------------------------------------------------ + # File '/path/to/project/root/subdir/the_file.c' + # Lines executed:68.34% of 199 + # /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov' + # + # If -p is not specified then the file is named only "the_file.c.gcov" + # + execute_process( + COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} + WORKING_DIRECTORY ${GCDA_DIR} + ) +endforeach() + +# TODO: Make these be absolute path +file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov") + +# Get only the filenames to use for filtering. +#set(COVERAGE_SRCS_NAMES "") +#foreach (COVSRC ${COVERAGE_SRCS}) +# get_filename_component(COVSRC_NAME ${COVSRC} NAME) +# message("${COVSRC} -> ${COVSRC_NAME}") +# list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}") +#endforeach() + +# +# Filter out all but the gcov files we want. +# +# We do this by comparing the list of COVERAGE_SRCS filepaths that the +# user wants the coverage data for with the paths of the generated .gcov files, +# so that we only keep the relevant gcov files. +# +# Example: +# COVERAGE_SRCS = +# /path/to/project/root/subdir/the_file.c +# +# ALL_GCOV_FILES = +# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov +# /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov +# +# Result should be: +# GCOV_FILES = +# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov +# +set(GCOV_FILES "") +#message("Look in coverage sources: ${COVERAGE_SRCS}") +message("\nFilter out unwanted GCOV files:") +message("===============================") + +set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS}) + +foreach (GCOV_FILE ${ALL_GCOV_FILES}) + + # + # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov + # -> + # /path/to/project/root/subdir/the_file.c + get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) + + # Is this in the list of source files? + # TODO: We want to match against relative path filenames from the source file root... + list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND) + + if (NOT WAS_FOUND EQUAL -1) + message("YES: ${GCOV_FILE}") + list(APPEND GCOV_FILES ${GCOV_FILE}) + + # We remove it from the list, so we don't bother searching for it again. + # Also files left in COVERAGE_SRCS_REMAINING after this loop ends should + # have coverage data generated from them (no lines are covered). 
+ list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH}) + else() + message("NO: ${GCOV_FILE}") + endif() +endforeach() + +# TODO: Enable setting these +set(JSON_SERVICE_NAME "travis-ci") +set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID}) + +set(JSON_TEMPLATE +"{ + \"service_name\": \"\@JSON_SERVICE_NAME\@\", + \"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\", + \"source_files\": \@JSON_GCOV_FILES\@ +}" +) + +set(SRC_FILE_TEMPLATE +"{ + \"name\": \"\@GCOV_SRC_REL_PATH\@\", + \"source_digest\": \"\@GCOV_CONTENTS_MD5\@\", + \"coverage\": \@GCOV_FILE_COVERAGE\@ + }" +) + +message("\nGenerate JSON for files:") +message("=========================") + +set(JSON_GCOV_FILES "[") + +# Read the GCOV files line by line and get the coverage data. +foreach (GCOV_FILE ${GCOV_FILES}) + + get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) + file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}") + + # The new coveralls API doesn't need the entire source (Yay!) + # However, still keeping that part for now. Will cleanup in the future. + file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5) + message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}") + + # Loads the gcov file as a list of lines. + # (We first open the file and replace all occurences of [] with _ + # because CMake will fail to parse a line containing unmatched brackets... + # also the \ to escaped \n in macros screws up things.) + # https://public.kitware.com/Bug/view.php?id=15369 + file(READ ${GCOV_FILE} GCOV_CONTENTS) + string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") + string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") + string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") + file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}") + + file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES) + list(LENGTH GCOV_LINES LINE_COUNT) + + # Instead of trying to parse the source from the + # gcov file, simply read the file contents from the source file. + # (Parsing it from the gcov is hard because C-code uses ; in many places + # which also happens to be the same as the CMake list delimeter). + file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE) + + string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + # According to http://json.org/ these should be escaped as well. + # Don't know how to do that in CMake however... + #string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + #string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + #string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + + # We want a json array of coverage data as a single string + # start building them from the contents of the .gcov + set(GCOV_FILE_COVERAGE "[") + + set(GCOV_LINE_COUNT 1) # Line number for the .gcov. + set(DO_SKIP 0) + foreach (GCOV_LINE ${GCOV_LINES}) + #message("${GCOV_LINE}") + # Example of what we're parsing: + # Hitcount |Line | Source + # " 8: 26: if (!allowed || (strlen(allowed) == 0))" + string(REGEX REPLACE + "^([^:]*):([^:]*):(.*)$" + "\\1;\\2;\\3" + RES + "${GCOV_LINE}") + + # Check if we should exclude lines using the Lcov syntax. 
+ string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}") + string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}") + string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}") + + set(RESET_SKIP 0) + if (LINE_SKIP AND NOT DO_SKIP) + set(DO_SKIP 1) + set(RESET_SKIP 1) + endif() + + if (START_SKIP) + set(DO_SKIP 1) + message("${GCOV_LINE_COUNT}: Start skip") + endif() + + if (END_SKIP) + set(DO_SKIP 0) + endif() + + list(LENGTH RES RES_COUNT) + + if (RES_COUNT GREATER 2) + list(GET RES 0 HITCOUNT) + list(GET RES 1 LINE) + list(GET RES 2 SOURCE) + + string(STRIP ${HITCOUNT} HITCOUNT) + string(STRIP ${LINE} LINE) + + # Lines with 0 line numbers are metadata and can be ignored. + if (NOT ${LINE} EQUAL 0) + + if (DO_SKIP) + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") + else() + # Translate the hitcount into valid JSON values. + if (${HITCOUNT} STREQUAL "#####") + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") + elseif (${HITCOUNT} STREQUAL "-") + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") + else() + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ") + endif() + endif() + endif() + else() + message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}") + endif() + + if (RESET_SKIP) + set(DO_SKIP 0) + endif() + math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1") + endforeach() + + message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!") + + # Advanced way of removing the trailing comma in the JSON array. + # "[1, 2, 3, " -> "[1, 2, 3" + string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) + + # Append the trailing ] to complete the JSON array. + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") + + # Generate the final JSON for this file. + message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...") + string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) + + set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") +endforeach() + +# Loop through all files we couldn't find any coverage for +# as well, and generate JSON for those as well with 0% coverage. +foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING}) + + # Loads the source file as a list of lines. + file(STRINGS ${NOT_COVERED_SRC} SRC_LINES) + + set(GCOV_FILE_COVERAGE "[") + set(GCOV_FILE_SOURCE "") + + foreach (SOURCE ${SRC_LINES}) + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") + + string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}") + string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}") + string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}") + string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}") + set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n") + endforeach() + + # Remove trailing comma, and complete JSON array with ] + string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") + + # Generate the final JSON for this file. + message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...") + string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) + set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") +endforeach() + +# Get rid of trailing comma. +string(REGEX REPLACE ",[ ]*$" "" JSON_GCOV_FILES ${JSON_GCOV_FILES}) +set(JSON_GCOV_FILES "${JSON_GCOV_FILES}]") + +# Generate the final complete JSON! 
+message("Generate final JSON...") +string(CONFIGURE ${JSON_TEMPLATE} JSON) + +file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}") +message("###########################################################################") +message("Generated coveralls JSON containing coverage data:") +message("${COVERALLS_OUTPUT_FILE}") +message("###########################################################################") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index dbad6be3f41b3f565d6bf275633d07198491ff3d..e087770991aefc17535d50c0539c50f6316520d7 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -21,12 +21,6 @@ function(safe_set_flag is_c src_list flag_name) endif() if(${safe_name}) set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE) - if(is_c) - set(CUDA_NVCC_FLAGS - --compiler-options;${flag_name} - ${CUDA_NVCC_FLAGS} - PARENT_SCOPE) - endif() endif() endfunction() @@ -40,6 +34,20 @@ macro(safe_set_cxxflag src_list flag_name) safe_set_flag(OFF ${src_list} ${flag_name}) endmacro() +# helper macro to set nvcc flag +macro(safe_set_nvflag flag_name) + string(REPLACE "-" "_" safe_name ${flag_name}) + string(REPLACE "=" "_" safe_name ${safe_name}) + CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) + if(${safe_name}) + set(CUDA_NVCC_FLAGS + --compiler-options;${flag_name} + ${CUDA_NVCC_FLAGS}) + endif() +endmacro() + + CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS) set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS) @@ -63,20 +71,43 @@ set(COMMON_FLAGS -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter + -Wno-unused-function + -Wno-error=literal-suffix + -Wno-error=unused-local-typedefs) + +set(GPU_COMMON_FLAGS + -fPIC + -fno-omit-frame-pointer + -Wnon-virtual-dtor + -Wdelete-non-virtual-dtor + -Wno-unused-parameter + -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=unused-function # Warnings in Numpy Header. ) +if (APPLE) + # On Mac OS X build fat binaries with x86_64 architectures by default. + set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) +else() + set(GPU_COMMON_FLAGS + -Wall + -Wextra + -Werror + ${GPU_COMMON_FLAGS}) +endif() + + foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) endforeach() -# On Mac OS X build fat binaries with x86_64 architectures by default. -if (APPLE) - set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) -endif () +foreach(flag ${GPU_COMMON_FLAGS}) + safe_set_nvflag(${flag}) +endforeach() + # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e9a4da79aa92a92aa7e5d21bb795ab9aaf60ab8b --- /dev/null +++ b/cmake/rdma.cmake @@ -0,0 +1,76 @@ +# user should download rdma first from subversion repository + +# execute following instruction to download svn mannally +# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/ +# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/ +# we use static output in svn repositories to avoid implict bugs from not standard runtime env. 
+ +set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library") + +function(generate_rdma_links) + #redirect to current DIR to isolate the pollution from system runtime environment + #it can benifits unified control for different gcc environment. + #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version + #runtime libraries that will crash process while loading it. That redirect trick + #can fix it. + execute_process( + COMMAND mkdir -p librdma + COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1 + COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so + COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1 + COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) +endfunction(generate_rdma_links) + + +#check and set headers +find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include) +find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio) +find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent) +find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma) + +#check and set libs +find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output) +find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio) +find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent) +find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent) +find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent) +find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent) +find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma) + +if( + RDMA_INC_SXISOCK AND + RDMA_INC_XIO AND + RDMA_INC_EVENT AND + RDMA_INC_NUMA AND + RDMA_LIB_SXISOCK AND + RDMA_LIB_XIO AND + RDMA_LIB_EVENT AND + RDMA_LIB_EVENT_CORE AND + RDMA_LIB_EVENT_EXTRA AND + RDMA_LIB_EVENT_PTHREADS AND + RDMA_LIB_NUMA + ) + + set(RDMA_INC_DIR + ${RDMA_INC_SXISOCK} + ${RDMA_INC_XIO} + ${RDMA_INC_EVENT} + ${RDMA_INC_NUMA}) + set(RDMA_LIBS + ${RDMA_LIB_SXISOCK} + ${RDMA_LIB_XIO} + ${RDMA_LIB_EVENT} + ${RDMA_LIB_EVENT_CORE} + ${RDMA_LIB_EVENT_EXTRA} + ${RDMA_LIB_EVENT_PTHREADS} + ${RDMA_LIB_NUMA} + ) + set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma") + return() +endif() + +#if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable + +message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.") diff --git a/cmake/swig.cmake b/cmake/swig.cmake index f5c1bcc79b3dc0e6c4f4489ee9f72a084afe8847..97e87aa947791e2c5a88e7e554dec43bcd661664 100644 --- a/cmake/swig.cmake +++ b/cmake/swig.cmake @@ -1,25 +1,3 @@ -find_program( - SWIG_BINARY_PATH - swig) - -if(${SWIG_BINARY_PATH} STREQUAL "SWIG_BINARY_PATH-NOTFOUND") - set(SWIG_FOUND OFF) -else() - set(SWIG_FOUND ON) -endif() - -set(MIN_SWIG_VERSION 2) -if(SWIG_FOUND) - execute_process(COMMAND sh -c "${SWIG_BINARY_PATH} -version | grep Version | cut -f3 -d' '" - OUTPUT_VARIABLE _SWIG_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION}) - message("swig version ${MIN_SWIG_VERSION} or greater is needed for generating python api. " - "Only version ${_SWIG_VERSION} is found. 
Set SWIG_FOUND to FALSE") - set(SWIG_FOUND FALSE) - endif(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION}) -endif(SWIG_FOUND) - function(generate_python_api target_name) add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py ${PROJ_ROOT}/paddle/Paddle_wrap.cxx @@ -27,6 +5,7 @@ function(generate_python_api target_name) COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig + ${PROJ_ROOT}/paddle/api/PaddleAPI.h WORKING_DIRECTORY ${PROJ_ROOT}/paddle COMMENT "Generate Python API from swig") add_custom_target(${target_name} ALL DEPENDS diff --git a/cmake/util.cmake b/cmake/util.cmake index 0fa36f070cc11be543efe9573b93173ec771b9be..a8282f07184c34f77d506ed7ef40206fbbd55b41 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -67,6 +67,10 @@ endmacro() # # It will handle WITH_PYTHON/WITH_GLOG etc. function(link_paddle_exe TARGET_NAME) + if(WITH_RDMA) + generate_rdma_links() + endif() + if(WITH_METRIC) if(WITH_GPU) set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu) @@ -109,6 +113,12 @@ function(link_paddle_exe TARGET_NAME) ${ZLIB_LIBRARIES} ${INTERAL_LIBS} ${CMAKE_DL_LIBS}) + + if(WITH_RDMA) + target_link_libraries(${TARGET_NAME} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + endif() if(WITH_PYTHON) target_link_libraries(${TARGET_NAME} @@ -178,14 +188,6 @@ macro(add_simple_unittest TARGET_NAME) add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp) endmacro() -macro(add_paddle_culib TARGET_NAME) - set(NVCC_FLAG ${CUDA_NVCC_FLAGS}) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--use_fast_math) - cuda_add_library(${TARGET_NAME} STATIC ${ARGN}) - set(CUDA_NVCC_FLAGS ${NVCC_FLAG}) -endmacro() - - # Creates C resources file from files in given resource file function(create_resources res_file output) # Create empty output file diff --git a/demo/image_classification/.gitignore b/demo/image_classification/.gitignore index 76961dd1436f859f85f75ff9ed7d3fefdec83dc4..6a05b8f6632db0977fceade8b48a89b9f7f6e6cc 100644 --- a/demo/image_classification/.gitignore +++ b/demo/image_classification/.gitignore @@ -5,3 +5,5 @@ plot.png train.log image_provider_copy_1.py *pyc +train.list +test.list diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh old mode 100644 new mode 100755 diff --git a/demo/image_classification/data/process_cifar.py b/demo/image_classification/data/process_cifar.py index b766118eb00737c7a196ed85850b3cebd690b0d0..b235010e4ece377beffaaa1b9247a77d7a96b712 100644 --- a/demo/image_classification/data/process_cifar.py +++ b/demo/image_classification/data/process_cifar.py @@ -16,7 +16,6 @@ import numpy as np import sys import os import PIL.Image as Image - """ Usage: python process_cifar input_dir output_dir """ @@ -30,6 +29,7 @@ def mkdir_not_exist(path): if not os.path.exists(path): os.mkdir(path) + def create_dir_structure(output_dir): """ Create the directory structure for the directory. @@ -39,8 +39,8 @@ def create_dir_structure(output_dir): mkdir_not_exist(os.path.join(output_dir, "train")) mkdir_not_exist(os.path.join(output_dir, "test")) -def convert_batch(batch_path, label_set, label_map, - output_dir, data_split): + +def convert_batch(batch_path, label_set, label_map, output_dir, data_split): """ Convert CIFAR batch to the structure of Paddle format. batch_path: the batch to be converted. 
@@ -67,11 +67,23 @@ if __name__ == '__main__': output_dir = sys.argv[2] num_batch = 5 create_dir_structure(output_dir) - label_map = {0: "airplane", 1: "automobile", 2: "bird", 3: "cat", 4: "deer", - 5: "dog", 6: "frog", 7: "horse", 8: "ship", 9: "truck"} + label_map = { + 0: "airplane", + 1: "automobile", + 2: "bird", + 3: "cat", + 4: "deer", + 5: "dog", + 6: "frog", + 7: "horse", + 8: "ship", + 9: "truck" + } labels = {} for i in range(1, num_batch + 1): - convert_batch(os.path.join(input_dir, "data_batch_%d" % i), labels, - label_map, output_dir, "train") - convert_batch(os.path.join(input_dir, "test_batch"), {}, - label_map, output_dir, "test") \ No newline at end of file + convert_batch( + os.path.join(input_dir, "data_batch_%d" % i), labels, label_map, + output_dir, "train") + convert_batch( + os.path.join(input_dir, "test_batch"), {}, label_map, output_dir, + "test") diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py index 9e2f8b8949b39b930680e6d84758133eed566881..28bf1bb02c1f08b2e8ec9acd38f0a8594b05ab66 100644 --- a/demo/image_classification/image_provider.py +++ b/demo/image_classification/image_provider.py @@ -46,36 +46,41 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg, settings.img_mean = image_util.load_meta(settings.meta_path, settings.mean_img_size, - settings.img_size, - settings.color) + settings.img_size, settings.color) settings.logger.info('Image size: %s', settings.img_size) settings.logger.info('Meta path: %s', settings.meta_path) settings.input_types = [ dense_vector(settings.img_raw_size), # image feature - integer_value(settings.num_classes)] # labels + integer_value(settings.num_classes) + ] # labels settings.logger.info('DataProvider Initialization finished') -@provider(init_hook=hook) -def processData(settings, file_name): +@provider(init_hook=hook, min_pool_size=0) +def processData(settings, file_list): """ The main function for loading data. Load the batch, iterate all the images and labels in this batch. - file_name: the batch file name. + file_list: the batch file list. 
""" - data = cPickle.load(io.open(file_name, 'rb')) - indexes = list(range(len(data['images']))) - if settings.is_train: - random.shuffle(indexes) - for i in indexes: - if settings.use_jpeg == 1: - img = image_util.decode_jpeg(data['images'][i]) - else: - img = data['images'][i] - img_feat = image_util.preprocess_img(img, settings.img_mean, - settings.img_size, settings.is_train, - settings.color) - label = data['labels'][i] - yield img_feat.tolist(), int(label) + with open(file_list, 'r') as fdata: + lines = [line.strip() for line in fdata] + random.shuffle(lines) + for file_name in lines: + with io.open(file_name.strip(), 'rb') as file: + data = cPickle.load(file) + indexes = list(range(len(data['images']))) + if settings.is_train: + random.shuffle(indexes) + for i in indexes: + if settings.use_jpeg == 1: + img = image_util.decode_jpeg(data['images'][i]) + else: + img = data['images'][i] + img_feat = image_util.preprocess_img( + img, settings.img_mean, settings.img_size, + settings.is_train, settings.color) + label = data['labels'][i] + yield img_feat.astype('float32'), int(label) diff --git a/demo/image_classification/image_util.py b/demo/image_classification/image_util.py index c545d16aafbc741bce25f9469e7f67de5b88fa8c..b5c6431c06f77cef5c31ca844a8427eebaea2fce 100644 --- a/demo/image_classification/image_util.py +++ b/demo/image_classification/image_util.py @@ -16,17 +16,20 @@ import numpy as np from PIL import Image from cStringIO import StringIO + def resize_image(img, target_size): """ Resize an image so that the shorter edge has length target_size. img: the input image to be resized. target_size: the target resized image size. """ - percent = (target_size/float(min(img.size[0], img.size[1]))) - resized_size = int(round(img.size[0] * percent)), int(round(img.size[1] * percent)) + percent = (target_size / float(min(img.size[0], img.size[1]))) + resized_size = int(round(img.size[0] * percent)), int( + round(img.size[1] * percent)) img = img.resize(resized_size, Image.ANTIALIAS) return img + def flip(im): """ Return the flipped image. @@ -38,6 +41,7 @@ def flip(im): else: return im[:, ::-1] + def crop_img(im, inner_size, color=True, test=True): """ Return cropped image. @@ -50,20 +54,22 @@ def crop_img(im, inner_size, color=True, test=True): If True, crop the center of images. 
""" if color: - height, width = max(inner_size, im.shape[1]), max(inner_size, im.shape[2]) + height, width = max(inner_size, im.shape[1]), max(inner_size, + im.shape[2]) padded_im = np.zeros((3, height, width)) startY = (height - im.shape[1]) / 2 startX = (width - im.shape[2]) / 2 endY, endX = startY + im.shape[1], startX + im.shape[2] - padded_im[:, startY: endY, startX: endX] = im + padded_im[:, startY:endY, startX:endX] = im else: im = im.astype('float32') - height, width = max(inner_size, im.shape[0]), max(inner_size, im.shape[1]) + height, width = max(inner_size, im.shape[0]), max(inner_size, + im.shape[1]) padded_im = np.zeros((height, width)) startY = (height - im.shape[0]) / 2 startX = (width - im.shape[1]) / 2 endY, endX = startY + im.shape[0], startX + im.shape[1] - padded_im[startY: endY, startX: endX] = im + padded_im[startY:endY, startX:endX] = im if test: startY = (height - inner_size) / 2 startX = (width - inner_size) / 2 @@ -72,19 +78,21 @@ def crop_img(im, inner_size, color=True, test=True): startX = np.random.randint(0, width - inner_size + 1) endY, endX = startY + inner_size, startX + inner_size if color: - pic = padded_im[:, startY: endY, startX: endX] + pic = padded_im[:, startY:endY, startX:endX] else: - pic = padded_im[startY: endY, startX: endX] + pic = padded_im[startY:endY, startX:endX] if (not test) and (np.random.randint(2) == 0): pic = flip(pic) return pic + def decode_jpeg(jpeg_string): np_array = np.array(Image.open(StringIO(jpeg_string))) if len(np_array.shape) == 3: np_array = np.transpose(np_array, (2, 0, 1)) return np_array + def preprocess_img(im, img_mean, crop_size, is_train, color=True): """ Does data augmentation for images. @@ -99,6 +107,7 @@ def preprocess_img(im, img_mean, crop_size, is_train, color=True): pic -= img_mean return pic.flatten() + def load_meta(meta_path, mean_img_size, crop_size, color=True): """ Return the loaded meta file. @@ -109,17 +118,18 @@ def load_meta(meta_path, mean_img_size, crop_size, color=True): mean = np.load(meta_path)['data_mean'] border = (mean_img_size - crop_size) / 2 if color: - assert(mean_img_size * mean_img_size * 3 == mean.shape[0]) + assert (mean_img_size * mean_img_size * 3 == mean.shape[0]) mean = mean.reshape(3, mean_img_size, mean_img_size) - mean = mean[:, border: border + crop_size, - border: border + crop_size].astype('float32') + mean = mean[:, border:border + crop_size, border:border + + crop_size].astype('float32') else: - assert(mean_img_size * mean_img_size == mean.shape[0]) + assert (mean_img_size * mean_img_size == mean.shape[0]) mean = mean.reshape(mean_img_size, mean_img_size) - mean = mean[border: border + crop_size, - border: border + crop_size].astype('float32') + mean = mean[border:border + crop_size, border:border + + crop_size].astype('float32') return mean + def load_image(img_path, is_color=True): """ Load image and return. 
@@ -130,6 +140,7 @@ def load_image(img_path, is_color=True): img.load() return img + def oversample(img, crop_dims): """ image : iterable of (H x W x K) ndarrays @@ -152,50 +163,53 @@ def oversample(img, crop_dims): for j in w_indices: crops_ix[curr] = (i, j, i + crop_dims[0], j + crop_dims[1]) curr += 1 - crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate([ - -crop_dims / 2.0, - crop_dims / 2.0 - ]) + crops_ix[4] = np.tile(im_center, (1, 2)) + np.concatenate( + [-crop_dims / 2.0, crop_dims / 2.0]) crops_ix = np.tile(crops_ix, (2, 1)) # Extract crops - crops = np.empty((10 * len(img), crop_dims[0], crop_dims[1], - im_shape[-1]), dtype=np.float32) + crops = np.empty( + (10 * len(img), crop_dims[0], crop_dims[1], im_shape[-1]), + dtype=np.float32) ix = 0 for im in img: for crop in crops_ix: crops[ix] = im[crop[0]:crop[2], crop[1]:crop[3], :] ix += 1 - crops[ix-5:ix] = crops[ix-5:ix, :, ::-1, :] # flip for mirrors + crops[ix - 5:ix] = crops[ix - 5:ix, :, ::-1, :] # flip for mirrors return crops + class ImageTransformer: - def __init__(self, transpose = None, - channel_swap = None, mean = None, is_color = True): + def __init__(self, + transpose=None, + channel_swap=None, + mean=None, + is_color=True): self.transpose = transpose self.channel_swap = None self.mean = None - self.is_color = is_color + self.is_color = is_color - def set_transpose(self, order): + def set_transpose(self, order): if self.is_color: - assert 3 == len(order) + assert 3 == len(order) self.transpose = order - def set_channel_swap(self, order): + def set_channel_swap(self, order): if self.is_color: - assert 3 == len(order) + assert 3 == len(order) self.channel_swap = order def set_mean(self, mean): # mean value, may be one value per channel if mean.ndim == 1: - mean = mean[:, np.newaxis, np.newaxis] - else: + mean = mean[:, np.newaxis, np.newaxis] + else: # elementwise mean if self.is_color: assert len(mean.shape) == 3 - self.mean = mean + self.mean = mean def transformer(self, data): if self.transpose is not None: diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py index 5d9e93265867389ca6d2aa26e48fcfa08561e6ae..6a47bd5851c99635dd7d3f1d5df67dd081ca4584 100755 --- a/demo/image_classification/prediction.py +++ b/demo/image_classification/prediction.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
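One subtlety worth a sketch: the reformatted `ImageTransformer` constructor stores `is_color` and `transpose` but discards the `channel_swap` and `mean` arguments it receives, so state is meant to be supplied through the setters. A hypothetical driver, mirroring the call order that the ResNet `classify.py` uses further below; the import path and the assumption that `transformer()` applies transpose, channel swap, and mean subtraction in that order are mine, not the patch's:

```python
import numpy as np
from image_util import ImageTransformer  # assumed import path for this demo

transformer = ImageTransformer(is_color=True)
transformer.set_transpose((2, 0, 1))     # HWC -> CHW
transformer.set_channel_swap((2, 1, 0))  # RGB -> BGR
# Per-channel ImageNet means, as used by the ResNet classify.py below.
transformer.set_mean(np.array([103.939, 116.779, 123.68]))

image = np.zeros((224, 224, 3), dtype=np.float32)  # stand-in HWC input
chw = transformer.transformer(image)  # normalized CHW array
```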
-import os,sys +import os, sys import numpy as np import logging from PIL import Image @@ -24,9 +24,11 @@ from py_paddle import swig_paddle, DataProviderConverter from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.config_parser import parse_config -logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') +logging.basicConfig( + format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') logging.getLogger().setLevel(logging.INFO) + class ImageClassifier(): def __init__(self, train_conf, @@ -58,18 +60,19 @@ class ImageClassifier(): self.oversample = oversample self.is_color = is_color - self.transformer = image_util.ImageTransformer(is_color = is_color) - self.transformer.set_transpose((2,0,1)) + self.transformer = image_util.ImageTransformer(is_color=is_color) + self.transformer.set_transpose((2, 0, 1)) self.mean_file = mean_file mean = np.load(self.mean_file)['data_mean'] mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1]) - self.transformer.set_mean(mean) # mean pixel + self.transformer.set_mean(mean) # mean pixel gpu = 1 if use_gpu else 0 conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (gpu) conf = parse_config(train_conf, conf_args) swig_paddle.initPaddle("--use_gpu=%d" % (gpu)) - self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) + self.network = swig_paddle.GradientMachine.createFromConfigProto( + conf.model_config) assert isinstance(self.network, swig_paddle.GradientMachine) self.network.loadParameters(self.model_dir) @@ -90,14 +93,14 @@ class ImageClassifier(): # image_util.resize_image: short side is self.resize_dim image = image_util.resize_image(image, self.resize_dim) image = np.array(image) - input = np.zeros((1, image.shape[0], image.shape[1], 3), - dtype=np.float32) + input = np.zeros( + (1, image.shape[0], image.shape[1], 3), dtype=np.float32) input[0] = image.astype(np.float32) input = image_util.oversample(input, self.crop_dims) else: image = image.resize(self.crop_dims, Image.ANTIALIAS) - input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3), - dtype=np.float32) + input = np.zeros( + (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32) input[0] = np.array(image).astype(np.float32) data_in = [] @@ -133,22 +136,24 @@ class ImageClassifier(): lab = np.argsort(-prob) logging.info("Label of %s is: %d", image, lab[0]) + if __name__ == '__main__': - image_size=32 - crop_size=32 - multi_crop=True - config="vgg_16_cifar.py" - output_layer="__fc_layer_1__" - mean_path="data/cifar-out/batches/batches.meta" - model_path=sys.argv[1] - image=sys.argv[2] - use_gpu=bool(int(sys.argv[3])) - - obj = ImageClassifier(train_conf=config, - model_dir=model_path, - resize_dim=image_size, - crop_dim=crop_size, - mean_file=mean_path, - use_gpu=use_gpu, - oversample=multi_crop) + image_size = 32 + crop_size = 32 + multi_crop = True + config = "vgg_16_cifar.py" + output_layer = "__fc_layer_1__" + mean_path = "data/cifar-out/batches/batches.meta" + model_path = sys.argv[1] + image = sys.argv[2] + use_gpu = bool(int(sys.argv[3])) + + obj = ImageClassifier( + train_conf=config, + model_dir=model_path, + resize_dim=image_size, + crop_dim=crop_size, + mean_file=mean_path, + use_gpu=use_gpu, + oversample=multi_crop) obj.predict(image, output_layer) diff --git a/demo/image_classification/preprocess.py b/demo/image_classification/preprocess.py index 0286a5d7e9dc8d0f546b18b1ed846c9452cdbe4b..10b9c1691b5e51273c73a975545cd36f3822e901 100755 --- 
a/demo/image_classification/preprocess.py +++ b/demo/image_classification/preprocess.py @@ -19,22 +19,36 @@ from optparse import OptionParser def option_parser(): parser = OptionParser(usage="usage: python preprocess.py "\ "-i data_dir [options]") - parser.add_option("-i", "--input", action="store", - dest="input", help="Input data directory.") - parser.add_option("-s", "--size", action="store", - dest="size", help="Processed image size.") - parser.add_option("-c", "--color", action="store", - dest="color", help="whether to use color images.") + parser.add_option( + "-i", + "--input", + action="store", + dest="input", + help="Input data directory.") + parser.add_option( + "-s", + "--size", + action="store", + dest="size", + help="Processed image size.") + parser.add_option( + "-c", + "--color", + action="store", + dest="color", + help="whether to use color images.") return parser.parse_args() + if __name__ == '__main__': - options, args = option_parser() - data_dir = options.input - processed_image_size = int(options.size) - color = options.color == "1" - data_creator = ImageClassificationDatasetCreater(data_dir, - processed_image_size, - color) - data_creator.num_per_batch = 1000 - data_creator.overwrite = True - data_creator.create_batches() + options, args = option_parser() + data_dir = options.input + processed_image_size = int(options.size) + color = options.color == "1" + data_creator = ImageClassificationDatasetCreater( + data_dir, processed_image_size, color) + data_creator.train_list_name = "train.txt" + data_creator.test_list_name = "test.txt" + data_creator.num_per_batch = 1000 + data_creator.overwrite = True + data_creator.create_batches() diff --git a/demo/image_classification/preprocess.sh b/demo/image_classification/preprocess.sh index dfe3eb95d1ab8b2114fcf5e0f461ea0efb7cc1e5..e3e86ff10675c0622867af2eb0d26c87f4bc2db5 100755 --- a/demo/image_classification/preprocess.sh +++ b/demo/image_classification/preprocess.sh @@ -17,3 +17,6 @@ set -e data_dir=./data/cifar-out python preprocess.py -i $data_dir -s 32 -c 1 + +echo "data/cifar-out/batches/train.txt" > train.list +echo "data/cifar-out/batches/test.txt" > test.list diff --git a/demo/image_classification/vgg_16_cifar.py b/demo/image_classification/vgg_16_cifar.py index e8b8af4bd313d0738aafab8da93fc510e40cc3d6..58ceff5fc2f46cac9997b6d8af2b0db0c43e0c75 100755 --- a/demo/image_classification/vgg_16_cifar.py +++ b/demo/image_classification/vgg_16_cifar.py @@ -18,36 +18,38 @@ is_predict = get_config_arg("is_predict", bool, False) ####################Data Configuration ################## if not is_predict: - data_dir='data/cifar-out/batches/' - meta_path=data_dir+'batches.meta' - - args = {'meta':meta_path,'mean_img_size': 32, - 'img_size': 32,'num_classes': 10, - 'use_jpeg': 1,'color': "color"} - - define_py_data_sources2(train_list=data_dir+"train.list", - test_list=data_dir+'test.list', - module='image_provider', - obj='processData', - args=args) + data_dir = 'data/cifar-out/batches/' + meta_path = data_dir + 'batches.meta' + + args = { + 'meta': meta_path, + 'mean_img_size': 32, + 'img_size': 32, + 'num_classes': 10, + 'use_jpeg': 1, + 'color': "color" + } + + define_py_data_sources2( + train_list="train.list", + test_list="test.list", + module='image_provider', + obj='processData', + args=args) ######################Algorithm Configuration ############# settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128) -) + batch_size=128,
+ learning_rate=0.1 / 128.0, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * 128)) #######################Network Configuration ############# -data_size=3*32*32 -label_size=10 -img = data_layer(name='image', - size=data_size) +data_size = 3 * 32 * 32 +label_size = 10 +img = data_layer(name='image', size=data_size) # small_vgg is predefined in trainer_config_helpers.networks -predict = small_vgg(input_image=img, - num_channels=3, - num_classes=label_size) +predict = small_vgg(input_image=img, num_channels=3, num_classes=label_size) if not is_predict: lbl = data_layer(name="label", size=label_size) diff --git a/demo/introduction/README.md b/demo/introduction/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0614a7afe645677ef0b65a17ea05f1dcfa45214f --- /dev/null +++ b/demo/introduction/README.md @@ -0,0 +1,3 @@ +This folder contains the scripts used in the PaddlePaddle introduction. +- use `bash train.sh` to train a simple linear regression model +- use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3]. diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py new file mode 100644 index 0000000000000000000000000000000000000000..8515022e18dc6bbf055e6db3121568acf1df1c55 --- /dev/null +++ b/demo/introduction/dataprovider.py @@ -0,0 +1,24 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * +import random + + +# define data types of input: 2 real numbers +@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False) +def process(settings, input_file): + for i in xrange(2000): + x = random.random() + yield [x], [2 * x + 0.3] diff --git a/demo/introduction/evaluate_model.py b/demo/introduction/evaluate_model.py new file mode 100755 index 0000000000000000000000000000000000000000..ca4a1872731abde90e72cb167929b3d9e2e1ebf4 --- /dev/null +++ b/demo/introduction/evaluate_model.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Print model parameters of the last model + +Usage: + python evaluate_model.py +""" +import numpy as np +import os + + +def load(file_name): + with open(file_name, 'rb') as f: + f.read(16) # skip header for float type.
+ return np.fromfile(f, dtype=np.float32) + + +def main(): + print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'), + load('output/pass-00029/b')) + + +if __name__ == '__main__': + main() diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..06db8edd105ada071597ed1aa5e42f7de547174d --- /dev/null +++ b/demo/introduction/train.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +paddle train \ + --config=trainer_config.py \ + --save_dir=./output \ + --num_passes=30 \ + 2>&1 |tee 'train.log' diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py new file mode 100644 index 0000000000000000000000000000000000000000..7c838c1a8f5b3cb6ac732197c85cd7c728eb013f --- /dev/null +++ b/demo/introduction/trainer_config.py @@ -0,0 +1,41 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +# 1. read data. Suppose you saved above python code as dataprovider.py +data_file = 'empty.list' +with open(data_file, 'w') as f: + f.writelines(' ') +define_py_data_sources2( + train_list=data_file, + test_list=None, + module='dataprovider', + obj='process', + args={}) + +# 2. learning algorithm +settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) + +# 3. Network configuration +x = data_layer(name='x', size=1) +y = data_layer(name='y', size=1) +y_predict = fc_layer( + input=x, + param_attr=ParamAttr(name='w'), + size=1, + act=LinearActivation(), + bias_attr=ParamAttr(name='b')) +cost = regression_cost(input=y_predict, label=y) +outputs(cost) diff --git a/demo/mnist/data/generate_list.py b/demo/mnist/data/generate_list.py index 1b929048b4d82b5e9d80585b6d0180f2e92200ce..d880721f94c68bbbc1740f82872462efdb368fa2 100644 --- a/demo/mnist/data/generate_list.py +++ b/demo/mnist/data/generate_list.py @@ -13,9 +13,9 @@ # limitations under the License. 
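The introduction demo above fits y = 2x + 0.3 with a single `fc_layer`, so after `bash train.sh` the two saved parameters should converge toward w ≈ 2 and b ≈ 0.3. A hedged sketch of reading the raw parameter files, mirroring `evaluate_model.py` (the 16-byte header and float32 payload come from that script; `pass-00029` is simply the last pass written with `--num_passes=30`):

```python
import numpy as np

def read_paddle_param(file_name):
    # Mirrors evaluate_model.py: skip the 16-byte header, then
    # read the remaining bytes as raw float32 values.
    with open(file_name, 'rb') as f:
        f.read(16)
        return np.fromfile(f, dtype=np.float32)

w = read_paddle_param('output/pass-00029/w')  # expect roughly 2.0
b = read_paddle_param('output/pass-00029/b')  # expect roughly 0.3
print('w=%.6f, b=%.6f' % (w[0], b[0]))
```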
o = open("./" + "train.list", "w") -o.write("./data/raw_data/train" +"\n") +o.write("./data/raw_data/train" + "\n") o.close() o = open("./" + "test.list", "w") -o.write("./data/raw_data/t10k" +"\n") -o.close() \ No newline at end of file +o.write("./data/raw_data/t10k" + "\n") +o.close() diff --git a/demo/mnist/data/get_mnist_data.sh b/demo/mnist/data/get_mnist_data.sh index 9099b5ab6fb85d86d346a7ad819538fbd013c6ff..5a2e34026d4fe7f8315d4f5453bec7c4ee4f6885 100755 --- a/demo/mnist/data/get_mnist_data.sh +++ b/demo/mnist/data/get_mnist_data.sh @@ -19,4 +19,3 @@ done cd $DIR rm -f *.list python generate_list.py - diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py index 32af29730a7365df1a98fe54a2edf8850ee93e8d..6df4676da3bdc2e6949cc911fa3720cb51ddc568 100644 --- a/demo/mnist/mnist_provider.py +++ b/demo/mnist/mnist_provider.py @@ -2,10 +2,9 @@ from paddle.trainer.PyDataProvider2 import * # Define a py data provider -@provider(input_types={ - 'pixel': dense_vector(28 * 28), - 'label': integer_value(10) -}) +@provider( + input_types={'pixel': dense_vector(28 * 28), + 'label': integer_value(10)}) def process(settings, filename): # settings is not used currently. imgf = filename + "-images-idx3-ubyte" labelf = filename + "-labels-idx1-ubyte" diff --git a/demo/mnist/vgg_16_mnist.py b/demo/mnist/vgg_16_mnist.py index 45a45bb061aa781231a944bb82ebfbc6b0dc9618..f9e89bc588abacd98a8f5fc82a00fae6bb2de10e 100644 --- a/demo/mnist/vgg_16_mnist.py +++ b/demo/mnist/vgg_16_mnist.py @@ -18,32 +18,29 @@ is_predict = get_config_arg("is_predict", bool, False) ####################Data Configuration ################## - if not is_predict: - data_dir='./data/' - define_py_data_sources2(train_list= data_dir + 'train.list', - test_list= data_dir + 'test.list', - module='mnist_provider', - obj='process') + data_dir = './data/' + define_py_data_sources2( + train_list=data_dir + 'train.list', + test_list=data_dir + 'test.list', + module='mnist_provider', + obj='process') ######################Algorithm Configuration ############# settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128) -) + batch_size=128, + learning_rate=0.1 / 128.0, + learning_method=MomentumOptimizer(0.9), + regularization=L2Regularization(0.0005 * 128)) #######################Network Configuration ############# -data_size=1*28*28 -label_size=10 +data_size = 1 * 28 * 28 +label_size = 10 img = data_layer(name='pixel', size=data_size) # small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, - num_channels=1, - num_classes=label_size) +predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size) if not is_predict: lbl = data_layer(name="label", size=label_size) diff --git a/demo/model_zoo/embedding/extract_para.py b/demo/model_zoo/embedding/extract_para.py index 17067792fc38d0d25bd28dc35bfb1b88ad5020cd..47e06fae9caa9c3d9e0d6eb2e3f6633a776c5b1d 100755 --- a/demo/model_zoo/embedding/extract_para.py +++ b/demo/model_zoo/embedding/extract_para.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- """ Example: python extract_para.py --preModel PREMODEL --preDict PREDICT \ @@ -29,6 +28,7 @@ Options: from optparse import OptionParser import struct + def get_row_index(preDict, usrDict): """ Get the row positions for all words in user dictionary from pre-trained dictionary. @@ -47,7 +47,9 @@ def get_row_index(preDict, usrDict): pos.append(index[word]) return pos -def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim): + +def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, + paraDim): """ Extract desired parameters from a pretrained embedding model based on user dictionary """ @@ -70,6 +72,7 @@ def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict, paraDim) print "extract parameters finish, total", len(rowIndex), "lines" fi.close() + def main(): """ Main entry for running paraconvert.py @@ -78,19 +81,33 @@ def main(): "python %prog --preModel PREMODEL --preDict PREDICT" \ " --usrModel USRMODEL --usrDict USRDICT -d DIM" parser = OptionParser(usage) - parser.add_option("--preModel", action="store", dest="preModel", - help="the name of pretrained embedding model") - parser.add_option("--preDict", action="store", dest="preDict", - help="the name of pretrained dictionary") - parser.add_option("--usrModel", action="store", dest="usrModel", - help="the name of output usr embedding model") - parser.add_option("--usrDict", action="store", dest="usrDict", - help="the name of user specified dictionary") - parser.add_option("-d", action="store", dest="dim", - help="dimension of parameter") + parser.add_option( + "--preModel", + action="store", + dest="preModel", + help="the name of pretrained embedding model") + parser.add_option( + "--preDict", + action="store", + dest="preDict", + help="the name of pretrained dictionary") + parser.add_option( + "--usrModel", + action="store", + dest="usrModel", + help="the name of output usr embedding model") + parser.add_option( + "--usrDict", + action="store", + dest="usrDict", + help="the name of user specified dictionary") + parser.add_option( + "-d", action="store", dest="dim", help="dimension of parameter") (options, args) = parser.parse_args() - extract_parameters_by_usrDict(options.preModel, options.preDict, - options.usrModel, options.usrDict, int(options.dim)) + extract_parameters_by_usrDict(options.preModel, options.preDict, + options.usrModel, options.usrDict, + int(options.dim)) + if __name__ == '__main__': main() diff --git a/demo/model_zoo/embedding/paraconvert.py b/demo/model_zoo/embedding/paraconvert.py index 523412303617a38035392e4bb99f8ce119be8ac8..54155eff8e26b16ff5303d8d279e81b4bf8a90f4 100755 --- a/demo/model_zoo/embedding/paraconvert.py +++ b/demo/model_zoo/embedding/paraconvert.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Example: python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM @@ -29,6 +28,7 @@ Options: from optparse import OptionParser import struct + def binary2text(input, output, paraDim): """ Convert a binary parameter file of embedding model to be a text file. @@ -76,12 +76,13 @@ def binary2text(input, output, paraDim): fo.close() print "binary2text finish, total", line, "lines" + def get_para_count(input): """ Compute the total number of embedding parameters in input text file. 
input: the name of input text file """ - numRows = 1 + numRows = 1 paraDim = 0 with open(input) as f: line = f.readline() @@ -90,6 +91,7 @@ def get_para_count(input): numRows += 1 return numRows * paraDim + def text2binary(input, output, paddle_head=True): """ Convert a text parameter file of embedding model to be a binary file. @@ -123,6 +125,7 @@ def text2binary(input, output, paddle_head=True): fo.close() print "text2binary finish, total", count, "lines" + def main(): """ Main entry for running paraconvert.py @@ -131,21 +134,26 @@ def main(): "python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \ "python %prog --t2b -i INPUT -o OUTPUT" parser = OptionParser(usage) - parser.add_option("--b2t", action="store_true", - help="convert parameter file of embedding model from binary to text") - parser.add_option("--t2b", action="store_true", - help="convert parameter file of embedding model from text to binary") - parser.add_option("-i", action="store", dest="input", - help="input parameter file name") - parser.add_option("-o", action="store", dest="output", - help="output parameter file name") - parser.add_option("-d", action="store", dest="dim", - help="dimension of parameter") + parser.add_option( + "--b2t", + action="store_true", + help="convert parameter file of embedding model from binary to text") + parser.add_option( + "--t2b", + action="store_true", + help="convert parameter file of embedding model from text to binary") + parser.add_option( + "-i", action="store", dest="input", help="input parameter file name") + parser.add_option( + "-o", action="store", dest="output", help="output parameter file name") + parser.add_option( + "-d", action="store", dest="dim", help="dimension of parameter") (options, args) = parser.parse_args() if options.b2t: binary2text(options.input, options.output, options.dim) if options.t2b: text2binary(options.input, options.output) + if __name__ == '__main__': main() diff --git a/demo/model_zoo/resnet/classify.py b/demo/model_zoo/resnet/classify.py index 06d471722f8059804a59e6823bebccff85a8d542..7855126edcfec20de251e5bc08c08c7aab8f7a8e 100755 --- a/demo/model_zoo/resnet/classify.py +++ b/demo/model_zoo/resnet/classify.py @@ -26,16 +26,22 @@ from py_paddle import swig_paddle, DataProviderConverter from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.config_parser import parse_config -logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') +logging.basicConfig( + format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') logging.getLogger().setLevel(logging.INFO) + class ImageClassifier(): - def __init__(self, train_conf, model_dir=None, - resize_dim=256, crop_dim=224, + def __init__(self, + train_conf, + model_dir=None, + resize_dim=256, + crop_dim=224, use_gpu=True, mean_file=None, output_layer=None, - oversample=False, is_color=True): + oversample=False, + is_color=True): """ train_conf: network configure. model_dir: string, directory of model. 
@@ -62,24 +68,25 @@ class ImageClassifier(): assert isinstance(self.output_layer, basestring) self.output_layer = self.output_layer.split(",") - self.transformer = image_util.ImageTransformer(is_color = is_color) - self.transformer.set_transpose((2,0,1)) - self.transformer.set_channel_swap((2,1,0)) + self.transformer = image_util.ImageTransformer(is_color=is_color) + self.transformer.set_transpose((2, 0, 1)) + self.transformer.set_channel_swap((2, 1, 0)) self.mean_file = mean_file if self.mean_file is not None: mean = np.load(self.mean_file)['data_mean'] mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1]) - self.transformer.set_mean(mean) # mean pixel + self.transformer.set_mean(mean) # mean pixel else: # if you use three mean values, set like: # these three mean values are calculated from ImageNet. - self.transformer.set_mean(np.array([103.939,116.779,123.68])) + self.transformer.set_mean(np.array([103.939, 116.779, 123.68])) conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (int(use_gpu)) conf = parse_config(train_conf, conf_args) swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu))) - self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) + self.network = swig_paddle.GradientMachine.createFromConfigProto( + conf.model_config) assert isinstance(self.network, swig_paddle.GradientMachine) self.network.loadParameters(self.model_dir) @@ -105,14 +112,14 @@ class ImageClassifier(): # image_util.resize_image: short side is self.resize_dim image = image_util.resize_image(image, self.resize_dim) image = np.array(image) - input = np.zeros((1, image.shape[0], image.shape[1], 3), - dtype=np.float32) + input = np.zeros( + (1, image.shape[0], image.shape[1], 3), dtype=np.float32) input[0] = image.astype(np.float32) input = image_util.oversample(input, self.crop_dims) else: image = image.resize(self.crop_dims, Image.ANTIALIAS) - input = np.zeros((1, self.crop_dims[0], self.crop_dims[1], 3), - dtype=np.float32) + input = np.zeros( + (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32) input[0] = np.array(image).astype(np.float32) data_in = [] @@ -172,7 +179,7 @@ class ImageClassifier(): logging.info("Label of %s is: %d", image, lab[0]) return results - def extract(self, data_file, output_dir, batch_size = 10000): + def extract(self, data_file, output_dir, batch_size=10000): """ extract and save features of output layers, which are specified in Outputs() in the network config.
@@ -197,7 +204,7 @@ class ImageClassifier(): image_feature[file_name] = feature sample_num += 1 if sample_num == batch_size: - batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num)) + batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num)) self.save_file(image_feature, batch_name) logging.info('Finish batch %d', batch_num) batch_num += 1 @@ -206,7 +213,7 @@ if idx % 1000 == 0: logging.info('%d/%d, %s', idx, len(image_files), file_name) if sample_num > 0: - batch_name = os.path.join(output_dir, 'batch_%d' %(batch_num)) + batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num)) self.save_file(image_feature, batch_name) logging.info('Finish batch %d', batch_num) logging.info('Done: make image feature batch') @@ -215,38 +222,64 @@ of = open(file, 'wb') cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL) + def option_parser(): """ Main entry for predicting """ usage = "%prog -c config -i data_list -w model_dir [options]" parser = OptionParser(usage="usage: %s" % usage) - parser.add_option("-j", "--job", - action="store", dest="job_type", - help="job type: predict, extract\ predict: predicting,\ extract: extract features") - parser.add_option("-c", "--conf", - action="store", dest="train_conf", - help="network config") - parser.add_option("-i", "--data", - action="store", dest="data_file", - help="image list") - parser.add_option("-w", "--model", - action="store", dest="model_path", - default=None, help="model path") - parser.add_option("-g", "--use_gpu", action="store", - dest="use_gpu", default=True, - help="Whether to use gpu mode.") - parser.add_option("-o", "--output_dir", - action="store", dest="output_dir", - default="output", help="output path") - parser.add_option("-m", "--mean", action="store", - dest="mean", default=None, - help="mean file.") - parser.add_option("-p", "--multi_crop", action="store_true", - dest="multi_crop", default=False, - help="Wether to use multiple crops on image.") + parser.add_option( + "-j", + "--job", + action="store", + dest="job_type", + help="job type: predict, extract\ predict: predicting,\ extract: extract features") + parser.add_option( + "-c", + "--conf", + action="store", + dest="train_conf", + help="network config") + parser.add_option( + "-i", "--data", action="store", dest="data_file", help="image list") + parser.add_option( + "-w", + "--model", + action="store", + dest="model_path", + default=None, + help="model path") + parser.add_option( + "-g", + "--use_gpu", + action="store", + dest="use_gpu", + default=True, + help="Whether to use gpu mode.") + parser.add_option( + "-o", + "--output_dir", + action="store", + dest="output_dir", + default="output", + help="output path") + parser.add_option( + "-m", + "--mean", + action="store", + dest="mean", + default=None, + help="mean file.") + parser.add_option( + "-p", + "--multi_crop", + action="store_true", + dest="multi_crop", + default=False, + help="Whether to use multiple crops on image.") parser.add_option("-l", "--output_layer", action="store", dest="output_layer", default=None, help="--job=extract, specify layers to extract "\ @@ -254,24 +287,26 @@ def option_parser(): "classification probability, output in resnet.py.") return parser.parse_args() + def main(): """ 1. parse input arguments. 2. predict or extract features according to the job type.
""" options, args = option_parser() - obj = ImageClassifier(options.train_conf, - options.model_path, - use_gpu=options.use_gpu, - mean_file=options.mean, - output_layer=options.output_layer, - oversample=options.multi_crop) + obj = ImageClassifier( + options.train_conf, + options.model_path, + use_gpu=options.use_gpu, + mean_file=options.mean, + output_layer=options.output_layer, + oversample=options.multi_crop) if options.job_type == "predict": obj.predict(options.data_file) elif options.job_type == "extract": - obj.extract(options.data_file, - options.output_dir) + obj.extract(options.data_file, options.output_dir) + if __name__ == '__main__': main() diff --git a/demo/model_zoo/resnet/example/__init__.py b/demo/model_zoo/resnet/example/__init__.py index 7f9e87eee6037666b86420fba194624859d356b3..c90af2ee000d46a032984ee23559e7e99b49ddad 100644 --- a/demo/model_zoo/resnet/example/__init__.py +++ b/demo/model_zoo/resnet/example/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - diff --git a/demo/model_zoo/resnet/example/image_list_provider.py b/demo/model_zoo/resnet/example/image_list_provider.py index ee457e1fffc7ed8629dc6bde63a8047818c0ff9d..9e415f76a53326c5809b7a8c508701c519ab443b 100644 --- a/demo/model_zoo/resnet/example/image_list_provider.py +++ b/demo/model_zoo/resnet/example/image_list_provider.py @@ -16,8 +16,7 @@ from paddle.utils.image_util import * from paddle.trainer.PyDataProvider2 import * -def hook(settings, image_size, crop_size, color, file_list, - is_train, **kwargs): +def hook(settings, image_size, crop_size, color, file_list, is_train, **kwargs): """ Description: Init with a list of data file file_list is the name list of input files. 
@@ -58,7 +57,7 @@ def hook(settings, image_size, crop_size, color, file_list, sz = settings.crop_size * settings.crop_size settings.img_mean = np.zeros(sz * 3, dtype=np.single) for idx, value in enumerate(settings.mean_value): - settings.img_mean[idx * sz: (idx + 1) * sz] = value + settings.img_mean[idx * sz:(idx + 1) * sz] = value settings.img_mean = settings.img_mean.reshape(3, settings.crop_size, settings.crop_size) @@ -69,7 +68,8 @@ def hook(settings, image_size, crop_size, color, file_list, settings.input_types = [ dense_vector(settings.img_input_size), # image feature - integer_value(1)] # labels + integer_value(1) + ] # labels settings.logger.info('Image short side: %s', settings.img_size) settings.logger.info('Crop size: %s', settings.crop_size) @@ -97,9 +97,6 @@ def processData(settings, file_list): # swap channel if settings.is_swap_channel: img = img[settings.swap_channel, :, :] - img_feat = preprocess_img(img, - settings.img_mean, - settings.crop_size, - settings.is_train, - settings.color) + img_feat = preprocess_img(img, settings.img_mean, settings.crop_size, + settings.is_train, settings.color) yield img_feat.tolist(), int(lab.strip()) diff --git a/demo/model_zoo/resnet/load_feature.py b/demo/model_zoo/resnet/load_feature.py index ee4930b7a17f7f21ceeba8db253eed64416ebf10..b0948b75fd0ac9a3fa89070aed04d523ce286f4e 100644 --- a/demo/model_zoo/resnet/load_feature.py +++ b/demo/model_zoo/resnet/load_feature.py @@ -17,9 +17,11 @@ import sys import cPickle import logging -logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') +logging.basicConfig( + format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') logging.getLogger().setLevel(logging.INFO) + def load_feature_c(file): """ Load feature extracted by C++ interface. @@ -30,14 +32,15 @@ def load_feature_c(file): f = open(file, 'r') for line in f: sample = [] - for slot in line.strip().split(";"): - fea = [float(val) for val in slot.strip().split()] + for slot in line.strip().split(";"): + fea = [float(val) for val in slot.strip().split()] if fea: sample.append(fea) features.append(sample) f.close() return features + def load_feature_py(feature_dir): """ Load feature extracted by python interface. @@ -54,6 +57,7 @@ def load_feature_py(feature_dir): logging.info('Load feature file %s', file_name) return features + if __name__ == '__main__': - print load_feature_py(sys.argv[1]) + print load_feature_py(sys.argv[1]) #print load_feature_c(sys.argv[1]) diff --git a/demo/model_zoo/resnet/resnet.py b/demo/model_zoo/resnet/resnet.py index 483e308ac804e13ca249ef4e47e9e9b00770ce1b..015b74cd484596039b9fcf010576ca340d044db7 100644 --- a/demo/model_zoo/resnet/resnet.py +++ b/demo/model_zoo/resnet/resnet.py @@ -13,7 +13,6 @@ # limitations under the License. from paddle.trainer_config_helpers import * - """ paper: https://arxiv.org/abs/1512.03385 """ @@ -28,15 +27,19 @@ if not is_predict and data_provider: # mean.meta size : 3 x 224 x 224. 
# If you use three mean values, set like: # "mean_value:103.939,116.779,123.68;" - args={ + args = { 'mean_meta': "model/mean_meta_224/mean.meta", - 'image_size': 224, 'crop_size': 224, - 'color': True,'swap_channel:': [2, 1, 0]} - define_py_data_sources2(train_list, - 'example/test.list', - module="example.image_list_provider", - obj="processData", - args=args) + 'image_size': 224, + 'crop_size': 224, + 'color': True, + 'swap_channel': [2, 1, 0] + } + define_py_data_sources2( + train_list, + 'example/test.list', + module="example.image_list_provider", + obj="processData", + args=args) batch_size = 1 learning_rate = 0.1 / batch_size @@ -54,12 +57,16 @@ Settings( learning_method='momentum', learning_rate_decay_a=0.5, learning_rate_decay_b=1200000 * 10, - learning_rate_schedule="discexp", -) + learning_rate_schedule="discexp", ) -def conv_bn_layer(name, input, filter_size, num_filters, - stride, padding, channels=None, +def conv_bn_layer(name, + input, + filter_size, + num_filters, + stride, + padding, + channels=None, active_type=ReluActivation()): """ A wrapper for conv layer with batch normalization layers. conv layer has no activation. """ - tmp = img_conv_layer(name=name + "_conv", - input=input, - filter_size=filter_size, - num_channels=channels, - num_filters=num_filters, - stride=stride, - padding=padding, - act=LinearActivation(), - bias_attr=False) - return batch_norm_layer(name=name + "_bn", - input=tmp, - act=active_type, - use_global_stats=is_test) + tmp = img_conv_layer( + name=name + "_conv", + input=input, + filter_size=filter_size, + num_channels=channels, + num_filters=num_filters, + stride=stride, + padding=padding, + act=LinearActivation(), + bias_attr=False) + return batch_norm_layer( + name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test) def bottleneck_block(name, input, num_filters1, num_filters2): @@ -88,29 +94,31 @@ def bottleneck_block(name, input, num_filters1, num_filters2): Last conv_bn_layer has no activation. Addto layer has activation of relu. """ - last_name = conv_bn_layer(name=name + '_branch2a', - input=input, - filter_size=1, - num_filters=num_filters1, - stride=1, - padding=0) - last_name = conv_bn_layer(name=name + '_branch2b', - input=last_name, - filter_size=3, - num_filters=num_filters1, - stride=1, - padding=1) - last_name = conv_bn_layer(name=name + '_branch2c', - input=last_name, - filter_size=1, - num_filters=num_filters2, - stride=1, - padding=0, - active_type=LinearActivation()) - - return addto_layer(name=name + "_addto", - input=[input, last_name], - act=ReluActivation()) + last_name = conv_bn_layer( + name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=1, + padding=0) + last_name = conv_bn_layer( + name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + last_name = conv_bn_layer( + name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer( + name=name + "_addto", input=[input, last_name], act=ReluActivation()) def mid_projection(name, input, num_filters1, num_filters2, stride=2): @@ -123,38 +131,41 @@ def mid_projection(name, input, num_filters1, num_filters2, stride=2): branch2x: bottleneck building block, shortcuts are identity.
""" # stride = 2 - branch1 = conv_bn_layer(name=name + '_branch1', - input=input, - filter_size=1, - num_filters=num_filters2, - stride=stride, - padding=0, - active_type=LinearActivation()) - - last_name = conv_bn_layer(name=name + '_branch2a', - input=input, - filter_size=1, - num_filters=num_filters1, - stride=stride, - padding=0) - last_name = conv_bn_layer(name=name + '_branch2b', - input=last_name, - filter_size=3, - num_filters=num_filters1, - stride=1, - padding=1) - - last_name = conv_bn_layer(name=name + '_branch2c', - input=last_name, - filter_size=1, - num_filters=num_filters2, - stride=1, - padding=0, - active_type=LinearActivation()) - - return addto_layer(name=name + "_addto", - input=[branch1, last_name], - act=ReluActivation()) + branch1 = conv_bn_layer( + name=name + '_branch1', + input=input, + filter_size=1, + num_filters=num_filters2, + stride=stride, + padding=0, + active_type=LinearActivation()) + + last_name = conv_bn_layer( + name=name + '_branch2a', + input=input, + filter_size=1, + num_filters=num_filters1, + stride=stride, + padding=0) + last_name = conv_bn_layer( + name=name + '_branch2b', + input=last_name, + filter_size=3, + num_filters=num_filters1, + stride=1, + padding=1) + + last_name = conv_bn_layer( + name=name + '_branch2c', + input=last_name, + filter_size=1, + num_filters=num_filters2, + stride=1, + padding=0, + active_type=LinearActivation()) + + return addto_layer( + name=name + "_addto", input=[branch1, last_name], act=ReluActivation()) def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3): @@ -168,67 +179,67 @@ def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3): # For ImageNet # conv1: 112x112 img = data_layer(name='input', size=224 * 224 * 3) - tmp = conv_bn_layer("conv1", img, - filter_size=7, - channels=3, - num_filters=64, - stride=2, - padding=3) + tmp = conv_bn_layer( + "conv1", + img, + filter_size=7, + channels=3, + num_filters=64, + stride=2, + padding=3) tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2) # conv2_x: 56x56 - tmp = mid_projection(name="res2_1", - input=tmp, - num_filters1=64, - num_filters2=256, - stride=1) + tmp = mid_projection( + name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1) for i in xrange(2, res2_num + 1, 1): - tmp = bottleneck_block(name="res2_" + str(i), - input=tmp, - num_filters1=64, - num_filters2=256) + tmp = bottleneck_block( + name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256) # conv3_x: 28x28 - tmp = mid_projection(name="res3_1", - input=tmp, - num_filters1=128, - num_filters2=512) + tmp = mid_projection( + name="res3_1", input=tmp, num_filters1=128, num_filters2=512) for i in xrange(2, res3_num + 1, 1): - tmp = bottleneck_block(name="res3_" + str(i), - input=tmp, num_filters1=128, - num_filters2=512) + tmp = bottleneck_block( + name="res3_" + str(i), + input=tmp, + num_filters1=128, + num_filters2=512) # conv4_x: 14x14 - tmp = mid_projection(name="res4_1", input=tmp, - num_filters1=256, num_filters2=1024) + tmp = mid_projection( + name="res4_1", input=tmp, num_filters1=256, num_filters2=1024) for i in xrange(2, res4_num + 1, 1): - tmp = bottleneck_block(name="res4_" + str(i), - input=tmp, - num_filters1=256, - num_filters2=1024) + tmp = bottleneck_block( + name="res4_" + str(i), + input=tmp, + num_filters1=256, + num_filters2=1024) # conv5_x: 7x7 - tmp = mid_projection(name="res5_1", input=tmp, - num_filters1=512, num_filters2=2048) + tmp = mid_projection( + name="res5_1", input=tmp, num_filters1=512, num_filters2=2048) for 
i in xrange(2, res5_num + 1, 1): - tmp = bottleneck_block(name="res5_" + str(i), - input=tmp, num_filters1=512, - num_filters2=2048) - - tmp = img_pool_layer(name='avgpool', - input=tmp, - pool_size=7, - stride=1, - pool_type=AvgPooling()) - - output = fc_layer(name='output', - input=tmp, - size=1000, - act=SoftmaxActivation()) + tmp = bottleneck_block( + name="res5_" + str(i), + input=tmp, + num_filters1=512, + num_filters2=2048) + + tmp = img_pool_layer( + name='avgpool', + input=tmp, + pool_size=7, + stride=1, + pool_type=AvgPooling()) + + output = fc_layer( + name='output', input=tmp, size=1000, act=SoftmaxActivation()) if not is_predict: - classification_cost(input=output, label=data_layer(name='label', - size=1)) + classification_cost( + input=output, label=data_layer( + name='label', size=1)) def res_net_50(): diff --git a/demo/quick_start/api_train.py b/demo/quick_start/api_train.py new file mode 100644 index 0000000000000000000000000000000000000000..66cbb856484d231613a0026be129a7bc3a7cfdf5 --- /dev/null +++ b/demo/quick_start/api_train.py @@ -0,0 +1,122 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import itertools +import random + +from paddle.trainer.config_parser import parse_config +from py_paddle import swig_paddle as api +from py_paddle import DataProviderConverter +from paddle.trainer.PyDataProvider2 \ + import integer_value, integer_value_sequence, sparse_binary_vector + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--train_data", type=str, required=False, help="train data file") + parser.add_argument("--test_data", type=str, help="test data file") + parser.add_argument( + "--config", type=str, required=True, help="config file name") + parser.add_argument("--dict_file", required=True, help="dictionary file") + parser.add_argument( + "--seq", default=1, type=int, help="whether use sequence training") + parser.add_argument( + "--use_gpu", default=0, type=int, help="whether use GPU for training") + parser.add_argument( + "--trainer_count", + default=1, + type=int, + help="Number of threads for training") + parser.add_argument( + "--num_passes", default=5, type=int, help="Number of training passes") + return parser.parse_args() + + +UNK_IDX = 0 + + +def load_data(file_name, word_dict): + with open(file_name, 'r') as f: + for line in f: + label, comment = line.strip().split('\t') + words = comment.split() + word_slot = [word_dict.get(w, UNK_IDX) for w in words] + yield word_slot, int(label) + + +def load_dict(dict_file): + word_dict = dict() + with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + return word_dict + + +def main(): + options = parse_arguments() + api.initPaddle("--use_gpu=%s" % options.use_gpu, + "--trainer_count=%s" % options.trainer_count) + + word_dict = load_dict(options.dict_file) + train_dataset = list(load_data(options.train_data, word_dict)) + if options.test_data: + test_dataset 
= list(load_data(options.test_data, word_dict)) + else: + test_dataset = None + + trainer_config = parse_config(options.config, + "dict_file=%s" % options.dict_file) + # No need to have data provider for trainer + trainer_config.ClearField('data_config') + trainer_config.ClearField('test_data_config') + + # create a GradientMachine from the model configuration + model = api.GradientMachine.createFromConfigProto( + trainer_config.model_config) + # create a trainer for the gradient machine + trainer = api.Trainer.create(trainer_config, model) + + # create a data converter which converts data to PaddlePaddle + # internal format + input_types = [ + integer_value_sequence(len(word_dict)) if options.seq else + sparse_binary_vector(len(word_dict)), integer_value(2) + ] + converter = DataProviderConverter(input_types) + + batch_size = trainer_config.opt_config.batch_size + trainer.startTrain() + for train_pass in xrange(options.num_passes): + trainer.startTrainPass() + random.shuffle(train_dataset) + for pos in xrange(0, len(train_dataset), batch_size): + batch = itertools.islice(train_dataset, pos, pos + batch_size) + size = min(batch_size, len(train_dataset) - pos) + trainer.trainOneDataBatch(size, converter(batch)) + trainer.finishTrainPass() + if test_dataset: + trainer.startTestPeriod() + for pos in xrange(0, len(test_dataset), batch_size): + batch = itertools.islice(test_dataset, pos, pos + batch_size) + size = min(batch_size, len(test_dataset) - pos) + trainer.testOneDataBatch(size, converter(batch)) + trainer.finishTestPeriod() + trainer.finishTrain() + + +if __name__ == '__main__': + main() diff --git a/demo/quick_start/api_train.sh b/demo/quick_start/api_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..40e9d0a09aaa6b672d6b3997c67c07a5e8a8c3d8 --- /dev/null +++ b/demo/quick_start/api_train.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +# Note: if using trainer_config.emb.py, trainer_config.cnn.py +# or trainer_config.lstm.py, you need to change --seq to --seq=1 +# because they are sequence models. +python api_train.py \ + --config=trainer_config.lr.py \ + --trainer_count=2 \ + --num_passes=15 \ + --use_gpu=0 \ + --seq=0 \ + --train_data=data/train.txt \ + --test_data=data/test.txt \ + --dict_file=data/dict.txt \ + 2>&1 | tee 'train.log' diff --git a/demo/quick_start/dataprovider_bow.py b/demo/quick_start/dataprovider_bow.py index f8cde189cf87d73aec05da4b34e064cddecff56b..a5156a2d40cc04c02e50d676045ae6da8937ba01 100644 --- a/demo/quick_start/dataprovider_bow.py +++ b/demo/quick_start/dataprovider_bow.py @@ -17,6 +17,7 @@ from paddle.trainer.PyDataProvider2 import * # id of the word not in dictionary UNK_IDX = 0 + # initializer is called by the framework during initialization. # It allows the user to describe the data types and set up the # necessary data structure for later use.
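To make the provider contract concrete: under `input_types` of `sparse_binary_vector(len(dictionary))` plus `integer_value(2)`, a sample is just a list of active word indices and a label. An illustrative walk-through of one "label<TAB>comment" line, mirroring `load_data()` in `api_train.py` above; the toy dictionary and input line are made up:

```python
# Toy stand-ins: in the demo, the dictionary comes from data/dict.txt
# and UNK_IDX = 0 marks out-of-vocabulary words.
word_dict = {'the': 1, 'movie': 2, 'great': 3}
UNK_IDX = 0

line = "1\tthe movie is great"
label, comment = line.strip().split('\t')
# A sparse_binary_vector sample is the list of active word indices.
word_slot = [word_dict.get(w, UNK_IDX) for w in comment.split()]
print((word_slot, int(label)))  # ([1, 2, 0, 3], 1)
```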
@@ -38,7 +39,9 @@ def initializer(settings, dictionary, **kwargs): # The second input is an integer. It represents the category id of the # sample. 2 means there are two labels in the dataset. # (1 for positive and 0 for negative) - integer_value(2)] + integer_value(2) + ] + # Declaring a data provider. It has an initializer 'initializer'. # It will cache the generated data of the first pass in memory, so that @@ -69,9 +72,8 @@ def process(settings, file_name): def predict_initializer(settings, dictionary, **kwargs): settings.word_dict = dictionary - settings.input_types = [ - sparse_binary_vector(len(dictionary)) - ] + settings.input_types = [sparse_binary_vector(len(dictionary))] + # Declaring a data provider for prediction. The difference from process # is that label is not generated. diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py index ca940a89e54770eaf93b7c704a8d1274de2dc693..286f3f5c82081f1a6e02a26023969790792a78a3 100755 --- a/demo/quick_start/dataprovider_emb.py +++ b/demo/quick_start/dataprovider_emb.py @@ -16,6 +16,7 @@ from paddle.trainer.PyDataProvider2 import * UNK_IDX = 0 + def initializer(settings, dictionary, **kwargs): settings.word_dict = dictionary settings.input_types = [ @@ -23,7 +24,8 @@ def initializer(settings, dictionary, **kwargs): # The values of the integers range from 0 to len(dictionary)-1 integer_value_sequence(len(dictionary)), # Define the second input for label id - integer_value(2)] + integer_value(2) + ] @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) @@ -39,7 +41,8 @@ def process(settings, file_name): def predict_initializer(settings, dictionary, **kwargs): settings.word_dict = dictionary settings.input_types = [ - integer_value(len(dictionary), seq_type=SequenceType.SEQUENCE) + integer_value( + len(dictionary), seq_type=SequenceType.SEQUENCE) ] diff --git a/demo/quick_start/preprocess.py b/demo/quick_start/preprocess.py index 69fdbe44b5245bc2855847a1507e6eaed517eb96..d87fad632a7429f7d9682badabe4c72ca127354f 100755 --- a/demo/quick_start/preprocess.py +++ b/demo/quick_start/preprocess.py @@ -13,7 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ 1. (remove HTML before or not) tokenizing 2. pos sample : rating score 5; neg sample: rating score 1-2. @@ -35,7 +34,8 @@ import multiprocessing batch_size = 5000 word_count = {} -num_tokenize = max(1, multiprocessing.cpu_count() - 2) # parse + tokenize + save +num_tokenize = max(1, + multiprocessing.cpu_count() - 2) # parse + tokenize + save max_queue_size = 8 parse_queue = Queue(maxsize=max_queue_size + num_tokenize) tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize) diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh index fb2bee98beb268e88d82b64332273aa10399ff42..c9190e2dd2ef754bf3c7287006322b52493dc3a0 100755 --- a/demo/quick_start/preprocess.sh +++ b/demo/quick_start/preprocess.sh @@ -20,13 +20,22 @@ set -e +export LC_ALL=C +UNAME_STR=`uname` + +if [ ${UNAME_STR} == 'Linux' ]; then + SHUF_PROG='shuf' +else + SHUF_PROG='gshuf' +fi + mkdir -p data/tmp python preprocess.py -i data/reviews_Electronics_5.json.gz # uniq and shuffle cd data/tmp echo 'uniq and shuffle...'
-cat pos_*|sort|uniq|shuf> pos.shuffed -cat neg_*|sort|uniq|shuf> neg.shuffed +cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed +cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed min_len=`sed -n '$=' neg.shuffed` test_num=$((min_len/10)) @@ -40,8 +49,8 @@ head -n$train_num neg.shuffed >train.neg tail -n$test_num pos.shuffed >test.pos tail -n$test_num neg.shuffed >test.neg -cat train.pos train.neg|shuf>../train.txt -cat test.pos test.neg|shuf>../test.txt +cat train.pos train.neg | ${SHUF_PROG} >../train.txt +cat test.pos test.neg | ${SHUF_PROG} >../test.txt cd - echo 'data/train.txt' > data/train.list diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh index 1f0a137c8bd59498a327df4c0136314030bbaf7e..b3c471608c3248bfc714d5e44dd927f25dd23ea0 100755 --- a/demo/quick_start/train.sh +++ b/demo/quick_start/train.sh @@ -18,11 +18,14 @@ cfg=trainer_config.lr.py #cfg=trainer_config.emb.py #cfg=trainer_config.cnn.py #cfg=trainer_config.lstm.py +#cfg=trainer_config.bidi-lstm.py +#cfg=trainer_config.db-lstm.py +#cfg=trainer_config.resnet-lstm.py paddle train \ --config=$cfg \ --save_dir=./output \ --trainer_count=4 \ - --log_period=20 \ + --log_period=100 \ --num_passes=15 \ --use_gpu=false \ --show_parameter_stats_period=100 \ diff --git a/demo/quick_start/trainer_config.bidi-lstm.py b/demo/quick_start/trainer_config.bidi-lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..51deaf31f94681b6b61f98f798cef14a65ec92cb --- /dev/null +++ b/demo/quick_start/trainer_config.bidi-lstm.py @@ -0,0 +1,61 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' +define_py_data_sources2( + train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25) + +bias_attr = ParamAttr(initial_std=0., l2_rate=0.) 
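The ${SHUF_PROG} indirection in preprocess.sh above exists because OS X ships without GNU `shuf`; Homebrew's coreutils package installs it under the `g` prefix as `gshuf`. The same selection restated in Python, purely for illustration (the platform check is my assumption, not part of the diff):

# Hypothetical restatement of the shuf/gshuf switch from preprocess.sh.
import platform

SHUF_PROG = 'shuf' if platform.system() == 'Linux' else 'gshuf'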
+data = data_layer(name="word", size=len(word_dict)) +emb = embedding_layer(input=data, size=128) + +bi_lstm = bidirectional_lstm(input=emb, size=128) +dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) + +output = fc_layer( + input=dropout, size=2, bias_attr=bias_attr, act=SoftmaxActivation()) + +if is_predict: + maxid = maxid_layer(output) + outputs([maxid, output]) +else: + label = data_layer(name="label", size=2) + cls = classification_cost(input=output, label=label) + outputs(cls) diff --git a/demo/quick_start/trainer_config.cnn.py b/demo/quick_start/trainer_config.cnn.py index 253ec0aee26cf42226d79726a75aad6c61c77565..388efa75f903e0c7c803c99cd50d73a004133a67 100644 --- a/demo/quick_start/trainer_config.cnn.py +++ b/demo/quick_start/trainer_config.cnn.py @@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False) trn = 'data/train.list' if not is_predict else None tst = 'data/test.list' if not is_predict else 'data/pred.list' process = 'process' if not is_predict else 'process_predict' -define_py_data_sources2(train_list=trn, - test_list=tst, - module="dataprovider_emb", - obj=process, - args={"dictionary": word_dict}) +define_py_data_sources2( + train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) batch_size = 128 if not is_predict else 1 settings( @@ -39,8 +40,7 @@ settings( learning_rate=2e-3, learning_method=AdamOptimizer(), regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25 -) + gradient_clipping_threshold=25) data = data_layer(name="word", size=len(word_dict)) embedding = embedding_layer(input=data, size=128) diff --git a/demo/quick_start/trainer_config.db-lstm.py b/demo/quick_start/trainer_config.db-lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..02bc898d881efbd3bfaed95d45cd9e70ed046746 --- /dev/null +++ b/demo/quick_start/trainer_config.db-lstm.py @@ -0,0 +1,74 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' +define_py_data_sources2( + train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25) + +bias_attr = ParamAttr(initial_std=0., l2_rate=0.) 
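Each quick_start config above builds either the training graph or the prediction graph from the same file, keyed off the is_predict config argument. The shared tail, restated with comments (code as in trainer_config.bidi-lstm.py above):

# The train/predict toggle shared by the quick_start configs above.
if is_predict:
    maxid = maxid_layer(output)  # id of the most probable class
    outputs([maxid, output])     # expose label id plus probabilities
else:
    label = data_layer(name="label", size=2)
    cls = classification_cost(input=output, label=label)  # cross entropy
    outputs(cls)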
+
+data = data_layer(name="word", size=len(word_dict))
+emb = embedding_layer(input=data, size=128)
+
+hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
+lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
+
+input_layers = [hidden_0, lstm_0]
+
+for i in range(1, 8):
+    fc = fc_layer(input=input_layers, size=128)
+    lstm = lstmemory(
+        input=fc,
+        layer_attr=ExtraAttr(drop_rate=0.1),
+        reverse=(i % 2) == 1, )
+    input_layers = [fc, lstm]
+
+lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
+
+output = fc_layer(
+    input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
+
+if is_predict:
+    maxid = maxid_layer(output)
+    outputs([maxid, output])
+else:
+    label = data_layer(name="label", size=2)
+    cls = classification_cost(input=output, label=label)
+    outputs(cls)
diff --git a/demo/quick_start/trainer_config.emb.py b/demo/quick_start/trainer_config.emb.py
index 34dd7b96f2f142159472b98a09fd0092fac15e43..8fd18a7aac704e62b137845edb46cce5bc373285 100644
--- a/demo/quick_start/trainer_config.emb.py
+++ b/demo/quick_start/trainer_config.emb.py
@@ -27,18 +27,16 @@ is_predict = get_config_arg('is_predict', bool, False)
 trn = 'data/train.list' if not is_predict else None
 tst = 'data/test.list' if not is_predict else 'data/pred.list'
 process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(train_list=trn,
-                        test_list=tst,
-                        module="dataprovider_emb",
-                        obj=process,
-                        args={"dictionary": word_dict})
+define_py_data_sources2(
+    train_list=trn,
+    test_list=tst,
+    module="dataprovider_emb",
+    obj=process,
+    args={"dictionary": word_dict})
 
 batch_size = 128 if not is_predict else 1
 settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer()
-)
+    batch_size=batch_size, learning_rate=2e-3, learning_method=AdamOptimizer())
 
 data = data_layer(name="word", size=len(word_dict))
 embedding = embedding_layer(input=data, size=128)
diff --git a/demo/quick_start/trainer_config.lr.py b/demo/quick_start/trainer_config.lr.py
index 119e3849a4b7e01713bc983d83c000772a60b76d..b9c9441baac28a8a8f6078065b75664819d6cd04 100644
--- a/demo/quick_start/trainer_config.lr.py
+++ b/demo/quick_start/trainer_config.lr.py
@@ -16,7 +16,7 @@
 
 from paddle.trainer_config_helpers import *
 
-dict_file = "./data/dict.txt"
+dict_file = get_config_arg('dict_file', str, "./data/dict.txt")
 word_dict = dict()
 with open(dict_file, 'r') as f:
     for i, line in enumerate(f):
@@ -32,11 +32,12 @@ process = 'process' if not is_predict else 'process_predict'
 # We need to use different process for training and prediction.
 # For training, the input data includes both word IDs and labels.
 # For prediction, the input data only includes word IDs.
-define_py_data_sources2(train_list=trn,
-                        test_list=tst,
-                        module="dataprovider_bow",
-                        obj=process,
-                        args={"dictionary": word_dict})
+define_py_data_sources2(
+    train_list=trn,
+    test_list=tst,
+    module="dataprovider_bow",
+    obj=process,
+    args={"dictionary": word_dict})
 
 batch_size = 128 if not is_predict else 1
 settings(
@@ -44,8 +45,7 @@ settings(
     learning_rate=2e-3,
     learning_method=AdamOptimizer(),
     regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25
-)
+    gradient_clipping_threshold=25)
 
 # Define the data for text features. The size of the data layer is the number
 # of words in the dictionary.
@@ -63,7 +63,6 @@ if not is_predict:
     label = data_layer(name="label", size=2)
 
     # Define cross-entropy classification loss and error.
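trainer_config.db-lstm.py above stacks eight LSTM levels whose scan direction alternates, so every other level reads the sentence right-to-left and each level is fed both members of the previous fc/lstm pair. The loop, annotated (code otherwise verbatim from the diff):

# Alternating-direction stack from trainer_config.db-lstm.py above.
input_layers = [hidden_0, lstm_0]
for i in range(1, 8):
    fc = fc_layer(input=input_layers, size=128)  # mixes previous fc + lstm
    lstm = lstmemory(
        input=fc,
        layer_attr=ExtraAttr(drop_rate=0.1),
        reverse=(i % 2) == 1)  # odd levels scan right-to-left
    input_layers = [fc, lstm]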
- classification_cost(input=output, label=label) cls = classification_cost(input=output, label=label) outputs(cls) else: diff --git a/demo/quick_start/trainer_config.lstm.py b/demo/quick_start/trainer_config.lstm.py index ec8a2cb00abd19ef80c327ac564e91661ecc3928..8821e02d9bd4a0d06b8afa99df8e0fac3e2fcefe 100644 --- a/demo/quick_start/trainer_config.lstm.py +++ b/demo/quick_start/trainer_config.lstm.py @@ -27,11 +27,12 @@ is_predict = get_config_arg('is_predict', bool, False) trn = 'data/train.list' if not is_predict else None tst = 'data/test.list' if not is_predict else 'data/pred.list' process = 'process' if not is_predict else 'process_predict' -define_py_data_sources2(train_list=trn, - test_list=tst, - module="dataprovider_emb", - obj=process, - args={"dictionary": word_dict}) +define_py_data_sources2( + train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) batch_size = 128 if not is_predict else 1 settings( @@ -39,24 +40,14 @@ settings( learning_rate=2e-3, learning_method=AdamOptimizer(), regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25 -) - -bias_attr = ParamAttr(initial_std=0.,l2_rate=0.) + gradient_clipping_threshold=25) data = data_layer(name="word", size=len(word_dict)) emb = embedding_layer(input=data, size=128) -fc = fc_layer(input=emb, size=512, - act=LinearActivation(), - bias_attr=bias_attr, - layer_attr=ExtraAttr(drop_rate=0.1)) -lstm = lstmemory(input=fc, act=TanhActivation(), - bias_attr=bias_attr, - layer_attr=ExtraAttr(drop_rate=0.25)) -lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling()) -output = fc_layer(input=lstm_last, size=2, - bias_attr=bias_attr, - act=SoftmaxActivation()) +lstm = simple_lstm( + input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.25)) +lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling()) +output = fc_layer(input=lstm_max, size=2, act=SoftmaxActivation()) if is_predict: maxid = maxid_layer(output) outputs([maxid, output]) diff --git a/demo/quick_start/trainer_config.resnet-lstm.py b/demo/quick_start/trainer_config.resnet-lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..91e1581c386eb880d481b7352c4d21f3a5ef5c9a --- /dev/null +++ b/demo/quick_start/trainer_config.resnet-lstm.py @@ -0,0 +1,94 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This configuration is a demonstration of how to implement the stacked LSTM +with residual connections, i.e. an LSTM layer takes the sum of the hidden states +and inputs of the previous LSTM layer instead of only the hidden states. +This architecture is from: +Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. 
Le, Mohammad Norouzi,
+Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
+Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
+Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
+George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa,
+Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, Jeffrey Dean. 2016.
+Google's Neural Machine Translation System: Bridging the Gap between Human and
+Machine Translation. In arXiv https://arxiv.org/pdf/1609.08144v2.pdf
+Different from the architecture described in the paper, we use a stack of
+single-direction LSTM layers as the first layer instead of a bi-directional
+LSTM. Also, since this is demo code, we stack 4 layers instead of 8 to reduce
+computation time.
+"""
+
+from paddle.trainer_config_helpers import *
+
+dict_file = "./data/dict.txt"
+word_dict = dict()
+with open(dict_file, 'r') as f:
+    for i, line in enumerate(f):
+        w = line.strip().split()[0]
+        word_dict[w] = i
+
+is_predict = get_config_arg('is_predict', bool, False)
+trn = 'data/train.list' if not is_predict else None
+tst = 'data/test.list' if not is_predict else 'data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(train_list=trn,
+                        test_list=tst,
+                        module="dataprovider_emb",
+                        obj=process,
+                        args={"dictionary": word_dict})
+
+batch_size = 128 if not is_predict else 1
+settings(
+    batch_size=batch_size,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25
+)
+
+bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+
+data = data_layer(name="word", size=len(word_dict))
+emb = embedding_layer(input=data, size=128)
+lstm = simple_lstm(input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
+
+previous_input, previous_hidden_state = emb, lstm
+
+for i in range(3):
+    # The input to the current layer is the sum of the hidden state
+    # and input of the previous layer.
+ current_input = addto_layer(input=[previous_input, previous_hidden_state]) + hidden_state = simple_lstm(input=current_input, size=128, + lstm_cell_attr=ExtraAttr(drop_rate=0.1)) + previous_input, previous_hidden_state = current_input, hidden_state + +lstm = previous_hidden_state + +lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling()) +output = fc_layer(input=lstm_last, size=2, + bias_attr=bias_attr, + act=SoftmaxActivation()) + + +if is_predict: + maxid = maxid_layer(output) + outputs([maxid, output]) +else: + label = data_layer(name="label", size=2) + cls = classification_cost(input=output, label=label) + outputs(cls) diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py index a5f00b3ef9ca00b42b8e31ddd6cfeca3580152b0..613e36b496e47edbc0eabd8f15a0abdcb50f6424 100755 --- a/demo/recommendation/common_utils.py +++ b/demo/recommendation/common_utils.py @@ -21,8 +21,9 @@ def meta_to_header(meta, name): yield integer_value(each_meta['max']) elif each_meta['type'] == 'embedding': is_seq = each_meta['seq'] == 'sequence' - yield integer_value(len(each_meta['dict']), - seq_type=SequenceType.SEQUENCE if is_seq - else SequenceType.NO_SEQUENCE) + yield integer_value( + len(each_meta['dict']), + seq_type=SequenceType.SEQUENCE + if is_seq else SequenceType.NO_SEQUENCE) elif each_meta['type'] == 'one_hot_dense': yield dense_vector(len(each_meta['dict'])) diff --git a/demo/recommendation/data/config.json b/demo/recommendation/data/config.json index 71a9dd7be6bd10e177dfb443a94b719c3816d833..f26e74ce47bb7843a571e6033f051c046b31f054 100644 --- a/demo/recommendation/data/config.json +++ b/demo/recommendation/data/config.json @@ -14,4 +14,3 @@ "fields": ["id", "title", "genres"] } } - diff --git a/demo/recommendation/data/config_generator.py b/demo/recommendation/data/config_generator.py index 29f38082693ad890ac4dfa302399663af8dbd27b..fa605458300f81da6772d88cfbad413e4dcf97fe 100644 --- a/demo/recommendation/data/config_generator.py +++ b/demo/recommendation/data/config_generator.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ config_generator.py @@ -29,10 +28,7 @@ import json import docopt import copy -DEFAULT_FILE = { - "type": "split", - "delimiter": "," -} +DEFAULT_FILE = {"type": "split", "delimiter": ","} DEFAULT_FIELD = { "id": { @@ -107,19 +103,16 @@ def main(filename, fmt): field = copy.deepcopy(DEFAULT_FIELD[field_key]) field['pos'] = pos fields.append(field) - obj[k] = { - "file": file_dict, - "fields": fields - } - meta = { - "meta": obj - } + obj[k] = {"file": file_dict, "fields": fields} + meta = {"meta": obj} # print meta if fmt == 'json': + def formatter(x): import json return json.dumps(x, indent=2) elif fmt == 'yaml': + def formatter(x): import yaml return yaml.safe_dump(x, default_flow_style=False) diff --git a/demo/recommendation/data/meta_generator.py b/demo/recommendation/data/meta_generator.py index 8d1a33d02aea112e51f1d43bedc06fdcee1186f5..593c863670d5eb5d684adf643ff745f3914b656b 100644 --- a/demo/recommendation/data/meta_generator.py +++ b/demo/recommendation/data/meta_generator.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Preprocess Movielens dataset, to get movie/user object. 
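The loop at the heart of trainer_config.resnet-lstm.py above implements the residual connection described in its docstring: each LSTM receives the elementwise sum of the previous layer's input and hidden state, not the hidden state alone. An annotated restatement (code otherwise as in the diff):

# Residual stacking from trainer_config.resnet-lstm.py above.
previous_input, previous_hidden_state = emb, lstm
for i in range(3):
    # residual connection: sum of previous input and hidden state
    current_input = addto_layer(input=[previous_input, previous_hidden_state])
    hidden_state = simple_lstm(
        input=current_input, size=128,
        lstm_cell_attr=ExtraAttr(drop_rate=0.1))
    previous_input, previous_hidden_state = current_input, hidden_state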
@@ -66,8 +65,8 @@ class SortedIDGenerator(object): self.__key_set__.add(key) def finish_scan(self, compare=None, key=None, reverse=False): - self.__key_set__ = sorted(list(self.__key_set__), cmp=compare, - key=key, reverse=reverse) + self.__key_set__ = sorted( + list(self.__key_set__), cmp=compare, key=key, reverse=reverse) self.dict = dict() for idx, each_key in enumerate(self.__key_set__): self.dict[each_key] = idx @@ -207,11 +206,10 @@ class EmbeddingFieldParser(object): self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict( self.seq_type == EmbeddingFieldParser.SEQUENCE) elif config['dict']['type'] == 'split': - self.dict = SplitEmbeddingDict( - config['dict'].get('delimiter', ',')) + self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ',')) elif config['dict']['type'] == 'whole_content': - self.dict = EmbeddingFieldParser.WholeContentDict( - config['dict']['sort']) + self.dict = EmbeddingFieldParser.WholeContentDict(config['dict'][ + 'sort']) else: print config assert False @@ -333,8 +331,8 @@ class ContentExtractorFactory(object): return PositionContentExtractor(config['pos']) else: extra_args = config['regex'] - return RegexPositionContentExtractor(pos=config['pos'], - **extra_args) + return RegexPositionContentExtractor( + pos=config['pos'], **extra_args) class MetaFile(object): @@ -364,9 +362,10 @@ class MetaFile(object): metas = map(lambda x: x.meta_field(), field_parsers) # print metas - key_index = filter(lambda x: x is not None, map( - lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] - else None, enumerate(metas)))[0] + key_index = filter( + lambda x: x is not None, + map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None, + enumerate(metas)))[0] key_map = [] for i in range(min(key_index, len(metas))): @@ -374,12 +373,7 @@ class MetaFile(object): for i in range(key_index + 1, len(metas)): key_map.append(i) - obj = { - '__meta__': { - 'raw_meta': metas, - 'feature_map': key_map - } - } + obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}} for each_block in reader.read(): idx = field_parsers[key_index].parse(each_block) diff --git a/demo/recommendation/data/split.py b/demo/recommendation/data/split.py index ff1f7fab7befdb5bdfa39fd0f1753e6804e82d8f..8dd0cbd32af6074439e98dac024c5fed76cd52b2 100644 --- a/demo/recommendation/data/split.py +++ b/demo/recommendation/data/split.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Separate movielens 1m dataset to train/test file. diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py index 454467f40b44bb526d143934c4a7350d41e54c0e..ff3932be03f1e4a1fc1d0bdb189ab7fe1fbbeca0 100755 --- a/demo/recommendation/dataprovider.py +++ b/demo/recommendation/dataprovider.py @@ -15,6 +15,7 @@ from paddle.trainer.PyDataProvider2 import * import common_utils # parse + def hook(settings, meta, **kwargs): """ Init hook is invoked before process data. 
It will set obj.slots and store @@ -41,6 +42,7 @@ def hook(settings, meta, **kwargs): settings.input_types = headers settings.meta = meta + @provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, filename): with open(filename, 'r') as f: diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py index f8044a3195ec25bc2fa7c9079e4977f971059352..e2a202cfd1a476046d7e1d1896b87d72c4906ff2 100755 --- a/demo/recommendation/prediction.py +++ b/demo/recommendation/prediction.py @@ -28,7 +28,8 @@ if __name__ == '__main__': model_path = sys.argv[1] swig_paddle.initPaddle('--use_gpu=0') conf = parse_config("trainer_config.py", "is_predict=1") - network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) + network = swig_paddle.GradientMachine.createFromConfigProto( + conf.model_config) assert isinstance(network, swig_paddle.GradientMachine) network.loadParameters(model_path) with open('./data/meta.bin', 'rb') as f: @@ -39,11 +40,12 @@ if __name__ == '__main__': while True: movie_id = int(raw_input("Input movie_id: ")) user_id = int(raw_input("Input user_id: ")) - movie_meta = meta['movie'][movie_id] # Query Data From Meta. + movie_meta = meta['movie'][movie_id] # Query Data From Meta. user_meta = meta['user'][user_id] data = [movie_id - 1] data.extend(movie_meta) data.append(user_id - 1) data.extend(user_meta) - print "Prediction Score is %.2f" % ((network.forwardTest( - cvt.convert([data]))[0]['value'][0][0] + 5) / 2) + print "Prediction Score is %.2f" % ( + (network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 5) + / 2) diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py index 624c22ec969dc98808863ad53573b9633f1791ac..cec340b0b65a841029a1c0538d9881bb38f026ff 100755 --- a/demo/recommendation/trainer_config.py +++ b/demo/recommendation/trainer_config.py @@ -27,8 +27,8 @@ with open(META_FILE, 'rb') as f: # load meta file meta = pickle.load(f) -settings(batch_size=1600, learning_rate=1e-3, - learning_method=RMSPropOptimizer()) +settings( + batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer()) def construct_feature(name): @@ -59,11 +59,10 @@ def construct_feature(name): slot_name = each_meta.get('name', '%s_id' % name) if type_name == 'id': slot_dim = each_meta['max'] - embedding = embedding_layer(input=data_layer(slot_name, - size=slot_dim), - size=256) - fusion.append(fc_layer(input=embedding, - size=256)) + embedding = embedding_layer( + input=data_layer( + slot_name, size=slot_dim), size=256) + fusion.append(fc_layer(input=embedding, size=256)) elif type_name == 'embedding': is_seq = each_meta['seq'] == 'sequence' slot_dim = len(each_meta['dict']) @@ -71,17 +70,14 @@ def construct_feature(name): embedding = embedding_layer(input=din, size=256) if is_seq: fusion.append( - text_conv_pool(input=embedding, context_len=5, - hidden_size=256)) + text_conv_pool( + input=embedding, context_len=5, hidden_size=256)) else: - fusion.append(fc_layer(input=embedding, - size=256)) + fusion.append(fc_layer(input=embedding, size=256)) elif type_name == 'one_hot_dense': slot_dim = len(each_meta['dict']) - hidden = fc_layer(input=data_layer(slot_name, slot_dim), - size=256) - fusion.append(fc_layer(input=hidden, - size=256)) + hidden = fc_layer(input=data_layer(slot_name, slot_dim), size=256) + fusion.append(fc_layer(input=hidden, size=256)) return fc_layer(name="%s_fusion" % name, input=fusion, size=256) @@ -90,10 +86,16 @@ movie_feature = construct_feature("movie") user_feature = 
construct_feature("user") similarity = cos_sim(a=movie_feature, b=user_feature) if not is_predict: - outputs(regression_cost(input=similarity, - label=data_layer('rating', size=1))) - - define_py_data_sources2('data/train.list', 'data/test.list', module='dataprovider', - obj='process', args={'meta': meta}) + outputs( + regression_cost( + input=similarity, label=data_layer( + 'rating', size=1))) + + define_py_data_sources2( + 'data/train.list', + 'data/test.list', + module='dataprovider', + obj='process', + args={'meta': meta}) else: outputs(similarity) diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..cd90ca7bbe9be46f54cb656a8067c794a55d8cfc --- /dev/null +++ b/demo/semantic_role_labeling/.gitignore @@ -0,0 +1,10 @@ +*.pyc +train.log +data/feature +data/conll05st-release/ +data/src.dict +data/test.wsj.props +data/test.wsj.seq_pair +data/test.wsj.words +data/tgt.dict +output diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py index 2982e54c665b41400aab0a893ff3c76335404988..daca5f01cf2b3bd231bf530f17ec760272ce93e0 100644 --- a/demo/semantic_role_labeling/data/extract_dict_feature.py +++ b/demo/semantic_role_labeling/data/extract_dict_feature.py @@ -17,24 +17,15 @@ import os from optparse import OptionParser -def extract_dict_features(pair_file, feature_file, src_dict_file, - tgt_dict_file): - src_dict = set() - tgt_dict = set() - - with open(pair_file) as fin, open(feature_file, 'w') as feature_out, open( - src_dict_file, 'w') as src_dict_out, open(tgt_dict_file, - 'w') as tgt_dict_out: +def extract_dict_features(pair_file, feature_file): + + with open(pair_file) as fin, open(feature_file, 'w') as feature_out: for line in fin: - sentence, labels = line.strip().split('\t') + sentence, predicate, labels = line.strip().split('\t') sentence_list = sentence.split() labels_list = labels.split() - src_dict.update(sentence_list) - tgt_dict.update(labels_list) - verb_index = labels_list.index('B-V') - verb_feature = sentence_list[verb_index] mark = [0] * len(labels_list) if verb_index > 0: @@ -42,47 +33,50 @@ def extract_dict_features(pair_file, feature_file, src_dict_file, ctx_n1 = sentence_list[verb_index - 1] else: ctx_n1 = 'bos' - ctx_n1_feature = ctx_n1 + + if verb_index > 1: + mark[verb_index - 2] = 1 + ctx_n2 = sentence_list[verb_index - 2] + else: + ctx_n2 = 'bos' mark[verb_index] = 1 - ctx_0_feature = sentence_list[verb_index] + ctx_0 = sentence_list[verb_index] if verb_index < len(labels_list) - 2: mark[verb_index + 1] = 1 ctx_p1 = sentence_list[verb_index + 1] else: ctx_p1 = 'eos' - ctx_p1_feature = ctx_p1 + + if verb_index < len(labels_list) - 3: + mark[verb_index + 2] = 1 + ctx_p2 = sentence_list[verb_index + 2] + else: + ctx_p2 = 'eos' + feature_str = sentence + '\t' \ - + verb_feature + '\t' \ - + ctx_n1_feature + '\t' \ - + ctx_0_feature + '\t' \ - + ctx_p1_feature + '\t' \ + + predicate + '\t' \ + + ctx_n2 + '\t' \ + + ctx_n1 + '\t' \ + + ctx_0 + '\t' \ + + ctx_p1 + '\t' \ + + ctx_p2 + '\t' \ + ' '.join([str(i) for i in mark]) + '\t' \ + labels feature_out.write(feature_str + '\n') - src_dict_out.write('\n') - src_dict_out.write('\n'.join(list(src_dict))) - - tgt_dict_out.write('\n'.join(list(tgt_dict))) if __name__ == '__main__': - usage = '-p pair_file -f feature_file -s source dictionary -t target dictionary ' + usage = '-p pair_file -f feature_file' parser = OptionParser(usage) 
parser.add_option('-p', dest='pair_file', help='the pair file') - parser.add_option( - '-f', dest='feature_file', help='the file to store feature') - parser.add_option( - '-s', dest='src_dict', help='the file to store source dictionary') - parser.add_option( - '-t', dest='tgt_dict', help='the file to store target dictionary') + parser.add_option('-f', dest='feature_file', help='the feature file') (options, args) = parser.parse_args() - extract_dict_features(options.pair_file, options.feature_file, - options.src_dict, options.tgt_dict) + extract_dict_features(options.pair_file, options.feature_file) diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py index 4d1bef8f958a62be9941d474a0b67542dcc5cfab..86ab00ce41723169de035a841d9e129a1b9e82a3 100644 --- a/demo/semantic_role_labeling/data/extract_pairs.py +++ b/demo/semantic_role_labeling/data/extract_pairs.py @@ -51,7 +51,7 @@ def read_sentences(words_file): for line in fin: line = line.strip() if line == '': - sentences.append(s.lower()) + sentences.append(s) s = '' else: s += line + ' ' @@ -64,6 +64,11 @@ def transform_labels(sentences, labels): if len(labels[i]) == 1: continue else: + verb_list = [] + for x in labels[i][0]: + if x !='-': + verb_list.append(x) + for j in xrange(1, len(labels[i])): label_list = labels[i][j] current_tag = 'O' @@ -88,8 +93,7 @@ def transform_labels(sentences, labels): is_in_bracket = True else: print 'error:', ll - - sen_lab_pair.append((sentences[i], label_seq)) + sen_lab_pair.append((sentences[i], verb_list[j-1], label_seq)) return sen_lab_pair @@ -97,9 +101,9 @@ def write_file(sen_lab_pair, output_file): with open(output_file, 'w') as fout: for x in sen_lab_pair: sentence = x[0] - label_seq = ' '.join(x[1]) - assert len(sentence.split()) == len(x[1]) - fout.write(sentence + '\t' + label_seq + '\n') + label_seq = ' '.join(x[2]) + assert len(sentence.split()) == len(x[2]) + fout.write(sentence + '\t' + x[1]+'\t' +label_seq + '\n') if __name__ == '__main__': diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh index 268c0995e27006ec62f38bdda9b0a0994dab096c..55e33f4685627ed483aa6642c518a33558091531 100644 --- a/demo/semantic_role_labeling/data/get_data.sh +++ b/demo/semantic_role_labeling/data/get_data.sh @@ -14,6 +14,10 @@ # limitations under the License. set -e wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz +wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate +wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate +wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate +wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate tar -xzvf conll05st-tests.tar.gz rm conll05st-tests.tar.gz cp ./conll05st-release/test.wsj/words/test.wsj.words.gz . 
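extract_dict_feature.py above widens the predicate context window from one to two words on each side, padding with 'bos'/'eos' at sentence boundaries and emitting a 0/1 mark vector over the window. A worked example of the features it produces (tokens are illustrative):

# Worked example for extract_dict_feature.py above: predicate at index 3
# of a 7-token sentence.
sentence = ['Yesterday', 'the', 'cat', 'chased', 'the', 'gray', 'mouse']
verb_index = 3
ctx_n2, ctx_n1 = sentence[1], sentence[2]  # 'the', 'cat'
ctx_0 = sentence[3]                        # 'chased'
ctx_p1, ctx_p2 = sentence[4], sentence[5]  # 'the', 'gray'
mark = [0, 1, 1, 1, 1, 1, 0]               # 1 inside the +-2 window
# At a sentence edge the missing neighbours become 'bos' / 'eos'.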
@@ -22,4 +26,4 @@ gunzip test.wsj.words.gz
 gunzip test.wsj.props.gz
 
 python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair
-python extract_dict_feature.py -p test.wsj.seq_pair -f feature -s src.dict -t tgt.dict
+python extract_dict_feature.py -p test.wsj.seq_pair -f feature
diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py
index 2ef25c42c1794c410fe85fd497a6ed9d2295dca9..d4c137ef42c4e2ec609f3e6f809363e602dfd8dd 100644
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -17,41 +17,51 @@ from paddle.trainer.PyDataProvider2 import *
 UNK_IDX = 0
 
 
-def hook(settings, word_dict, label_dict, **kwargs):
+def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
     settings.word_dict = word_dict
     settings.label_dict = label_dict
+    settings.predicate_dict = predicate_dict
+
+    # all inputs are integral and sequential type
     settings.slots = [
         integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(predicate_dict)),
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
-        integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))]
+        integer_value_sequence(len(word_dict)), integer_value_sequence(2),
+        integer_value_sequence(len(label_dict))
+    ]
 
 
-@provider(init_hook=hook)
-def process(obj, file_name):
+def get_batch_size(yield_data):
+    return len(yield_data[0])
+
+
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
     with open(file_name, 'r') as fdata:
         for line in fdata:
-            sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = \
+            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
                 line.strip().split('\t')
-
+
             words = sentence.split()
             sen_len = len(words)
-            word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
+            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
 
-            predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
-            ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
 
             marks = mark.split()
             mark_slot = [int(w) for w in marks]
 
             label_list = label.split()
-            label_slot = [obj.label_dict.get(w) for w in label_list]
-
-            yield word_slot, predicate_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
+            label_slot = [settings.label_dict.get(w) for w in label_list]
+            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py
index 364460afbe31caf42cd4f0836eba75e444b3f5b8..54ceff0e724220cc9ea96b9e0ec6844947a8343e 100644
--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
@@ -12,15 +12,15 @@
 # See the License for the specific language
governing permissions and # limitations under the License. - import math import os import sys from paddle.trainer_config_helpers import * #file paths -word_dict_file = './data/src.dict' -label_dict_file = './data/tgt.dict' +word_dict_file = './data/wordDict.txt' +label_dict_file = './data/targetDict.txt' +predicate_file= './data/verbDict.txt' train_list_file = './data/train.list' test_list_file = './data/test.list' @@ -31,8 +31,10 @@ if not is_predict: #load dictionaries word_dict = dict() label_dict = dict() + predicate_dict = dict() with open(word_dict_file, 'r') as f_word, \ - open(label_dict_file, 'r') as f_label: + open(label_dict_file, 'r') as f_label, \ + open(predicate_file, 'r') as f_pre: for i, line in enumerate(f_word): w = line.strip() word_dict[w] = i @@ -41,8 +43,13 @@ if not is_predict: w = line.strip() label_dict[w] = i + for i, line in enumerate(f_pre): + w = line.strip() + predicate_dict[w] = i + + if is_test: - train_list_file = None + train_list_file = None #define data provider define_py_data_sources2( @@ -51,91 +58,157 @@ if not is_predict: module='dataprovider', obj='process', args={'word_dict': word_dict, - 'label_dict': label_dict}) + 'label_dict': label_dict, + 'predicate_dict': predicate_dict }) word_dict_len = len(word_dict) label_dict_len = len(label_dict) + pred_len = len(predicate_dict) else: word_dict_len = get_config_arg('dict_len', int) label_dict_len = get_config_arg('label_len', int) + pred_len = get_config_arg('pred_len', int) +############################## Hyper-parameters ################################## mark_dict_len = 2 word_dim = 32 mark_dim = 5 -hidden_dim = 128 +hidden_dim = 512 depth = 8 -emb_lr = 1e-2 -fc_lr = 1e-2 -lstm_lr = 2e-2 + + + +########################### Optimizer ####################################### + settings( batch_size=150, - learning_method=AdamOptimizer(), - learning_rate=1e-3, + learning_method=MomentumOptimizer(momentum=0), + learning_rate=2e-2, regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) + is_async=False, + model_average=ModelAverage(average_window=0.5, + max_average_window=10000), + +) -#6 features + + + +####################################### network ############################## +#8 features and 1 target word = data_layer(name='word_data', size=word_dict_len) -predicate = data_layer(name='verb_data', size=word_dict_len) +predicate = data_layer(name='verb_data', size=pred_len) + +ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len) ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len) ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len) ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len) +ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len) mark = data_layer(name='mark_data', size=mark_dict_len) + if not is_predict: target = data_layer(name='target', size=label_dict_len) -ptt = ParameterAttribute(name='src_emb', learning_rate=emb_lr) -layer_attr = ExtraLayerAttribute(drop_rate=0.5) -fc_para_attr = ParameterAttribute(learning_rate=fc_lr) -lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=lstm_lr) -para_attr = [fc_para_attr, lstm_para_attr] -word_embedding = embedding_layer(size=word_dim, input=word, param_attr=ptt) -predicate_embedding = embedding_layer( - size=word_dim, input=predicate, param_attr=ptt) -ctx_n1_embedding = embedding_layer(size=word_dim, input=ctx_n1, param_attr=ptt) -ctx_0_embedding = embedding_layer(size=word_dim, input=ctx_0, param_attr=ptt) -ctx_p1_embedding = embedding_layer(size=word_dim, input=ctx_p1, param_attr=ptt) 
-mark_embedding = embedding_layer(size=mark_dim, input=mark) +default_std=1/math.sqrt(hidden_dim)/3.0 + +emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.) +std_0 = ParameterAttribute(initial_std=0.) +std_default = ParameterAttribute(initial_std=default_std) + +predicate_embedding = embedding_layer(size=word_dim, input=predicate, param_attr=ParameterAttribute(name='vemb',initial_std=default_std)) +mark_embedding = embedding_layer(name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0) + +word_input=[word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] +emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para) for x in word_input] +emb_layers.append(predicate_embedding) +emb_layers.append(mark_embedding) hidden_0 = mixed_layer( + name='hidden0', size=hidden_dim, - input=[ - full_matrix_projection(input=word_embedding), - full_matrix_projection(input=predicate_embedding), - full_matrix_projection(input=ctx_n1_embedding), - full_matrix_projection(input=ctx_0_embedding), - full_matrix_projection(input=ctx_p1_embedding), - full_matrix_projection(input=mark_embedding), - ]) + bias_attr=std_default, + input=[ full_matrix_projection(input=emb, param_attr=std_default ) for emb in emb_layers ]) + -lstm_0 = lstmemory(input=hidden_0, layer_attr=layer_attr) +mix_hidden_lr = 1e-3 +lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0) +hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr) + +lstm_0 = lstmemory(name='lstm0', + input=hidden_0, + act=ReluActivation(), + gate_act=SigmoidActivation(), + state_act=SigmoidActivation(), + bias_attr=std_0, + param_attr=lstm_para_attr) #stack L-LSTM and R-LSTM with direct edges input_tmp = [hidden_0, lstm_0] + for i in range(1, depth): - fc = fc_layer(input=input_tmp, size=hidden_dim, param_attr=para_attr) + mix_hidden = mixed_layer(name='hidden'+str(i), + size=hidden_dim, + bias_attr=std_default, + input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr), + full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr) + ] + ) + + lstm = lstmemory(name='lstm'+str(i), + input=mix_hidden, + act=ReluActivation(), + gate_act=SigmoidActivation(), + state_act=SigmoidActivation(), + reverse=((i % 2)==1), + bias_attr=std_0, + param_attr=lstm_para_attr) + + input_tmp = [mix_hidden, lstm] + +feature_out = mixed_layer(name='output', + size=label_dict_len, + bias_attr=std_default, + input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr), + full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr) + ], + ) - lstm = lstmemory( - input=fc, - act=ReluActivation(), - reverse=(i % 2) == 1, - layer_attr=layer_attr) - input_tmp = [fc, lstm] -prob = fc_layer( - input=input_tmp, - size=label_dict_len, - act=SoftmaxActivation(), - param_attr=para_attr) if not is_predict: - cls = classification_cost(input=prob, label=target) - outputs(cls) + crf_l = crf_layer( name = 'crf', + size = label_dict_len, + input = feature_out, + label = target, + param_attr=ParameterAttribute(name='crfw',initial_std=default_std, learning_rate=mix_hidden_lr) + + ) + + + crf_dec_l = crf_decoding_layer(name = 'crf_dec_l', + size = label_dict_len, + input = feature_out, + label = target, + param_attr=ParameterAttribute(name='crfw') + ) + + + eval = sum_evaluator(input=crf_dec_l) + + outputs(crf_l) + else: - outputs(prob) + crf_dec_l = crf_decoding_layer(name = 'crf_dec_l', + size = label_dict_len, + input = feature_out, + 
param_attr=ParameterAttribute(name='crfw') + ) + + outputs(crf_dec_l) + diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py index 9a27112828e449174e3da79dc7db9fed20bfed6f..2761814e1811e701122e0be4850526c5b290c457 100644 --- a/demo/semantic_role_labeling/predict.py +++ b/demo/semantic_role_labeling/predict.py @@ -26,7 +26,7 @@ UNK_IDX = 0 class Prediction(): - def __init__(self, train_conf, dict_file, model_dir, label_file): + def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file): """ train_conf: trainer configure. dict_file: word dictionary file name. @@ -35,16 +35,19 @@ class Prediction(): self.dict = {} self.labels = {} + self.predicate_dict={} self.labels_reverse = {} - self.load_dict_label(dict_file, label_file) + self.load_dict_label(dict_file, label_file, predicate_dict_file) len_dict = len(self.dict) len_label = len(self.labels) + len_pred = len(self.predicate_dict) conf = parse_config( train_conf, - 'dict_len=' + str(len_dict) + + 'dict_len=' + str(len_dict) + ',label_len=' + str(len_label) + + ',pred_len=' + str(len_pred) + ',is_predict=True') self.network = swig_paddle.GradientMachine.createFromConfigProto( conf.model_config) @@ -52,15 +55,21 @@ class Prediction(): slots = [ integer_value_sequence(len_dict), + integer_value_sequence(len_pred), integer_value_sequence(len_dict), integer_value_sequence(len_dict), integer_value_sequence(len_dict), integer_value_sequence(len_dict), + integer_value_sequence(len_dict), integer_value_sequence(2) + ] + integer_value_sequence(len_dict), integer_value_sequence(len_dict), + integer_value_sequence(len_dict), integer_value_sequence(len_dict), + integer_value_sequence(len_dict), integer_value_sequence(2) ] self.converter = DataProviderConverter(slots) - def load_dict_label(self, dict_file, label_file): + def load_dict_label(self, dict_file, label_file, predicate_dict_file): """ Load dictionary from self.dict_file. """ @@ -71,52 +80,55 @@ class Prediction(): self.labels[line.strip()] = line_count self.labels_reverse[line_count] = line.strip() + for line_count, line in enumerate(open(predicate_dict_file, 'r')): + self.predicate_dict[line.strip()] = line_count def get_data(self, data_file): """ Get input data of paddle format. """ with open(data_file, 'r') as fdata: for line in fdata: - sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip( + sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip( ).split('\t') words = sentence.split() sen_len = len(words) - + word_slot = [self.dict.get(w, UNK_IDX) for w in words] - predicate_slot = [self.dict.get(predicate, UNK_IDX)] * sen_len + predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len + ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len + ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len marks = mark.split() mark_slot = [int(w) for w in marks] + + yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \ + ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot - yield word_slot, predicate_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, mark_slot - - def predict(self, data_file): + def predict(self, data_file, output_file): """ data_file: file name of input data. 
""" input = self.converter(self.get_data(data_file)) output = self.network.forwardTest(input) - prob = output[0]["value"] - lab = list(np.argsort(-prob)[:, 0]) + lab = output[0]["id"].tolist() - with open(data_file, 'r') as fin, open('predict.res', 'w') as fout: + with open(data_file, 'r') as fin, open(output_file, 'w') as fout: index = 0 for line in fin: sen = line.split('\t')[0] len_sen = len(sen.split()) line_labels = lab[index:index + len_sen] index += len_sen - fout.write(sen + '\t' + ' '.join([self.labels_reverse[ - i] for i in line_labels]) + '\n') + fout.write(sen + '\t' + ' '.join( + [self.labels_reverse[i] for i in line_labels]) + '\n') def option_parser(): - usage = ("python predict.py -c config -w model_dir " - "-d word dictionary -l label_file -i input_file") + usage = ("python predict.py -c config -w model_dir " + "-d word dictionary -l label_file -i input_file -p pred_dict_file") parser = OptionParser(usage="usage: %s [options]" % usage) parser.add_option( "-c", @@ -137,6 +149,13 @@ def option_parser(): dest="label_file", default=None, help="label file") + parser.add_option( + "-p", + "--predict_dict_file", + action="store", + dest="predict_dict_file", + default=None, + help="predict_dict_file") parser.add_option( "-i", "--data", @@ -150,6 +169,14 @@ def option_parser(): dest="model_path", default=None, help="model path") + + parser.add_option( + "-o", + "--output_file", + action="store", + dest="output_file", + default=None, + help="output file") return parser.parse_args() @@ -160,10 +187,12 @@ def main(): dict_file = options.dict_file model_path = options.model_path label_file = options.label_file + predict_dict_file = options.predict_dict_file + output_file = options.output_file swig_paddle.initPaddle("--use_gpu=0") - predict = Prediction(train_conf, dict_file, model_path, label_file) - predict.predict(data_file) + predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file) + predict.predict(data_file,output_file) if __name__ == '__main__': diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh index a545b9a5d591b41bdbd54905cbbffc410abc8fb0..d0acdb0bd093974485475cf796c6d41ac7899135 100644 --- a/demo/semantic_role_labeling/predict.sh +++ b/demo/semantic_role_labeling/predict.sh @@ -26,15 +26,18 @@ LOG=`get_best_pass $log` LOG=(${LOG}) best_model_path="output/pass-${LOG[1]}" - config_file=db_lstm.py -dict_file=./data/src.dict -label_file=./data/tgt.dict +dict_file=./data/wordDict.txt +label_file=./data/targetDict.txt +predicate_dict_file=./data/verbDict.txt input_file=./data/feature +output_file=predict.res python predict.py \ -c $config_file \ -w $best_model_path \ -l $label_file \ + -p $predicate_dict_file \ -d $dict_file \ - -i $input_file + -i $input_file \ + -o $output_file diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh index 804f722e5b8e9ee5b54c778c54f7833f5e6c4de0..c4ab44f5ca08aefd18f2851a1410aa08563925a9 100644 --- a/demo/semantic_role_labeling/test.sh +++ b/demo/semantic_role_labeling/test.sh @@ -36,5 +36,5 @@ paddle train \ --job=test \ --use_gpu=false \ --config_args=is_test=1 \ + --test_all_data_in_one_period=1 \ 2>&1 | tee 'test.log' - diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh index 94c7b6f31df3b5e5e059d6e1323ae0c0bec74753..420768bb2b4ebed7b135a49c5eee5e5538426ae1 100644 --- a/demo/semantic_role_labeling/train.sh +++ b/demo/semantic_role_labeling/train.sh @@ -16,12 +16,14 @@ set -e paddle train \ 
--config=./db_lstm.py \ + --use_gpu=0 \ + --log_period=5000 \ + --trainer_count=1 \ + --show_parameter_stats_period=5000 \ --save_dir=./output \ - --trainer_count=4 \ - --log_period=10 \ - --num_passes=500 \ - --use_gpu=false \ - --show_parameter_stats_period=10 \ + --num_passes=10000 \ + --average_test_period=10000000 \ + --init_model_path=./data \ + --load_missing_parameter_strategy=rand \ --test_all_data_in_one_period=1 \ -2>&1 | tee 'train.log' - + 2>&1 | tee 'train.log' diff --git a/demo/sentiment/data/get_imdb.sh b/demo/sentiment/data/get_imdb.sh index 41523927afe75428ef1151cef8184ede14eea9a7..28fa86232d89964b3f1680080239cf8a4ebefa9a 100755 --- a/demo/sentiment/data/get_imdb.sh +++ b/demo/sentiment/data/get_imdb.sh @@ -38,11 +38,11 @@ unzip master.zip mkdir -p imdb/train mkdir -p imdb/test -cp -r aclImdb/train/pos/ imdb/train/ -cp -r aclImdb/train/neg/ imdb/train/ +cp -r aclImdb/train/pos/ imdb/train/pos +cp -r aclImdb/train/neg/ imdb/train/neg -cp -r aclImdb/test/pos/ imdb/test/ -cp -r aclImdb/test/neg/ imdb/test/ +cp -r aclImdb/test/pos/ imdb/test/pos +cp -r aclImdb/test/neg/ imdb/test/neg #remove compressed package rm aclImdb_v1.tar.gz diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py index 9a9fd81f030cb1d2a10a5000fd1d12810d12112b..53e3d1d20df92b8815347bd8937064871f326b3f 100755 --- a/demo/sentiment/dataprovider.py +++ b/demo/sentiment/dataprovider.py @@ -17,8 +17,8 @@ from paddle.trainer.PyDataProvider2 import * def hook(settings, dictionary, **kwargs): settings.word_dict = dictionary settings.input_types = [ - integer_value_sequence(len(settings.word_dict)), - integer_value(2)] + integer_value_sequence(len(settings.word_dict)), integer_value(2) + ] settings.logger.info('dict len : %d' % (len(settings.word_dict))) @@ -29,6 +29,7 @@ def process(settings, file_name): label, comment = line.strip().split('\t\t') label = int(label) words = comment.split() - word_slot = [settings.word_dict[w] for w in words if w in - settings.word_dict] + word_slot = [ + settings.word_dict[w] for w in words if w in settings.word_dict + ] yield word_slot, label diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py index c61628d34db4a2bcecd8b367879045f7cb57d491..bc0f6f31264294034ed38309f7fda370865b2845 100755 --- a/demo/sentiment/predict.py +++ b/demo/sentiment/predict.py @@ -18,14 +18,14 @@ from optparse import OptionParser from py_paddle import swig_paddle, DataProviderConverter from paddle.trainer.PyDataProvider2 import integer_value_sequence from paddle.trainer.config_parser import parse_config - """ Usage: run following command to show help message. python predict.py -h """ + class SentimentPrediction(): - def __init__(self, train_conf, dict_file, model_dir=None, label_file = None): + def __init__(self, train_conf, dict_file, model_dir=None, label_file=None): """ train_conf: trainer configure. dict_file: word dictionary file name. @@ -44,10 +44,11 @@ class SentimentPrediction(): self.load_label(label_file) conf = parse_config(train_conf, "is_predict=1") - self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) + self.network = swig_paddle.GradientMachine.createFromConfigProto( + conf.model_config) self.network.loadParameters(self.model_dir) - slots = [integer_value_sequence(self.dict_dim)] - self.converter = DataProviderConverter(slots) + input_types = [integer_value_sequence(self.dict_dim)] + self.converter = DataProviderConverter(input_types) def load_dict(self): """ @@ -61,7 +62,7 @@ class SentimentPrediction(): """ Load label. 
""" - self.label={} + self.label = {} for v in open(label_file, 'r'): self.label[int(v.split('\t')[1])] = v.split('\t')[0] @@ -72,7 +73,9 @@ class SentimentPrediction(): with open(data_file, 'r') as fdata: for line in fdata: words = line.strip().split() - word_slot = [self.word_dict[w] for w in words if w in self.word_dict] + word_slot = [ + self.word_dict[w] for w in words if w in self.word_dict + ] if not word_slot: print "all words are not in dictionary: %s", line continue @@ -89,25 +92,48 @@ class SentimentPrediction(): if self.label is None: print("%s: predicting label is %d" % (data_file, lab[0][0])) else: - print("%s: predicting label is %s" % (data_file, self.label[lab[0][0]])) + print("%s: predicting label is %s" % + (data_file, self.label[lab[0][0]])) + def option_parser(): usage = "python predict.py -n config -w model_dir -d dictionary -i input_file " parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option("-n", "--tconf", action="store", - dest="train_conf", help="network config") - parser.add_option("-d", "--dict", action="store", - dest="dict_file",help="dictionary file") - parser.add_option("-b", "--label", action="store", - dest="label", default=None, - help="dictionary file") - parser.add_option("-i", "--data", action="store", - dest="data", help="data file to predict") - parser.add_option("-w", "--model", action="store", - dest="model_path", default=None, - help="model path") + parser.add_option( + "-n", + "--tconf", + action="store", + dest="train_conf", + help="network config") + parser.add_option( + "-d", + "--dict", + action="store", + dest="dict_file", + help="dictionary file") + parser.add_option( + "-b", + "--label", + action="store", + dest="label", + default=None, + help="dictionary file") + parser.add_option( + "-i", + "--data", + action="store", + dest="data", + help="data file to predict") + parser.add_option( + "-w", + "--model", + action="store", + dest="model_path", + default=None, + help="model path") return parser.parse_args() + def main(): options, args = option_parser() train_conf = options.train_conf @@ -119,5 +145,6 @@ def main(): predict = SentimentPrediction(train_conf, dict_file, model_path, label) predict.predict(data) + if __name__ == '__main__': main() diff --git a/demo/sentiment/preprocess.py b/demo/sentiment/preprocess.py index 49b53d500a1bf816bde9c9675b251be8e9a68ae9..7146e95d751c4de649e204fab724085994dfa4d3 100755 --- a/demo/sentiment/preprocess.py +++ b/demo/sentiment/preprocess.py @@ -22,13 +22,13 @@ from os.path import join as join_path from optparse import OptionParser from paddle.utils.preprocess_util import * - """ Usage: run following command to show help message. python preprocess.py -h """ -def save_dict(dict, filename, is_reverse = True): + +def save_dict(dict, filename, is_reverse=True): """ Save dictionary into file. dict: input dictionary. @@ -39,9 +39,10 @@ def save_dict(dict, filename, is_reverse = True): f = open(filename, 'w') for k, v in sorted(dict.items(), key=operator.itemgetter(1),\ reverse=is_reverse): - f.write('%s\t%s\n'%(k, v)) + f.write('%s\t%s\n' % (k, v)) f.close() + def tokenize(sentences): """ Use tokenizer.perl to tokenize input sentences. @@ -58,6 +59,7 @@ def tokenize(sentences): toks = tok_text.split('\n')[:-1] return toks + def read_lines(path): """ path: String, file path. @@ -71,12 +73,17 @@ def read_lines(path): seqs.append(line) return seqs + class SentimentDataSetCreate(): """ A class to process data for sentiment analysis task. 
""" - def __init__(self, data_path, output_path, - use_okenizer = True, multi_lines = False): + + def __init__(self, + data_path, + output_path, + use_okenizer=True, + multi_lines=False): """ data_path: string, traing and testing dataset path output_path: string, output path, store processed dataset @@ -164,23 +171,17 @@ class SentimentDataSetCreate(): # Preprocess train data. train_data, train_lab_set = self.data_list(self.train_dir) print "processing train set..." - file_lists = self.save_data(train_data, - "train", - self.batch_size, - True, - True) + file_lists = self.save_data(train_data, "train", self.batch_size, True, + True) save_list(file_lists, self.train_list) # If have test data path, preprocess test data. if os.path.exists(self.test_dir): test_data, test_lab_set = self.data_list(self.test_dir) - assert(train_lab_set == test_lab_set) + assert (train_lab_set == test_lab_set) print "processing test set..." - file_lists = self.save_data(test_data, - "test", - self.batch_size, - False, - self.dict_with_test) + file_lists = self.save_data(test_data, "test", self.batch_size, + False, self.dict_with_test) save_list(file_lists, self.test_list) # save labels set. @@ -191,7 +192,9 @@ class SentimentDataSetCreate(): save_dict(self.word_count, self.dict_file, True) self.dict_size = len(self.word_count) - def save_data(self, data, prefix = "", + def save_data(self, + data, + prefix="", batch_size=50000, is_shuffle=False, build_dict=False): @@ -205,7 +208,8 @@ class SentimentDataSetCreate(): return: list of batch names """ if is_shuffle and self.multi_lines: - return self.save_data_multi_lines(data, prefix, batch_size, build_dict) + return self.save_data_multi_lines(data, prefix, batch_size, + build_dict) if is_shuffle: random.shuffle(data) @@ -213,7 +217,7 @@ class SentimentDataSetCreate(): batch_names = [] for i in range(num_batches): batch_name = join_path(self.output_path, - "%s_part_%03d" %(prefix, i)) + "%s_part_%03d" % (prefix, i)) begin = i * batch_size end = min((i + 1) * batch_size, len(data)) # read a batch of data @@ -246,7 +250,9 @@ class SentimentDataSetCreate(): data_list = tokenize(data_list) return label_list, data_list - def save_data_multi_lines(self, data, prefix = "", + def save_data_multi_lines(self, + data, + prefix="", batch_size=50000, build_dict=False): """ @@ -274,14 +280,14 @@ class SentimentDataSetCreate(): self.create_dict(data_list) length = len(label_list) - perm_list = np.array([ i for i in xrange(length) ]) + perm_list = np.array([i for i in xrange(length)]) random.shuffle(perm_list) num_batches = int(math.ceil(length / float(batch_size))) batch_names = [] for i in range(num_batches): batch_name = join_path(self.output_path, - "%s_part_%03d" %(prefix, i)) + "%s_part_%03d" % (prefix, i)) begin = i * batch_size end = min((i + 1) * batch_size, length) sub_label = [label_list[perm_list[i]] for i in range(begin, end)] @@ -304,35 +310,50 @@ class SentimentDataSetCreate(): f.write('%s\t\t%s\n' % (lab, seq)) f.close() + def option_parser(): parser = OptionParser(usage="usage: python preprcoess.py "\ "-i data_dir [options]") - parser.add_option("-i", "--data", action="store", - dest="input", help="Input data directory.") - parser.add_option("-o", "--output", action="store", - dest="output", default=None, - help="Output directory.") - parser.add_option("-t", "--tokenizer", action="store", - dest="use_tokenizer", default=True, - help="Whether to use tokenizer.") + parser.add_option( + "-i", + "--data", + action="store", + dest="input", + help="Input data directory.") + 
parser.add_option( + "-o", + "--output", + action="store", + dest="output", + default=None, + help="Output directory.") + parser.add_option( + "-t", + "--tokenizer", + action="store", + dest="use_tokenizer", + default=True, + help="Whether to use tokenizer.") parser.add_option("-m", "--multi_lines", action="store", dest="multi_lines", default=False, help="If input text files have multi lines and they "\ "need to be shuffled, you should set -m True,") return parser.parse_args() + def main(): options, args = option_parser() - data_dir=options.input - output_dir=options.output - use_tokenizer=options.use_tokenizer - multi_lines=options.multi_lines + data_dir = options.input + output_dir = options.output + use_tokenizer = options.use_tokenizer + multi_lines = options.multi_lines if output_dir is None: outname = os.path.basename(options.input) output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname) - data_creator = SentimentDataSetCreate(data_dir, output_dir, - use_tokenizer, multi_lines) + data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer, + multi_lines) data_creator.create_dataset() + if __name__ == '__main__': main() diff --git a/demo/sentiment/sentiment_net.py b/demo/sentiment/sentiment_net.py index 31e585edcaa111898c950ad016d3996fae15a7db..ff6a3624a404cb52d5d7ac0934fedba0d489dc22 100644 --- a/demo/sentiment/sentiment_net.py +++ b/demo/sentiment/sentiment_net.py @@ -47,10 +47,12 @@ def sentiment_data(data_dir=None, for i, line in enumerate(open(dict_file, 'r')): word_dict[line.split('\t')[0]] = i - define_py_data_sources2(train_list, test_list, - module="dataprovider", - obj="process", - args={'dictionary': word_dict}) + define_py_data_sources2( + train_list, + test_list, + module="dataprovider", + obj="process", + args={'dictionary': word_dict}) return dict_dim, class_dim @@ -64,8 +66,7 @@ def bidirectional_lstm_net(input_dim, emb = embedding_layer(input=data, size=emb_dim) bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim) dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) - output = fc_layer(input=dropout, size=class_dim, - act=SoftmaxActivation()) + output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation()) if not is_predict: lbl = data_layer("label", 1) @@ -109,27 +110,36 @@ def stacked_lstm_net(input_dim, data = data_layer("word", input_dim) emb = embedding_layer(input=data, size=emb_dim) - fc1 = fc_layer(input=emb, size=hid_dim, act=linear, - bias_attr=bias_attr) - lstm1 = lstmemory(input=fc1, act=relu, bias_attr=bias_attr, - layer_attr=layer_attr) + fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr) + lstm1 = lstmemory( + input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) inputs = [fc1, lstm1] for i in range(2, stacked_num + 1): - fc = fc_layer(input=inputs, size=hid_dim, act=linear, - param_attr=para_attr, bias_attr=bias_attr) - lstm = lstmemory(input=fc, reverse=(i % 2) == 0, act=relu, - bias_attr=bias_attr, layer_attr=layer_attr) + fc = fc_layer( + input=inputs, + size=hid_dim, + act=linear, + param_attr=para_attr, + bias_attr=bias_attr) + lstm = lstmemory( + input=fc, + reverse=(i % 2) == 0, + act=relu, + bias_attr=bias_attr, + layer_attr=layer_attr) inputs = [fc, lstm] fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling()) lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling()) - output = fc_layer(input=[fc_last, lstm_last], size=class_dim, - act=SoftmaxActivation(), - bias_attr=bias_attr, param_attr=para_attr) + output = fc_layer( + input=[fc_last, 
lstm_last], + size=class_dim, + act=SoftmaxActivation(), + bias_attr=bias_attr, + param_attr=para_attr) if is_predict: outputs(output) else: - outputs( - classification_cost(input=output, label=data_layer('label', 1))) + outputs(classification_cost(input=output, label=data_layer('label', 1))) diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py index db24182a8d7359786bd1f3b2083892cf846605d1..114a9138ebfef054c7d3ba99b4a510a452f8f2cd 100644 --- a/demo/sentiment/trainer_config.py +++ b/demo/sentiment/trainer_config.py @@ -20,20 +20,20 @@ is_test = get_config_arg('is_test', bool, False) # whether this config is used for prediction is_predict = get_config_arg('is_predict', bool, False) -data_dir = "./data/pre-imdb" +data_dir = "./data/pre-imdb" dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict) ################## Algorithm Config ##################### settings( - batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25 -) + batch_size=128, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + average_window=0.5, + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25) #################### Network Config ###################### -stacked_lstm_net(dict_dim, class_dim=class_dim, - stacked_num=3, is_predict=is_predict) +stacked_lstm_net( + dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict) # bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict) diff --git a/demo/seqToseq/dataprovider.py b/demo/seqToseq/dataprovider.py index df19db109ed223c7515c3ebf2cb1918f41163930..c5da1b7685f47fda337921c7c60ac1497b9e48bb 100755 --- a/demo/seqToseq/dataprovider.py +++ b/demo/seqToseq/dataprovider.py @@ -30,14 +30,14 @@ def hook(settings, src_dict, trg_dict, file_list, **kwargs): if settings.job_mode: settings.trg_dict = trg_dict settings.slots = [ - integer_value_sequence(len(settings.src_dict)), - integer_value_sequence(len(settings.trg_dict)), + integer_value_sequence(len(settings.src_dict)), + integer_value_sequence(len(settings.trg_dict)), integer_value_sequence(len(settings.trg_dict)) ] settings.logger.info("trg dict len : %d" % (len(settings.trg_dict))) else: settings.slots = [ - integer_value_sequence(len(settings.src_dict)), + integer_value_sequence(len(settings.src_dict)), integer_value_sequence(len(open(file_list[0], "r").readlines())) ] @@ -62,8 +62,7 @@ def process(settings, file_name): if settings.job_mode: trg_seq = line_split[1] # one target sequence trg_words = trg_seq.split() - trg_ids = [settings.trg_dict.get(w, UNK_IDX) - for w in trg_words] + trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words] # remove sequence whose length > 80 in training mode if len(src_ids) > 80 or len(trg_ids) > 80: diff --git a/demo/seqToseq/preprocess.py b/demo/seqToseq/preprocess.py index 5efb17a664b9a2525972c29b9b5700b483b8c07e..bd1c51b1514b790ec385d48f49197b3e0285e736 100755 --- a/demo/seqToseq/preprocess.py +++ b/demo/seqToseq/preprocess.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- """ Example: python preprocess.py -i INPUT [-d DICTSIZE] [-m] @@ -24,12 +23,13 @@ Options: -m --mergeDict merge source and target dictionary """ import os -import sys +import sys import string from optparse import OptionParser from paddle.utils.preprocess_util import save_list, DatasetCreater + class SeqToSeqDatasetCreater(DatasetCreater): """ A class to process data for sequence to sequence application. @@ -75,7 +75,7 @@ class SeqToSeqDatasetCreater(DatasetCreater): if not os.path.exists(output): os.system(cmd + '> ' + output) - def build_dict(self, file_path, dict_path, dict_size = -1): + def build_dict(self, file_path, dict_path, dict_size=-1): """ Create the dictionary for the file, Note that 1. Valid characters include all printable characters @@ -99,20 +99,23 @@ class SeqToSeqDatasetCreater(DatasetCreater): for word in words: if word not in dictory: dictory[word] = 1 - else: + else: dictory[word] += 1 output = open(dict_path, "w+") output.write('\n\n\n') count = 3 - for key, value in sorted(dictory.items(), key = lambda d:d[1], reverse = True): + for key, value in sorted( + dictory.items(), key=lambda d: d[1], reverse=True): output.write(key + "\n") count += 1 if count == dict_size: break self.dict_size = count - - def create_dataset(self, dict_size = -1, mergeDict = False, - suffixes = ['.src', '.trg']): + + def create_dataset(self, + dict_size=-1, + mergeDict=False, + suffixes=['.src', '.trg']): """ Create seqToseq dataset """ @@ -135,13 +138,14 @@ class SeqToSeqDatasetCreater(DatasetCreater): # checkout dataset should be parallel corpora suffix_len = len(suffixes[0]) for dataset in dataset_list: - file_list = os.listdir(dataset) - if len(file_list) % 2 == 1: - raise RuntimeError("dataset should be parallel corpora") - file_list.sort() - for i in range(0, len(file_list), 2): - if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]: - raise RuntimeError("source and target file name should be equal") + file_list = os.listdir(dataset) + if len(file_list) % 2 == 1: + raise RuntimeError("dataset should be parallel corpora") + file_list.sort() + for i in range(0, len(file_list), 2): + if file_list[i][:-suffix_len] != file_list[i + 1][:-suffix_len]: + raise RuntimeError( + "source and target file name should be equal") # cat all the files with the same suffix in dataset for suffix in suffixes: @@ -155,16 +159,18 @@ class SeqToSeqDatasetCreater(DatasetCreater): list = ['train.list', 'test.list', 'gen.list'] for dataset in dataset_list: outname = os.path.basename(dataset) - self.concat_file(dataset, outname + suffixes[0], + self.concat_file(dataset, outname + suffixes[0], outname + suffixes[1], dir_list[id], outname) - save_list([os.path.join(dir_list[id], outname)], + save_list([os.path.join(dir_list[id], outname)], os.path.join(self.output_path, list[id])) id += 1 # build dictionary for train data dict = ['src.dict', 'trg.dict'] - dict_path = [os.path.join(self.output_path, dict[0]), - os.path.join(self.output_path, dict[1])] + dict_path = [ + os.path.join(self.output_path, dict[0]), + os.path.join(self.output_path, dict[1]) + ] if mergeDict: outname = os.path.join(train_dir, train_dataset.split('/')[-1]) print 'build src dictionary for train data' @@ -173,22 +179,30 @@ class SeqToSeqDatasetCreater(DatasetCreater): os.system('cp ' + dict_path[0] + ' ' + dict_path[1]) else: outname = os.path.join(train_dataset, self.train_dir_name) - for id in range(0,2): + for id in range(0, 2): suffix = suffixes[id] print 'build ' + suffix[1:] + ' dictionary for train data' 
self.build_dict(outname + suffix, dict_path[id], dict_size) print 'dictionary size is', self.dict_size + def main(): usage = "usage: \n" \ "python %prog -i INPUT [-d DICTSIZE] [-m]" parser = OptionParser(usage) - parser.add_option("-i", action="store", dest="input", - help="input original dataset path") - parser.add_option("-d", action="store", dest="dictsize", - help="specified word count of dictionary") - parser.add_option("-m", "--mergeDict", action="store_true", dest="mergeDict", - help="merge source and target dictionary") + parser.add_option( + "-i", action="store", dest="input", help="input original dataset path") + parser.add_option( + "-d", + action="store", + dest="dictsize", + help="specified word count of dictionary") + parser.add_option( + "-m", + "--mergeDict", + action="store_true", + dest="mergeDict", + help="merge source and target dictionary") (options, args) = parser.parse_args() if options.input[-1] == os.path.sep: options.input = options.input[:-1] @@ -200,5 +214,6 @@ def main(): data_creator = SeqToSeqDatasetCreater(options.input, output_path) data_creator.create_dataset(dictsize, options.mergeDict) + if __name__ == "__main__": - main(); + main() diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py index edd6ad3f739b6cefc24d235be55c7a8f541e1ab7..ad5e3339c1461de06732eb62aca9e8323eea707b 100644 --- a/demo/seqToseq/seqToseq_net.py +++ b/demo/seqToseq/seqToseq_net.py @@ -50,16 +50,21 @@ def seq_to_seq_data(data_dir, trg_dict = None else: train_list = os.path.join(data_dir, train_list) - test_list = os.path.join(data_dir,test_list) + test_list = os.path.join(data_dir, test_list) - define_py_data_sources2(train_list, test_list, - module = "dataprovider", - obj = "process", - args = {"src_dict": src_dict, - "trg_dict": trg_dict}) + define_py_data_sources2( + train_list, + test_list, + module="dataprovider", + obj="process", + args={"src_dict": src_dict, + "trg_dict": trg_dict}) - return {"src_dict_path": src_lang_dict, "trg_dict_path": trg_lang_dict, - "gen_result": gen_result} + return { + "src_dict_path": src_lang_dict, + "trg_dict_path": trg_lang_dict, + "gen_result": gen_result + } def gru_encoder_decoder(data_conf, @@ -90,51 +95,55 @@ def gru_encoder_decoder(data_conf, size=word_vector_dim, param_attr=ParamAttr(name='_source_language_embedding')) src_forward = simple_gru(input=src_embedding, size=encoder_size) - src_backward = simple_gru(input=src_embedding, - size=encoder_size, - reverse=True) + src_backward = simple_gru( + input=src_embedding, size=encoder_size, reverse=True) encoded_vector = concat_layer(input=[src_forward, src_backward]) with mixed_layer(size=decoder_size) as encoded_proj: encoded_proj += full_matrix_projection(input=encoded_vector) backward_first = first_seq(input=src_backward) - with mixed_layer(size=decoder_size, - act=TanhActivation(), ) as decoder_boot: + with mixed_layer( + size=decoder_size, + act=TanhActivation(), ) as decoder_boot: decoder_boot += full_matrix_projection(input=backward_first) def gru_decoder_with_attention(enc_vec, enc_proj, current_word): - decoder_mem = memory(name='gru_decoder', - size=decoder_size, - boot_layer=decoder_boot) + decoder_mem = memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) - context = simple_attention(encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem, ) + context = simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem, ) with mixed_layer(size=decoder_size * 3) as decoder_inputs: 
decoder_inputs += full_matrix_projection(input=context) decoder_inputs += full_matrix_projection(input=current_word) - gru_step = gru_step_layer(name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) + gru_step = gru_step_layer( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) - with mixed_layer(size=target_dict_dim, - bias_attr=True, - act=SoftmaxActivation()) as out: + with mixed_layer( + size=target_dict_dim, bias_attr=True, + act=SoftmaxActivation()) as out: out += full_matrix_projection(input=gru_step) return out decoder_group_name = "decoder_group" - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] + group_inputs = [ + StaticInput( + input=encoded_vector, is_seq=True), StaticInput( + input=encoded_proj, is_seq=True) + ] if not is_generating: trg_embedding = embedding_layer( - input=data_layer(name='target_language_word', - size=target_dict_dim), + input=data_layer( + name='target_language_word', size=target_dict_dim), size=word_vector_dim, param_attr=ParamAttr(name='_target_language_embedding')) group_inputs.append(trg_embedding) @@ -144,12 +153,12 @@ def gru_encoder_decoder(data_conf, # while encoded source sequence is accessed to as an unbounded memory. # Here, the StaticInput defines a read-only memory # for the recurrent_group. - decoder = recurrent_group(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) + decoder = recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) - lbl = data_layer(name='target_language_next_word', - size=target_dict_dim) + lbl = data_layer(name='target_language_next_word', size=target_dict_dim) cost = classification_cost(input=decoder, label=lbl) outputs(cost) else: @@ -168,16 +177,19 @@ def gru_encoder_decoder(data_conf, embedding_size=word_vector_dim) group_inputs.append(trg_embedding) - beam_gen = beam_search(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, - eos_id=1, - beam_size=beam_size, - max_length=max_length) - - seqtext_printer_evaluator(input=beam_gen, - id_input=data_layer(name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) + beam_gen = beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, + eos_id=1, + beam_size=beam_size, + max_length=max_length) + + seqtext_printer_evaluator( + input=beam_gen, + id_input=data_layer( + name="sent_id", size=1), + dict_file=trg_dict_path, + result_file=gen_trans_file) outputs(beam_gen) diff --git a/demo/sequence_tagging/data/get_data.sh b/demo/sequence_tagging/data/get_data.sh new file mode 100755 index 0000000000000000000000000000000000000000..e579d6c46ce5ed96f442acc448b4cc61bf8394a3 --- /dev/null +++ b/demo/sequence_tagging/data/get_data.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd $DIR + +wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz +wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz diff --git a/demo/sequence_tagging/data/test.list b/demo/sequence_tagging/data/test.list new file mode 100644 index 0000000000000000000000000000000000000000..073c0a0c9063ac55f762ac261746aa73057d70e8 --- /dev/null +++ b/demo/sequence_tagging/data/test.list @@ -0,0 +1 @@ +data/test.txt.gz diff --git a/demo/sequence_tagging/data/train.list b/demo/sequence_tagging/data/train.list new file mode 100644 index 0000000000000000000000000000000000000000..43c24d5f6484a90fe883ad5516fe100d27c9ce47 --- /dev/null +++ b/demo/sequence_tagging/data/train.list @@ -0,0 +1 @@ +data/train.txt.gz diff --git a/demo/sequence_tagging/dataprovider.py b/demo/sequence_tagging/dataprovider.py new file mode 100644 index 0000000000000000000000000000000000000000..37dcb7aa17c0abd197ef2f3121bf8be6c54375c2 --- /dev/null +++ b/demo/sequence_tagging/dataprovider.py @@ -0,0 +1,260 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * +import gzip +import logging + +logging.basicConfig( + format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', ) +logger = logging.getLogger('paddle') +logger.setLevel(logging.INFO) + +OOV_POLICY_IGNORE = 0 +OOV_POLICY_USE = 1 +OOV_POLICY_ERROR = 2 + +num_original_columns = 3 + +# Feature combination patterns. +# [[-1,0], [0,0]] means previous token at column 0 and current token at +# column 0 are combined as one feature. +patterns = [ + [[-2, 0]], + [[-1, 0]], + [[0, 0]], + [[1, 0]], + [[2, 0]], + [[-1, 0], [0, 0]], + [[0, 0], [1, 0]], + [[-2, 1]], + [[-1, 1]], + [[0, 1]], + [[1, 1]], + [[2, 1]], + [[-2, 1], [-1, 1]], + [[-1, 1], [0, 1]], + [[0, 1], [1, 1]], + [[1, 1], [2, 1]], + [[-2, 1], [-1, 1], [0, 1]], + [[-1, 1], [0, 1], [1, 1]], + [[0, 1], [1, 1], [2, 1]], +] + +dict_label = { + 'B-ADJP': 0, + 'I-ADJP': 1, + 'B-ADVP': 2, + 'I-ADVP': 3, + 'B-CONJP': 4, + 'I-CONJP': 5, + 'B-INTJ': 6, + 'I-INTJ': 7, + 'B-LST': 8, + 'I-LST': 9, + 'B-NP': 10, + 'I-NP': 11, + 'B-PP': 12, + 'I-PP': 13, + 'B-PRT': 14, + 'I-PRT': 15, + 'B-SBAR': 16, + 'I-SBAR': 17, + 'B-UCP': 18, + 'I-UCP': 19, + 'B-VP': 20, + 'I-VP': 21, + 'O': 22 +} + + +def make_features(sequence): + length = len(sequence) + num_features = len(sequence[0]) + + def get_features(pos): + if pos < 0: + return ['#B%s' % -pos] * num_features + if pos >= length: + return ['#E%s' % (pos - length + 1)] * num_features + return sequence[pos] + + for i in xrange(length): + for pattern in patterns: + fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern]) + sequence[i].append(fname) + + +''' +Source file format: +Each line is for one timestep. The features are separated by space. +An empty line indicates end of a sequence. + +cutoff: a list of numbers. If count of a feature is smaller than this, + it will be ignored. 
+if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of +i-th column. + +return a list of dict for each column +''' + + +def create_dictionaries(filename, cutoff, oov_policy): + def add_to_dict(sequence, dicts): + num_features = len(dicts) + for features in sequence: + l = len(features) + assert l == num_features, "Wrong number of features " + line + for i in xrange(l): + if features[i] in dicts[i]: + dicts[i][features[i]] += 1 + else: + dicts[i][features[i]] = 1 + + num_features = len(cutoff) + dicts = [] + for i in xrange(num_features): + dicts.append(dict()) + + f = gzip.open(filename, 'rb') + + sequence = [] + + for line in f: + line = line.strip() + if not line: + make_features(sequence) + add_to_dict(sequence, dicts) + sequence = [] + continue + features = line.split(' ') + sequence.append(features) + + for i in xrange(num_features): + dct = dicts[i] + n = 1 if oov_policy[i] == OOV_POLICY_USE else 0 + todo = [] + for k, v in dct.iteritems(): + if v < cutoff[i]: + todo.append(k) + else: + dct[k] = n + n += 1 + + if oov_policy[i] == OOV_POLICY_USE: + # placeholder so that len(dct) will be the number of features + # including OOV + dct['#OOV#'] = 0 + + logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo))) + for k in todo: + del dct[k] + + f.close() + return dicts + + +def initializer(settings, **xargs): + cutoff = [3, 1, 0] + cutoff += [3] * len(patterns) + oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR] + oov_policy += [OOV_POLICY_IGNORE] * len(patterns) + dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy) + dicts[2] = dict_label + settings.dicts = dicts + settings.oov_policy = oov_policy + input_types = [] + num_features = len(dicts) + for i in xrange(num_original_columns): + input_types.append(integer_sequence(len(dicts[i]))) + logger.info("slot %s size=%s" % (i, len(dicts[i]))) + if patterns: + dim = 0 + for i in xrange(num_original_columns, num_features): + dim += len(dicts[i]) + input_types.append(sparse_binary_vector_sequence(dim)) + logger.info("feature size=%s" % dim) + settings.input_types = input_types + + +''' +if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not +existed in dicts[i] will be assigned to id 0. +if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist +in dicts[i]. 
+'''
+
+
+@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, filename):
+    input_file = filename
+    dicts = settings.dicts
+    oov_policy = settings.oov_policy
+
+    def gen_sample(sequence):
+        num_features = len(dicts)
+        sample = [list() for i in xrange(num_original_columns)]
+        if patterns:
+            sample.append([])
+        for features in sequence:
+            assert len(features) == num_features, \
+                "Wrong number of features: " + line
+            for i in xrange(num_original_columns):
+                id = dicts[i].get(features[i], -1)
+                if id != -1:
+                    sample[i].append(id)
+                elif oov_policy[i] == OOV_POLICY_IGNORE:
+                    sample[i].append(0xffffffff)
+                elif oov_policy[i] == OOV_POLICY_ERROR:
+                    logger.fatal("Unknown token: %s" % features[i])
+                else:
+                    sample[i].append(0)
+
+            if patterns:
+                dim = 0
+                vec = []
+                for i in xrange(num_original_columns, num_features):
+                    id = dicts[i].get(features[i], -1)
+                    if id != -1:
+                        vec.append(dim + id)
+                    elif oov_policy[i] == OOV_POLICY_IGNORE:
+                        pass
+                    elif oov_policy[i] == OOV_POLICY_ERROR:
+                        logger.fatal("Unknown token: %s" % features[i])
+                    else:
+                        # OOV_POLICY_USE: id 0 is reserved for OOV features,
+                        # and vec is a plain list (was: vec.ids.append).
+                        vec.append(dim + 0)
+
+                    dim += len(dicts[i])
+                sample[-1].append(vec)
+        return sample
+
+    num_features = len(dicts)
+    f = gzip.open(input_file, 'rb')
+
+    num_sequences = 0
+    sequence = []
+    for line in f:
+        line = line.strip()
+        if not line:
+            make_features(sequence)
+            yield gen_sample(sequence)
+            sequence = []
+            num_sequences += 1
+            continue
+        features = line.split(' ')
+        sequence.append(features)
+
+    f.close()
+
+    logger.info("num_sequences=%s" % num_sequences)
diff --git a/demo/sequence_tagging/linear_crf.py b/demo/sequence_tagging/linear_crf.py
new file mode 100644
index 0000000000000000000000000000000000000000..64895742e1b8c0a11cbedee0b88e61b5b63b007f
--- /dev/null
+++ b/demo/sequence_tagging/linear_crf.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+import math
+
+define_py_data_sources2(
+    train_list="data/train.list",
+    test_list="data/test.list",
+    module="dataprovider",
+    obj="process")
+
+batch_size = 1
+settings(
+    learning_method=MomentumOptimizer(),
+    batch_size=batch_size,
+    regularization=L2Regularization(batch_size * 1e-4),
+    average_window=0.5,
+    learning_rate=1e-1,
+    learning_rate_decay_a=1e-5,
+    learning_rate_decay_b=0.25, )
+
+num_label_types = 23
+
+
+def get_simd_size(size):
+    return int(math.ceil(float(size) / 8)) * 8
+
+
+# Currently, in order to use sparse_update=True,
+# the size has to be aligned.
+num_label_types = get_simd_size(num_label_types)
+
+features = data_layer(name="features", size=76328)
+word = data_layer(name="word", size=6778)
+pos = data_layer(name="pos", size=44)
+chunk = data_layer(name="chunk", size=num_label_types)
+
+crf_input = fc_layer(
+    input=features,
+    size=num_label_types,
+    act=LinearActivation(),
+    bias_attr=False,
+    param_attr=ParamAttr(
+        initial_std=0, sparse_update=True))
+
+crf = crf_layer(
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(
+        name="crfw", initial_std=0), )
+
+crf_decoding = crf_decoding_layer(
+    size=num_label_types,
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(name="crfw"), )
+
+sum_evaluator(
+    name="error",
+    input=crf_decoding, )
+
+chunk_evaluator(
+    name="chunk_f1",
+    input=[crf_decoding, chunk],
+    chunk_scheme="IOB",
+    num_chunk_types=11, )
+
+inputs(word, pos, chunk, features)
+outputs(crf)
diff --git a/demo/sequence_tagging/readme.md b/demo/sequence_tagging/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..2e17fffb83c532f5e5fec1227f169c97c1f20e22
--- /dev/null
+++ b/demo/sequence_tagging/readme.md
@@ -0,0 +1,45 @@
+# Sequence Tagging
+
+This demo is a sequence model for assigning tags to each token in a sentence. The task is described in the CoNLL-2000 text chunking shared task.
+
+## Download data
+```bash
+cd demo/sequence_tagging
+./data/get_data.sh
+```
+
+## Train model
+```bash
+cd demo/sequence_tagging
+./train.sh
+```
+
+## Model description
+
+We provide two models. One is a linear CRF model (linear_crf.py) which is equivalent to the one at leon.bottou.org/projects/sgd. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
+
+| Model name | Number of parameters | F1 score |
+| ---------- | -------------------- | -------- |
+| linear_crf | 1.8M                 | 0.937    |
+| rnn_crf    | 960K                 | 0.941    |
+
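+## Data format
+
+For reference, each line of the gzipped data files holds one token as `word POS-tag chunk-label`, and a blank line ends a sentence; the snippet below is illustrative, not copied from the dataset:
+
+```
+Confidence NN B-NP
+in IN B-PP
+the DT B-NP
+pound NN I-NP
+. . O
+```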
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py new file mode 100644 index 0000000000000000000000000000000000000000..90d4bbdddfdb4e38b930d54a2bc865df9fac589c --- /dev/null +++ b/demo/sequence_tagging/rnn_crf.py @@ -0,0 +1,120 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +import math + +define_py_data_sources2( + train_list="data/train.list", + test_list="data/test.list", + module="dataprovider", + obj="process") + +batch_size = 16 +settings( + learning_method=MomentumOptimizer(), + batch_size=batch_size, + regularization=L2Regularization(batch_size * 1e-5), + average_window=0.5, + learning_rate=2e-3, + learning_rate_decay_a=5e-7, + learning_rate_decay_b=0.5, ) + +word_dim = 128 +hidden_dim = 128 +with_rnn = True + +initial_std = 1 / math.sqrt(hidden_dim) +param_attr = ParamAttr(initial_std=initial_std) +cpu_layer_attr = ExtraLayerAttribute(device=-1) + +default_device(0) + +num_label_types = 23 + +features = data_layer(name="features", size=76328) +word = data_layer(name="word", size=6778) +pos = data_layer(name="pos", size=44) +chunk = data_layer( + name="chunk", size=num_label_types, layer_attr=cpu_layer_attr) + +emb = embedding_layer( + input=word, size=word_dim, param_attr=ParamAttr(initial_std=0)) + +hidden1 = mixed_layer( + size=hidden_dim, + act=STanhActivation(), + bias_attr=True, + input=[ + full_matrix_projection(emb), table_projection( + pos, param_attr=param_attr) + ]) + +if with_rnn: + rnn1 = recurrent_layer( + act=ReluActivation(), + bias_attr=True, + input=hidden1, + param_attr=ParamAttr(initial_std=0), ) + +hidden2 = mixed_layer( + size=hidden_dim, + act=STanhActivation(), + bias_attr=True, + input=[full_matrix_projection(hidden1)] + + ([full_matrix_projection( + rnn1, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), ) + +if with_rnn: + rnn2 = recurrent_layer( + reverse=True, + act=ReluActivation(), + bias_attr=True, + input=hidden2, + param_attr=ParamAttr(initial_std=0), ) + +crf_input = mixed_layer( + size=num_label_types, + bias_attr=False, + input=[full_matrix_projection(hidden2), ] + + ([full_matrix_projection( + rnn2, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), ) + +crf = crf_layer( + input=crf_input, + label=chunk, + param_attr=ParamAttr( + name="crfw", initial_std=0), + layer_attr=cpu_layer_attr, ) + +crf_decoding = crf_decoding_layer( + size=num_label_types, + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw"), + layer_attr=cpu_layer_attr, ) + +sum_evaluator( + name="error", + input=crf_decoding, ) + +chunk_evaluator( + name="chunk_f1", + input=[crf_decoding, chunk], + chunk_scheme="IOB", + num_chunk_types=11, ) + +inputs(word, pos, chunk, features) +outputs(crf) diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..9a706b98d8686101ba21b513644bdd791062ec26 --- /dev/null +++ 
b/demo/sequence_tagging/train.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +paddle train \ + --config rnn_crf.py \ + --parallel_nn=1 \ + --use_gpu=1 \ + --dot_period=10 \ + --log_period=1000 \ + --test_period=0 \ + --num_passes=10 diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh new file mode 100755 index 0000000000000000000000000000000000000000..597b5afea9c63a8e209b69b6a40e74556e27ac31 --- /dev/null +++ b/demo/sequence_tagging/train_linear.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +paddle train \ + --config linear_crf.py \ + --use_gpu=0 \ + --dot_period=100 \ + --log_period=10000 \ + --test_period=0 \ + --num_passes=10 diff --git a/doc/algorithm/index.rst b/doc/algorithm/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..6073add3c0cbb12529eabb0f8d8a051bcb84e628 --- /dev/null +++ b/doc/algorithm/index.rst @@ -0,0 +1,7 @@ +Algorithm Tutorial +================== + +.. toctree:: + :maxdepth: 1 + + rnn/rnn.rst diff --git a/doc/algorithm/rnn/rnn.rst b/doc/algorithm/rnn/rnn.rst index 343f55a20e464f63f054ebe724b5ef90f848d5e9..01d2caefb5cdf4e949511fd0f5bbafe0e604e881 100644 --- a/doc/algorithm/rnn/rnn.rst +++ b/doc/algorithm/rnn/rnn.rst @@ -1,5 +1,5 @@ -Recurrent Neural Network Configuration -====================================== +RNN Configuration +================= This tutorial will guide you how to configure recurrent neural network in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to: @@ -17,7 +17,7 @@ PaddlePaddle does not need any preprocessing to sequence data, such as padding. .. code-block:: python - settings.slots = [ + settings.input_types = [ integer_value_sequence(len(settings.src_dict)), integer_value_sequence(len(settings.trg_dict)), integer_value_sequence(len(settings.trg_dict))] diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md index f7db0a9b92e67e1ecf5e44f1edb17cb8cacd8d2d..e44fa0d38e9982e5d0ed159743994ce6acc51246 100644 --- a/doc/build/build_from_source.md +++ b/doc/build/build_from_source.md @@ -4,13 +4,12 @@ Installing from Sources * [1. Download and Setup](#download) * [2. Requirements](#requirements) * [3. Build on Ubuntu](#ubuntu) -* [4. Build on Mac OS X](#mac) ## Download and Setup -You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle). +You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle). ```bash -git clone https://github.com/baidu/Paddle paddle +git clone https://github.com/PaddlePaddle/Paddle paddle cd paddle ``` @@ -153,12 +152,12 @@ As a simple example, consider the following: - **Only CPU** ```bash - cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF + cmake .. -DWITH_GPU=OFF ``` - **GPU** ```bash - cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF + cmake .. -DWITH_GPU=ON ``` - **GPU with doc and swig** @@ -171,7 +170,7 @@ Finally, you can build PaddlePaddle: ```bash # you can add build option here, such as: -cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX= +cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX= # please use sudo make install, if you want to install PaddlePaddle into the system make -j `nproc` && make install # set PaddlePaddle installation path in ~/.bashrc @@ -191,122 +190,3 @@ sudo pip install /opt/paddle/share/wheels/*.whl # or just run sudo paddle version ``` - -## Building on Mac OS X - -### Prerequisites -This guide is based on Mac OS X 10.11 (El Capitan). 
Note that if you are running an up to date version of OS X, -you will already have Python 2.7.10 and Numpy 1.8 installed. - -The best option is to use the package manager homebrew to handle installations and upgrades for you. -To install [homebrew](http://brew.sh/), first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command: - -```bash -# install brew -/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" -# install pip -easy_install pip -``` - -### Install Dependencies - -- **CPU Dependencies** - - ```bash - # Install fundamental dependents - brew install glog gflags cmake protobuf openblas - - # Install google test on Mac OS X - # Download gtest 1.7.0 - wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz - tar -xvf googletest-release-1.7.0.tar.gz && cd googletest-release-1.7.0 - # Build gtest - mkdir build && cmake .. - make - # Install gtest library - sudo cp -r ../include/gtest /usr/local/include/ - sudo cp lib*.a /usr/local/lib - ``` - -- **GPU Dependencies(optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. Mac OS X 10.11 or later - 2. the Clang compiler and toolchain installed using Xcode - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - 1. After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - 2. Then you need to set DYLD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -cmake .. -``` - -CMake first check PaddlePaddle's dependencies in system default path. After installing some optional -libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags). -If still not found, you can manually set it based on CMake error information from your screen. - -As a simple example, consider the following: - -- **Only CPU** - - ```bash - cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF - ``` -- **GPU** - - ```bash - cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF - ``` - -- **GPU with doc and swig** - - ```bash - cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON - ``` - -Finally, you can build PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -``` -**Note:** - -If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed. -Otherwise, PaddlePaddle will automatically install python dependencies -at first time when user run paddle commands, such as `paddle version`, `paddle train`. 
-It may require sudo privileges:
-
-```bash
-# you can run
-sudo pip install /opt/paddle/share/wheels/*.whl
-# or just run
-sudo paddle version
-```
diff --git a/doc/build/contribute_to_paddle.md b/doc/build/contribute_to_paddle.md
index 06fcff61720755432c5618500ac509c5b3f867df..1d03eb7362b1b6f2fcdac7b53f8b7f93fb75e49c 100644
--- a/doc/build/contribute_to_paddle.md
+++ b/doc/build/contribute_to_paddle.md
@@ -1,10 +1,10 @@
-# Contribute to PaddlePaddle
+# Contribute Code
 
 We sincerely appreciate your contributions. You can use fork and pull request
 workflow to merge your code.
 
 ## Code Requirements
-- Your code mush be fully documented by
+- Your code must be fully documented by
   [doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
 - Make sure the compiler option WITH\_STYLE\_CHECK is on and the compiler
   passes the code style check.
@@ -20,16 +20,30 @@ It's just that simple.
 
 ## Clone
 
+Paddle is currently using the [git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
+**develop** is the main branch; users' own branches are feature branches.
+
 Once you've created a fork, you can use your favorite git client to clone your
 repo or just head straight to the command line:
 
 ```shell
 # Clone your fork to your local machine
-git clone https://github.com/USERNAME/Paddle.git
+git clone --branch develop https://github.com/USERNAME/Paddle.git
+```
+If your repository doesn't contain the **develop** branch, just create it yourself.
+
+```shell
+git clone https://github.com/USERNAME/Paddle.git Paddle
+cd Paddle
+git checkout -b develop  # create develop branch.
+git remote add upstream https://github.com/baidu/Paddle.git  # add upstream to baidu/Paddle
+git pull upstream develop  # update to upstream
 ```
+
 Then you can start to develop by making a local developement branch
+
 ```shell
-git checkout -b MY_COOL_STUFF_BRANCH origin/master
+git checkout -b MY_COOL_STUFF_BRANCH
 ```
 
 ## Commit
 
@@ -41,7 +55,7 @@ Commit your changes by following command lines:
 git status
 # add modified files
 git add xx
-git commit -m "commit info"
+env EDITOR=vim git commit  # Write your commit message with vim/nano/emacs.
 ```
 The first line of commit infomation is the title. The second and later lines
 are the details if any.
@@ -63,7 +77,7 @@ git remote -v
 Update your fork with the latest upstream changes:
 
 ```shell
-git pull --rebase upstream HEAD
+git pull --rebase upstream develop
 ```
 
 If there are no unique commits locally, git will simply perform a fast-forward.
@@ -76,7 +90,7 @@ Now, your local master branch is up-to-date with everything modified upstream.
 
 ```shell
 # push to your repository in Github
-git push origin HEAD
+git push -u origin MY_COOL_STUFF_BRANCH  # create remote branch MY_COOL_STUFF_BRANCH to origin.
 ```
 
 ## Pull Request
 
@@ -93,9 +107,24 @@ of conflict, you need to do the update manually.
 You need to do the following on your local repository:
 ```shell
 git checkout MY_COOL_STUFF_BRANCH
-git pull --rebase upstream HEAD
+git pull upstream develop
 # You may need to resolve the conflict according to the git prompt.
 # Make and test your code.
-git push -f origin HEAD
+git push origin MY_COOL_STUFF_BRANCH
 ```
 Now your Pull Request is updated with the latest version.
+
+## Revise your pull request
+
+When you revise your pull request according to the reviewers' comments, please use 'git commit' instead of 'git commit --amend' to commit your changes, so that the reviewers can see the difference between the new pull request and the old pull request.
+ +The possible commands are + +```shell +git checkout MY_COOL_STUFF_BRANCH +git pull upstream develop # update local to newest code base. +# May be some conflicts will occured. +# And develop your cool stuff +env EDITOR=vim git commit # add your revise log +git push origin MY_COOL_STUFF_BRANCH +``` diff --git a/doc/build/docker_install.rst b/doc/build/docker_install.rst index 542b9bac27afb84d2b41e5295145540bf2aa5485..e95de35f4da35fee511551f13bc6026532cce5c3 100644 --- a/doc/build/docker_install.rst +++ b/doc/build/docker_install.rst @@ -69,7 +69,7 @@ If you want to launch container with GPU support, you need to set some environme .. code-block:: bash - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}" + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest diff --git a/doc/build/index.rst b/doc/build/index.rst index 511cdea145c7fd0e41566d0a85115dbb06f84058..b4fe4596047c7d201fdf36bc76c26d5134611560 100644 --- a/doc/build/index.rst +++ b/doc/build/index.rst @@ -1,5 +1,5 @@ -Build And Install PaddlePaddle -================================ +Install and Build +================= Install PaddlePaddle ---------------------- @@ -18,11 +18,7 @@ Build from Source .. warning:: - Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing to PaddlePaddle. - - -If you want to hack and contribute PaddlePaddle source code, following guides can help you\: - + Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. .. toctree:: :maxdepth: 1 @@ -30,4 +26,3 @@ If you want to hack and contribute PaddlePaddle source code, following guides ca build_from_source.md contribute_to_paddle.md - diff --git a/doc/cluster/opensource/cluster_train.md b/doc/cluster/opensource/cluster_train.md index 4763ede39b049b6c49225dc9ae7add77325d704e..cb493a88f031850cb6a5eeed0ebe9e41bb7e01c3 100644 --- a/doc/cluster/opensource/cluster_train.md +++ b/doc/cluster/opensource/cluster_train.md @@ -1,26 +1,24 @@ -# Cluster Training +# Distributed Training -We provide some simple scripts ```paddle/scripts/cluster_train``` to help you to launch cluster training Job to harness PaddlePaddle's distributed trainning. For MPI and other cluster scheduler refer this naive script to implement more robust cluster training platform by yourself. +In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation). -The following cluster demo is based on RECOMMENDATION local training demo in PaddlePaddle ```demo/recommendation``` directory. Assuming you enter the ```paddle/scripts/cluster_train/``` directory. +[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and Kubernetes. -## Pre-requirements +## Prerequisite -Firstly, +1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands. 
We can use `pip` to install fabric: -```bash + ```bash pip install fabric -``` - -Secondly, go through installing scripts to install PaddlePaddle at all nodes to make sure demo can run as local mode. For CUDA enabled training, we assume that CUDA is installed in ```/usr/local/cuda```, otherwise missed cuda runtime libraries error could be reported at cluster runtime. In one word, the local training environment should be well prepared for the simple scripts. + ``` -Then you should prepare same ROOT_DIR directory in all nodes. ROOT_DIR is from in cluster_train/conf.py. Assuming that the ROOT_DIR = /home/paddle, you can create ```paddle``` user account as well, at last ```paddle.py``` can ssh connections to all nodes with ```paddle``` user automatically. +1. We need to install PaddlePaddle on all nodes in the cluster. To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime. -At last you can create ssh mutual trust relationship between all nodes for easy ssh login, otherwise ```password``` should be provided at runtime from ```paddle.py```. +1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes. For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`. In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password. ## Prepare Job Workspace -```Job workspace``` is defined as one package directory which contains dependency libraries, train data, test data, model config file and all other related file dependencies. +We refer to the directory where we put dependent libraries, config files, etc., as *workspace*. These ```train/test``` data should be prepared before launching cluster job. To satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as ```train.list/test.list``` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files, and all nodes in cluster job will handle files with same logical code in normal condition. diff --git a/doc/demo/quick_start/index_en.md b/doc/demo/quick_start/index_en.md index ee3fa2a2166f497524663574270b239a6170ab19..80d816a768a71156ce72cda6ea92b749fbcdbe1f 100644 --- a/doc/demo/quick_start/index_en.md +++ b/doc/demo/quick_start/index_en.md @@ -1,4 +1,4 @@ -# Quick Start Tutorial +# Quick Start This tutorial will teach the basics of deep learning (DL), including how to implement many different models in PaddlePaddle. You will learn how to: - Prepare data into the standardized format that PaddlePaddle accepts. @@ -134,7 +134,7 @@ def process(settings, file_name): You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies: - The path of the training and testing data (`data/train.list`, `data/test.list`). -- The location of the data provider file (`dataprovider_pow`). +- The location of the data provider file (`dataprovider_bow`). - The function to call to get data. (`process`). - Additional arguments or data. Here it passes the path of word dictionary. 
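+
+A sketch of what this definition can look like (the dictionary path `data/dict.txt` is illustrative; the other names come from the bullets above):
+
+```python
+define_py_data_sources2(
+    train_list='data/train.list',
+    test_list='data/test.list',
+    module="dataprovider_bow",
+    obj="process",
+    args={"dictionary": "data/dict.txt"})
+```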
diff --git a/doc/demo/semantic_role_labeling/curve.jpg b/doc/demo/semantic_role_labeling/curve.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..baa35ae7f0a0b6c246f3a0d331735477ab8bcd70
Binary files /dev/null and b/doc/demo/semantic_role_labeling/curve.jpg differ
diff --git a/doc/demo/semantic_role_labeling/semantic_role_labeling.md b/doc/demo/semantic_role_labeling/semantic_role_labeling.md
index 05fbc8278daf204df60ad19b742c920e47128c27..e2793b2b3494160a7a80f07ec2127bd1f1a4f2e4 100644
--- a/doc/demo/semantic_role_labeling/semantic_role_labeling.md
+++ b/doc/demo/semantic_role_labeling/semantic_role_labeling.md
@@ -30,8 +30,6 @@ Several new files appear in the `data `directory as follows.
 conll05st-release:the test data set of CoNll-2005 shared task
 test.wsj.words:the Wall Street Journal data sentences
 test.wsj.props: the propositional arguments
-src.dict:the dictionary of words in sentences
-tgt.dict:the labels dictionary
 feature: the extracted features from data set
 ```
 
@@ -67,6 +65,8 @@ def hook(settings, word_dict, label_dict, **kwargs):
     settings.label_dict = label_dict
     #all inputs are integral and sequential type
     settings.slots = [
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(predicate_dict)),
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
@@ -77,34 +77,39 @@ def hook(settings, word_dict, label_dict, **kwargs):
 ```
 The corresponding data iterator is as following:
 ```
-@provider(use_seq=True, init_hook=hook)
-def process(obj, file_name):
+@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
+          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, file_name):
     with open(file_name, 'r') as fdata:
         for line in fdata:
-            sentence, predicate, ctx_n1, ctx_0, ctx_p1, mark, label = line.strip().split('\t')
+            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \
+                line.strip().split('\t')
+
             words = sentence.split()
             sen_len = len(words)
-            word_slot = [obj.word_dict.get(w, UNK_IDX) for w in words]
+            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
 
-            predicate_slot = [obj.word_dict.get(predicate, UNK_IDX)] * sen_len
-            ctx_n1_slot = [obj.word_dict.get(ctx_n1, UNK_IDX) ] * sen_len
-            ctx_0_slot = [obj.word_dict.get(ctx_0, UNK_IDX) ] * sen_len
-            ctx_p1_slot = [obj.word_dict.get(ctx_p1, UNK_IDX) ] * sen_len
+            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
+            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
 
             marks = mark.split()
             mark_slot = [int(w) for w in marks]
 
             label_list = label.split()
-            label_slot = [obj.label_dict.get(w) for w in label_list]
-
-            yield word_slot, predicate_slot, ctx_n1_slot, ctx_0_slot, ctx_p1_slot, mark_slot, label_slot
+            label_slot = [settings.label_dict.get(w) for w in label_list]
+            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
+                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
 ```
 
-The `process`function yield 7 lists which are six features and labels.
+The `process` function yields 9 lists: 8 features and 1 label.
 ### Neural Network Config
 `db_lstm.py` is the neural network config file to load the dictionaries and define the data provider module and network architecture during the training procedure.
 
-Seven `data_layer` load instances from data provider. Six features are transformed into embedddings respectively, and mixed by `mixed_layer` . Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels.
+Nine `data_layer` layers load instances from the data provider. Eight features are transformed into embeddings respectively and mixed by `mixed_layer`. Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is the cross entropy of labels.
 
 ### Run Training
 The script for training is `train.sh`, user just need to execute:
@@ -115,27 +120,36 @@ The content in `train.sh`:
 ```
 paddle train \
   --config=./db_lstm.py \
+  --use_gpu=0 \
+  --log_period=5000 \
+  --trainer_count=1 \
+  --show_parameter_stats_period=5000 \
   --save_dir=./output \
-  --trainer_count=4 \
-  --log_period=10 \
-  --num_passes=500 \
-  --use_gpu=false \
-  --show_parameter_stats_period=10 \
+  --num_passes=10000 \
+  --average_test_period=10000000 \
+  --init_model_path=./data \
+  --load_missing_parameter_strategy=rand \
   --test_all_data_in_one_period=1 \
 2>&1 | tee 'train.log'
 ```
 
 - \--config=./db_lstm.py : network config file.
-- \--save_di=./output: output path to save models.
-- \--trainer_count=4 : set thread number (or GPU count).
-- \--log_period=10 : print log every 20 batches.
-- \--num_passes=500: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
-- \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train.
-- \--show_parameter_stats_period=10: show parameter statistic every 100 batches.
-- \--test_all_data_in_one_period=1: test all data in every testing.
-
-
-After training, the models will be saved in directory `output`.
+- \--use_gpu=false: use CPU to train; set it true if you installed the GPU version of PaddlePaddle and want to train on GPU. Note that so far `crf_layer` does not support GPU.
+- \--log_period=5000: print a log every 5000 batches.
+- \--trainer_count=1: set thread number (or GPU count).
+- \--show_parameter_stats_period=5000: show parameter statistics every 5000 batches.
+- \--save_dir=./output: output path to save models.
+- \--num_passes=10000: set pass number; one pass in PaddlePaddle means training all samples in the dataset once.
+- \--average_test_period=10000000: test on the averaged parameters every average_test_period batches.
+- \--init_model_path=./data: parameter initialization path.
+- \--load_missing_parameter_strategy=rand: randomly initialize parameters that are missing from the initial model.
+- \--test_all_data_in_one_period=1: test all data in one period.
+
+
+After training, the models will be saved in directory `output`. Our training curve is as follows:
+
+![training curve](./curve.jpg)
+
### Run testing The script for testing is `test.sh`, user just need to execute: @@ -155,6 +169,7 @@ paddle train \ - \--model_list=$model_list.list: model list file - \--job=test: indicate the test job - \--config_args=is_test=1: flag to indicate test + - \--test_all_data_in_one_period=1: test all data in 1 period ### Run prediction @@ -166,11 +181,13 @@ The script for prediction is `predict.sh`, user just need to execute: In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file ``` python predict.py - -c $config_file - -w $model_path - -l $label_file - -d $dict_file - -i $input_file + -c $config_file \ + -w $best_model_path \ + -l $label_file \ + -p $predicate_dict_file \ + -d $dict_file \ + -i $input_file \ + -o $output_file ``` `predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix. diff --git a/doc/demo/sentiment_analysis/sentiment_analysis.md b/doc/demo/sentiment_analysis/sentiment_analysis.md index 385f49891dcd840c525f7d1c3aaf7f08a7e4903f..c53952c544de9fa88a6318432e34b0d05b149445 100644 --- a/doc/demo/sentiment_analysis/sentiment_analysis.md +++ b/doc/demo/sentiment_analysis/sentiment_analysis.md @@ -6,7 +6,7 @@ Sentiment analysis is also used to monitor social media based on large amount of On the other hand, grabbing the user comments of products and analyzing their sentiment are useful to understand user preferences for companies, products, even competing products. -This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the [Internet Movie Database (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy. +This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the Internet Movie Database (IMDB). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy. ## Data Preparation @@ -39,7 +39,7 @@ imdbEr.txt imdb.vocab README test train * imdbEr.txt: expected rating for each token in imdb.vocab. * README: data documentation. -Both train and test set directory contains: +The file in train set directory is as follows. The test set also contains them except `unsup` and `urls_unsup.txt`. ``` labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt @@ -151,6 +151,7 @@ settings( batch_size=128, learning_rate=2e-3, learning_method=AdamOptimizer(), + average_window=0.5, regularization=L2Regularization(8e-4), gradient_clipping_threshold=25 ) @@ -163,17 +164,18 @@ stacked_lstm_net(dict_dim, class_dim=class_dim, * **Data Definition**: * get\_config\_arg(): get arguments setted by `--config_args=xx` in commandline argument. 
- * Define TrainData and TestData provider, here using Python interface (PyDataProviderWrapper) of PaddlePaddle to load data. For details, you can refer to the document of PyDataProvider.
+ * Define the data provider, here using the Python interface to load data. For details, refer to the PyDataProvider2 documentation.
* **Algorithm Configuration**:
-  * use sgd algorithm.
-  * use adam optimization.
  * set batch size of 128.
-  * set average sgd window.
  * set global learning rate.
+  * use adam optimization.
+  * set average sgd window.
+  * set L2 regularization.
+  * set gradient clipping threshold.
* **Network Configuration**:
-  * dict_dim: get dictionary dimension.
-  * class_dim: set category number, IMDB has two label, namely positive and negative label.
+  * dict_dim: dictionary dimension.
+  * class_dim: number of categories; IMDB has two labels, positive and negative.
  * `stacked_lstm_net`: predefined network as shown in Figure 3, use this network by default.
  * `bidirectional_lstm_net`: predefined network as shown in Figure 2.
diff --git a/doc/dev/index.rst b/doc/dev/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0468dd492b6246cfe0771a05c3597ddee95b3ddd
--- /dev/null
+++ b/doc/dev/index.rst
@@ -0,0 +1,9 @@
+Development Guide
+=================
+
+.. toctree::
+   :maxdepth: 1
+
+   layer.md
+   new_layer/new_layer.rst
+   ../source/index.md
diff --git a/doc/dev/layer.md b/doc/dev/layer.md
new file mode 100644
index 0000000000000000000000000000000000000000..930fb0de1ac074b15d06197ed0e732f92288b411
--- /dev/null
+++ b/doc/dev/layer.md
@@ -0,0 +1,4 @@
+# Layer Documents
+
+* [Layer Source Code Document](../source/gserver/layers/index.rst)
+* [Layer Python API Document](../ui/api/trainer_config_helpers/index.rst)
diff --git a/doc/dev/new_layer/index.rst b/doc/dev/new_layer/index.rst
deleted file mode 100644
index 37dac3a14dedf2aaa99335e1b0ebe110dc746174..0000000000000000000000000000000000000000
--- a/doc/dev/new_layer/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Writing New Layers
-==================
-
-.. toctree::
-   :maxdepth: 3
-
-   new_layer.rst
diff --git a/doc/dev/new_layer/new_layer.rst b/doc/dev/new_layer/new_layer.rst
index bd4a4c46c87f6429338b4d220a80b6265a1f253f..af8b76a3075194ead9be40d2c943238b2cfadecc 100644
--- a/doc/dev/new_layer/new_layer.rst
+++ b/doc/dev/new_layer/new_layer.rst
@@ -1,3 +1,4 @@
+==================
Writing New Layers
==================
@@ -59,7 +60,7 @@ Implement C++ Class
The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below.

-It needs to derive the base class :code:`paddle::BaseLayer`, and it needs to override the following functions:
+It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions:

- constructor and destructor.
- :code:`init` function. It is used to initialize the parameters and settings.
diff --git a/doc/index.md b/doc/index.md
deleted file mode 100644
index df03a33fac98c46635eef05d88639235ac72cf8f..0000000000000000000000000000000000000000
--- a/doc/index.md
+++ /dev/null
@@ -1,22 +0,0 @@
-PaddlePaddle Documentation
-==========================
-
-User Guide
-----------
-* [Quick Start](demo/quick_start/index_en.md)
-* [Build and Installation](build/index.rst)
-* [Contribute Code](build/contribute_to_paddle.md)
-* [User Interface](ui/index.md)
-* [Model Config Interface](ui/api/trainer_config_helpers/index.md)
-* [Example and Demo](demo/index.md)
-* [Cluster Train](cluster/index.md)
-
-Development Guide
------------------
-* [Layer Documents](layer.md)
-* [Writing New Layers](dev/new_layer/index.rst)
-* [Source Code Documents](source/index.md)
-
-Algorithm Tutorial
-------------------
-* [RNN Configuration](algorithm/rnn/rnn.rst)
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..668ad75a902bdd14c6198c41380ae93e29cec0d3
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,10 @@
+PaddlePaddle Documentation
+==========================
+
+.. toctree::
+   :maxdepth: 1
+
+   introduction/index.md
+   user_guide.rst
+   dev/index.rst
+   algorithm/index.rst
diff --git a/doc/introduction/index.md b/doc/introduction/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..01f52031a1d0247cd0b885218c17001f23685239
--- /dev/null
+++ b/doc/introduction/index.md
@@ -0,0 +1,100 @@
+# Introduction
+
+PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple of lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image captioning, and so on.
+
+## 1. A Classic Problem
+
+Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - **simple linear regression**: you have observed a set of two-dimensional data points of `X` and `Y`, where `X` is an explanatory variable and `Y` is the corresponding dependent variable, and you want to recover the underlying correlation between `X` and `Y`. Linear regression can be used in many practical scenarios. For example, `X` can be a variable about house size, and `Y` a variable about house price. You can build a model that captures the relationship between them by observing the real estate market.
+
+## 2. Prepare the Data
+
+Suppose the true relationship can be characterized as `Y = 2X + 0.3`; let's see how to recover this pattern from observed data alone. Here is a piece of Python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory; the only extra thing you need to add for PaddlePaddle is a definition of the input data types.

+```python
+# dataprovider.py
+from paddle.trainer.PyDataProvider2 import *
+import random
+
+# define data types of input: 2 real numbers
+@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+def process(settings, input_file):
+    for i in xrange(2000):
+        x = random.random()
+        yield [x], [2*x+0.3]
+```
+
+## 3. Train a Neural Network in PaddlePaddle
+
+To recover this relationship between `X` and `Y`, we use a neural network with one layer of linear activation units and a square error cost layer.
Don't worry if you are not familiar with these terms; it just means that we start from a random line `Y' = wX + b`, then gradually adapt `w` and `b` to minimize the difference between `Y'` and `Y`. Here is what it looks like in PaddlePaddle:
+
+```python
+# trainer_config.py
+from paddle.trainer_config_helpers import *
+
+# 1. read data. Suppose you saved the above python code as dataprovider.py
+data_file = 'empty.list'
+with open(data_file, 'w') as f: f.writelines(' ')
+define_py_data_sources2(train_list=data_file, test_list=None,
+        module='dataprovider', obj='process', args={})
+
+# 2. learning algorithm
+settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+# 3. Network configuration
+x = data_layer(name='x', size=1)
+y = data_layer(name='y', size=1)
+y_predict = fc_layer(input=x,
+                     param_attr=ParamAttr(name='w'),
+                     size=1,
+                     act=LinearActivation(),
+                     bias_attr=ParamAttr(name='b'))
+cost = regression_cost(input=y_predict, label=y)
+outputs(cost)
+```
+
+Some of the most fundamental usages of PaddlePaddle are demonstrated:
+
+- The first part shows how to feed data into PaddlePaddle. In general, PaddlePaddle reads raw data from a list of files and then does some user-defined processing to get the real input. In this case, we only need to create a placeholder file, since we are generating synthetic data on the fly.
+
+- The second part describes the learning algorithm. It defines how adjustments are made to the model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum-based optimizer will suffice here, and it processes 12 data points each time.
+
+- Finally, the network configuration. It is usually as simple as "stacking" layers. Three kinds of layers are used in this configuration:
+   - **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used, for `X` and `Y` respectively.
+   - **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to the current layer and performs the actual computation specified by the activation function. Computation layers like this are the fundamental building blocks of a deeper model.
+   - **Cost Layer**: in the training phase, cost layers are usually the last layers of the network. They measure the performance of the current model and provide guidance for adjusting the parameters.
+
+Now that everything is ready, you can train the network with a simple command line call:
+ ```
+ paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
+ ```
+
+This means that PaddlePaddle will train this network on the synthetic dataset for 30 passes, and save all the models under the path `./output`. You will see from the messages printed during training that the model cost decreases over time, which indicates that our guess is getting closer to the real answer.
+
+
+## 4. Evaluate the Model
+
+Usually, a separate dataset left out during the training phase should be used to evaluate the models. However, here we are lucky enough to know the real answer: `w=2, b=0.3`, so a better option is to inspect the model parameters directly.
+
+In PaddlePaddle, training just produces a collection of model parameters, `w` and `b` in this case. Each parameter is saved in an individual file in the popular `numpy` array format. Here is the code that reads the parameters from the last pass.
+
+```python
+import numpy as np
+
+def load(file_name):
+    with open(file_name, 'rb') as f:
+        f.read(16)  # skip the 16-byte header of the parameter file.
+        return np.fromfile(f, dtype=np.float32)
+
+print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
+# w=1.999743, b=0.300137
+```
+
![](./parameters.png)
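The figure above plots how the parameters evolve across passes. You can dump the same numbers with a small, hypothetical extension of the `load()` helper, assuming all 30 passes from the training command were saved under `./output`:

```python
# Hypothetical sketch: print w and b after every saved pass to watch the
# estimates converge towards the true values 2 and 0.3.
for i in range(30):
    prefix = 'output/pass-%05d' % i
    print 'pass %02d: w=%.6f, b=%.6f' % (i, load(prefix + '/w'), load(prefix + '/b'))
```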
+
+Although the training starts from a random guess, you can see that the value of `w` quickly moves towards 2 and `b` towards 0.3. In the end, the predicted line is almost identical to the real answer.
+
+There you have it: you have recovered the underlying pattern between `X` and `Y` from the observed data alone.
+
+
+## 5. Where to Go from Here
+
+- Build and Installation
+- Quick Start
+- Example and Demo
diff --git a/doc/introduction/parameters.png b/doc/introduction/parameters.png
new file mode 120000
index 0000000000000000000000000000000000000000..f47e74c94fffabbd32f055febbadb1b18aa0c429
--- /dev/null
+++ b/doc/introduction/parameters.png
@@ -0,0 +1 @@
+../../doc_cn/introduction/parameters.png
\ No newline at end of file
diff --git a/doc/layer.md b/doc/layer.md
deleted file mode 100644
index 45f2e2bad542ff5c29c89201b356728cf7ca8c1c..0000000000000000000000000000000000000000
--- a/doc/layer.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Layer Documents
-
-* [Layer Source Code Document](source/gserver/layers/index.rst)
-* [Layer Python API Document](ui/api/trainer_config_helpers/layers_index.rst)
diff --git a/doc/source/api/api.rst b/doc/source/api.rst
similarity index 90%
rename from doc/source/api/api.rst
rename to doc/source/api.rst
index 6fc450202df73f5ca99c2c52f257243aa37c90d4..30396c26b61827847cc5acc29cee1c3c8e7b226e 100644
--- a/doc/source/api/api.rst
+++ b/doc/source/api.rst
@@ -1,5 +1,5 @@
API
-========
+===
.. doxygenfile:: paddle/api/PaddleAPI.h
.. doxygenfile:: paddle/api/Internal.h
diff --git a/doc/source/cuda/cuda/cuda.rst b/doc/source/cuda/cuda/cuda.rst
deleted file mode 100644
index 52f17c2b2e48aec8e6fc8d5a7e4f443ad72d96a6..0000000000000000000000000000000000000000
--- a/doc/source/cuda/cuda/cuda.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-Cuda
-=============
-
-Dynamic Link Libs
---------------------------
-
-hl_dso_loader.h
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h
-
-GPU Resources
-----------------
-
-hl_cuda.ph
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.ph
-
-hl_cuda.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.h
-
-CUDA Wrapper
---------------
-
-hl_cuda_cublas.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h
-
-hl_cuda_cudnn.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h
-
-hl_cuda_cudnn.h
-``````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph
-
-
-
diff --git a/doc/source/cuda/cuda/index.rst b/doc/source/cuda/cuda/index.rst
deleted file mode 100644
index 5fa38ff0fc8cea2b97262ea5493dea27b322dc1c..0000000000000000000000000000000000000000
--- a/doc/source/cuda/cuda/index.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-CUDA
-====================
-
-.. toctree::
-   :maxdepth: 3
-
-   cuda.rst
diff --git a/doc/source/cuda/index.rst b/doc/source/cuda/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b0fed2e7f72c9a9671e56e114edfc88d72504dbe
--- /dev/null
+++ b/doc/source/cuda/index.rst
@@ -0,0 +1,9 @@
+CUDA
+====
+
+..
toctree:: + :maxdepth: 2 + + matrix.rst + nn.rst + utils.rst diff --git a/doc/source/cuda/matrix/matrix.rst b/doc/source/cuda/matrix.rst similarity index 76% rename from doc/source/cuda/matrix/matrix.rst rename to doc/source/cuda/matrix.rst index dd4f06599c5af29a0278617ffd1bd9f6ae6b222e..b7699c83eda15d9003506f5fc57b51d52e7af823 100644 --- a/doc/source/cuda/matrix/matrix.rst +++ b/doc/source/cuda/matrix.rst @@ -1,61 +1,59 @@ Matrix -======= +====== -Base Matrix -------------- +Base +---- hl_matrix.h -`````````````````` +``````````` .. doxygenfile:: paddle/cuda/include/hl_matrix.h hl_matrix_base.h -`````````````````` +```````````````` .. doxygenfile:: paddle/cuda/include/hl_matrix_base.cuh hl_matrix_apply.cuh -`````````````````````` +``````````````````` .. doxygenfile:: paddle/cuda/include/hl_matrix_apply.cuh hl_matrix_ops.cuh -`````````````````````` +````````````````` .. doxygenfile:: paddle/cuda/include/hl_matrix_ops.cuh hl_matrix_type.cuh -`````````````````````` +`````````````````` .. doxygenfile:: paddle/cuda/include/hl_matrix_type.cuh hl_sse_matrix_kernel.cuh -`````````````````````````` +```````````````````````` .. doxygenfile:: paddle/cuda/include/hl_sse_matrix_kernel.cuh +Matrix Function +--------------- + hl_batch_transpose.h -`````````````````````````` +```````````````````` .. doxygenfile:: paddle/cuda/include/hl_batch_transpose.h -Sparse Matrix --------------- - -hl_sparse.h -`````````````````` -.. doxygenfile:: paddle/cuda/include/hl_sparse.h - -hl_sparse.ph -`````````````````````` -.. doxygenfile:: paddle/cuda/include/hl_sparse.ph - -Others ---------------- - hl_aggregate.h -`````````````````` +`````````````` .. doxygenfile:: paddle/cuda/include/hl_aggregate.h +hl_top_k.h +`````````` +.. doxygenfile:: paddle/cuda/include/hl_top_k.h + hl_table_apply.h -`````````````````` +```````````````` .. doxygenfile:: paddle/cuda/include/hl_table_apply.h -hl_top_k.h -`````````````````` -.. doxygenfile:: paddle/cuda/include/hl_top_k.h +Sparse Matrix +------------- +hl_sparse.h +``````````` +.. doxygenfile:: paddle/cuda/include/hl_sparse.h +hl_sparse.ph +```````````` +.. doxygenfile:: paddle/cuda/include/hl_sparse.ph diff --git a/doc/source/cuda/matrix/index.rst b/doc/source/cuda/matrix/index.rst deleted file mode 100644 index 63f95eb46618fd43a1140e4d857ae7e2fc89a6ae..0000000000000000000000000000000000000000 --- a/doc/source/cuda/matrix/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Matrix -==================== - -.. toctree:: - :maxdepth: 3 - - matrix.rst diff --git a/doc/source/cuda/rnn/rnn.rst b/doc/source/cuda/nn.rst similarity index 79% rename from doc/source/cuda/rnn/rnn.rst rename to doc/source/cuda/nn.rst index ce8ed96692bcb79eec0e5e6ae52a8bf5f6573418..5577d01e72a5b22847bda40528c46a28cacc1490 100644 --- a/doc/source/cuda/rnn/rnn.rst +++ b/doc/source/cuda/nn.rst @@ -1,36 +1,39 @@ -Neural Networks -================== +Neural Network +============== Base -------- +---- + .. doxygenfile:: paddle/cuda/include/hl_gpu.h -.. doxygenfile:: paddle/cuda/include/hl_cnn.h .. doxygenfile:: paddle/cuda/include/hl_functions.h .. doxygenfile:: paddle/cuda/include/hl_avx_functions.h -.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh .. doxygenfile:: paddle/cuda/include/hl_gpu_functions.cuh - -Activation Functions ------------------------ .. doxygenfile:: paddle/cuda/include/hl_activation_functions.h + +CNN Related APIs +---------------- +.. doxygenfile:: paddle/cuda/include/hl_cnn.h +.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h +.. 
doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph + RNN Related APIs ------------------ +---------------- .. doxygenfile:: paddle/cuda/include/hl_recurrent_apply.cuh .. doxygenfile:: paddle/cuda/include/hl_sequence.h LSTM Model -`````````````` +`````````` + .. doxygenfile:: paddle/cuda/include/hl_lstm.h .. dpxygenfile:: paddle/cuda/include/hl_cpu_lstm.cuh .. doxygenfile:: paddle/cuda/include/hl_gpu_lstm.cuh .. doxygenfile:: paddle/cuda/include/hl_lstm_ops.cuh GRU Model -```````````````` +````````` + .. doxygenfile:: paddle/cuda/include/hl_gru_ops.cuh .. doxygenfile:: paddle/cuda/include/hl_cpu_gru.cuh .. doxygenfile:: paddle/cuda/include/hl_gpu_gru.cuh - - diff --git a/doc/source/cuda/rnn/index.rst b/doc/source/cuda/rnn/index.rst deleted file mode 100644 index 4913e47ba1cbc1c2b93fe3e128626a8e66aedc62..0000000000000000000000000000000000000000 --- a/doc/source/cuda/rnn/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -RNN -==================== - -.. toctree:: - :maxdepth: 3 - - rnn.rst diff --git a/doc/source/cuda/utils.rst b/doc/source/cuda/utils.rst new file mode 100644 index 0000000000000000000000000000000000000000..850e8bd1c6670947e2a5f1b6f9b0d5b252117cbf --- /dev/null +++ b/doc/source/cuda/utils.rst @@ -0,0 +1,37 @@ +Utils +===== + +Dynamic Link Libs +----------------- +.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h + +GPU Resources +------------- + +hl_cuda.ph +`````````` +.. doxygenfile:: paddle/cuda/include/hl_cuda.ph + +hl_cuda.h +````````` +.. doxygenfile:: paddle/cuda/include/hl_cuda.h + +HPPL Base +--------- +.. doxygenfile:: paddle/cuda/include/hl_base.h + +CUBLAS Wrapper +-------------- +.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h + +Timer +----- +.. doxygenfile:: paddle/cuda/include/hl_time.h + +Thread Resource +--------------- +.. doxygenfile:: paddle/cuda/include/hl_thread.ph + +Device Function +--------------- +.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh diff --git a/doc/source/cuda/utils/index.rst b/doc/source/cuda/utils/index.rst deleted file mode 100644 index 7a84cbe27dd21e326add1a0a1774cbaa089e195f..0000000000000000000000000000000000000000 --- a/doc/source/cuda/utils/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Utils -==================== - -.. toctree:: - :maxdepth: 3 - - utils.rst diff --git a/doc/source/cuda/utils/utils.rst b/doc/source/cuda/utils/utils.rst deleted file mode 100644 index 1ea3e5404aa5fc792075aa09c7fd7a1986332c79..0000000000000000000000000000000000000000 --- a/doc/source/cuda/utils/utils.rst +++ /dev/null @@ -1,23 +0,0 @@ -Utilities -=========== - -HPPL Base ------------- - -hl_base.h -`````````````` -.. doxygenfile:: paddle/cuda/include/hl_base.h - -Timer ------------ - -hl_time.h -`````````````` -.. doxygenfile:: paddle/cuda/include/hl_time.h - -Thread Resource ------------ - -hl_thread.ph -`````````````` -.. doxygenfile:: paddle/cuda/include/hl_thread.ph diff --git a/doc/source/gserver/activations/index.rst b/doc/source/gserver/activations.rst similarity index 83% rename from doc/source/gserver/activations/index.rst rename to doc/source/gserver/activations.rst index ccdae41128cd6b4edddda0ac44a825082d7495c9..55b9d3be383c07842d7066280cc0e174788db1fb 100644 --- a/doc/source/gserver/activations/index.rst +++ b/doc/source/gserver/activations.rst @@ -1,5 +1,5 @@ Activations -============= +=========== .. 
doxygenclass:: paddle::ActivationFunction :members: diff --git a/doc/source/gserver/dataprovider/index.rst b/doc/source/gserver/dataprovider/index.rst deleted file mode 100644 index 4f6077f1224f90f693515d3414da4d96dc652345..0000000000000000000000000000000000000000 --- a/doc/source/gserver/dataprovider/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Data Providers Documents -========================== - -.. toctree:: - :maxdepth: 3 - - dataproviders.rst diff --git a/doc/source/gserver/dataprovider/dataproviders.rst b/doc/source/gserver/dataproviders.rst similarity index 87% rename from doc/source/gserver/dataprovider/dataproviders.rst rename to doc/source/gserver/dataproviders.rst index e8aa4bc35634a0c6ede192a15b276564f7a2c13e..c30d9d6a36a6fbb664ae001274b6a7b0e721070f 100644 --- a/doc/source/gserver/dataprovider/dataproviders.rst +++ b/doc/source/gserver/dataproviders.rst @@ -1,23 +1,27 @@ +============== Data Providers -================ +============== -Base DataProvider ------------------- +DataProviders +============= + +Base +---- .. doxygenclass:: paddle::DataProvider :members: DataProviderGroup -------------------- +----------------- .. doxygenclass:: paddle::DataProviderGroup :members: MultiDataProvider -------------------- +----------------- .. doxygenclass:: paddle::MultiDataProvider :members: PyDataProvider -=================== +============== IFieldScanner ------------- @@ -45,7 +49,7 @@ SparseValueScanner :members: SequenceScanner ------------------- +--------------- .. doxygenclass:: paddle::SparseValueScanner :members: @@ -69,8 +73,8 @@ IPyDataProvider .. doxygenclass:: paddle::PyDataProvider2 :members: -Proto Data Provider -=================== +ProtoDataProvider +================= ProtoDataProvider ---------------- @@ -78,6 +82,6 @@ ProtoDataProvider :members: ProtoSequenceDataProvider ----------------- +------------------------- .. doxygenclass:: paddle::ProtoSequenceDataProvider :members: diff --git a/doc/source/gserver/evaluators/evaluators.rst b/doc/source/gserver/evaluators.rst similarity index 96% rename from doc/source/gserver/evaluators/evaluators.rst rename to doc/source/gserver/evaluators.rst index 0c5cc85e7dff31693bdc9d2ee44ef470a0fc5f90..f5361f76cd2b1c9c004221c03ea05b2c1f3a652e 100644 --- a/doc/source/gserver/evaluators/evaluators.rst +++ b/doc/source/gserver/evaluators.rst @@ -1,14 +1,15 @@ -Base Evaluator -============== +========== +Evaluators +========== + +Base +==== -Evaluator ---------- .. doxygenclass:: paddle::Evaluator :members: - -Utils -===== +Sum +=== SumEvaluator ------------ diff --git a/doc/source/gserver/evaluators/index.rst b/doc/source/gserver/evaluators/index.rst deleted file mode 100644 index 298de3e1a32d36b9102f5ad64cc1b968f418041b..0000000000000000000000000000000000000000 --- a/doc/source/gserver/evaluators/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Evaluators -========== - -.. toctree:: - :maxdepth: 3 - - evaluators.rst diff --git a/doc/source/gserver/gradientmachines/gradientmachines.rst b/doc/source/gserver/gradientmachines.rst similarity index 54% rename from doc/source/gserver/gradientmachines/gradientmachines.rst rename to doc/source/gserver/gradientmachines.rst index 3607664c850cdf4df4e10151b05f15e275adceaf..04c8e91d0316a45ad10b0ed0513d3e8916b7c3d9 100644 --- a/doc/source/gserver/gradientmachines/gradientmachines.rst +++ b/doc/source/gserver/gradientmachines.rst @@ -1,18 +1,18 @@ Gradient Machines -================ +================= GradientMachine ---------------------- +--------------- .. 
doxygenclass:: paddle::GradientMachine :members: -GradientMachineModel --------------------- +GradientMachineMode +------------------- .. doxygenclass:: paddle::IGradientMachineMode :members: MultiGradientMachine ---------------------- +-------------------- .. doxygenclass:: paddle::MultiGradientMachine :members: @@ -21,20 +21,7 @@ TrainerThread .. doxygenclass:: paddle::TrainerThread :members: -Recurrent Gradient Machines ---------------------------- +RecurrentGradientMachine +------------------------ .. doxygenclass:: paddle::RecurrentGradientMachine :members: - -Networks -======== - -NeuralNetwork -------------- -.. doxygenclass:: paddle::NeuralNetwork - :members: - -ParallelNeuralNetwork ---------------------- -.. doxygenclass:: paddle::ParallelNeuralNetwork - :members: diff --git a/doc/source/gserver/gradientmachines/index.rst b/doc/source/gserver/gradientmachines/index.rst deleted file mode 100644 index 997c29a102f53c165c70ff11cd9650b83bcecf44..0000000000000000000000000000000000000000 --- a/doc/source/gserver/gradientmachines/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Gradient Machines Documents -============================= - -.. toctree:: - :maxdepth: 3 - - gradientmachines.rst diff --git a/doc/source/gserver/index.rst b/doc/source/gserver/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..223b00b9a9dbf1db40ce702cf0e154e5e53a8644 --- /dev/null +++ b/doc/source/gserver/index.rst @@ -0,0 +1,12 @@ +GServer +======= + +.. toctree:: + :maxdepth: 2 + + activations.rst + dataproviders.rst + evaluators.rst + gradientmachines.rst + layers.rst + neworks.rst diff --git a/doc/source/gserver/layers/layer.rst b/doc/source/gserver/layers.rst similarity index 94% rename from doc/source/gserver/layers/layer.rst rename to doc/source/gserver/layers.rst index 807b22ca140ee71208a96e2877b9c5636620b165..191b2bdff26ed17437370a12036f9dbb174dae15 100644 --- a/doc/source/gserver/layers/layer.rst +++ b/doc/source/gserver/layers.rst @@ -1,6 +1,10 @@ -Base +====== +Layers ====== +Base +==== + Layer ----- .. doxygenclass:: paddle::Layer @@ -17,7 +21,7 @@ Operator :members: Data Layer -=========== +========== .. doxygenclass:: paddle::DataLayer :members: @@ -58,6 +62,11 @@ CudnnConvLayer .. doxygenclass:: paddle::CudnnConvLayer :members: +ExpandConvBaseLayer +------------------- +.. doxygenclass:: paddle::ExpandConvBaseLayer + :members: + ExpandConvLayer --------------- .. doxygenclass:: paddle::ExpandConvLayer @@ -86,6 +95,16 @@ CudnnPoolLayer .. doxygenclass:: paddle::CudnnPoolLayer :members: +SpatialPyramidPoolLayer +----------------------- +.. doxygenclass:: paddle::SpatialPyramidPoolLayer + :members: + +MaxOutLayer +----------- +.. doxygenclass:: paddle::MaxOutLayer + :members: + Norm Layers =========== @@ -402,6 +421,11 @@ TransLayer Sampling Layers =============== +BilinearInterpLayer +------------------- +.. doxygenclass:: paddle::BilinearInterpLayer + :members: + MultinomialSampler ------------------ .. doxygenclass:: paddle::MultinomialSampler @@ -465,6 +489,11 @@ SumOfSquaresCostLayer .. doxygenclass:: paddle::SumOfSquaresCostLayer :members: +SumCostLayer +````````````````````` +.. doxygenclass:: paddle::SumCostLayer + :members: + CosSimLayer ----------- .. 
doxygenclass:: paddle::CosSimLayer diff --git a/doc/source/gserver/layers/index.rst b/doc/source/gserver/layers/index.rst deleted file mode 100644 index 559c5436b10a5977ac347611639b32d43f1ed123..0000000000000000000000000000000000000000 --- a/doc/source/gserver/layers/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Layers Documents -==================== - -.. toctree:: - :maxdepth: 3 - - layer.rst diff --git a/doc/source/gserver/neworks.rst b/doc/source/gserver/neworks.rst new file mode 100644 index 0000000000000000000000000000000000000000..73fb60d549cc88f61d2e2d18c9ec31c37cf4fa9a --- /dev/null +++ b/doc/source/gserver/neworks.rst @@ -0,0 +1,12 @@ +Networks +======== + +NeuralNetwork +------------- +.. doxygenclass:: paddle::NeuralNetwork + :members: + +ParallelNeuralNetwork +--------------------- +.. doxygenclass:: paddle::ParallelNeuralNetwork + :members: diff --git a/doc/source/index.md b/doc/source/index.md deleted file mode 100644 index 55fcdeb3dfcedd8589bf7986682708a957c05746..0000000000000000000000000000000000000000 --- a/doc/source/index.md +++ /dev/null @@ -1,49 +0,0 @@ -# Source Code Documents - -## cuda - -- [CUDA](cuda/cuda/index.rst) -- [Matrix](cuda/matrix/index.rst) -- [RNN](cuda/rnn/index.rst) -- [Utils](cuda/utils/index.rst) - -## gserver - -- [Activations](gserver/activations/index.rst) -- [Data Providers](gserver/dataprovider/index.rst) -- [Evaluators](gserver/evaluators/index.rst) -- [Gradient Machines](gserver/gradientmachines/index.rst) -- [Layers](gserver/layers/index.rst) - -## math - -- [Matrix](math/matrix/index.rst) -- [Utils](math/utils/index.rst) - -## parameter - -- [Parameter](parameter/parameter/index.rst) -- [Update](parameter/update/index.rst) -- [Optimizer](parameter/optimizer/index.rst) - -## pserver - -- [Client](pserver/client/index.rst) -- [Network](pserver/network/index.rst) -- [Server](pserver/server/index.rst) - -## trainer - -- [Trainer](trainer/trainer.rst) - -## api - -- [API](api/api.rst) - -## utils - -- [CustomStackTrace](utils/customStackTrace.rst) -- [Enumeration wrapper](utils/enum.rst) -- [Lock](utils/lock.rst) -- [Queue](utils/queue.rst) -- [Thread](utils/thread.rst) diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..36323c888ee65147f59f28160dc26ca29235ba63 --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,14 @@ +Source Code Documents +===================== + +.. toctree:: + :maxdepth: 1 + + gserver/index.rst + trainer.rst + parameter/index.rst + pserver/index.rst + api.rst + cuda/index.rst + math/index.rst + utils/index.rst diff --git a/doc/source/math/functions.rst b/doc/source/math/functions.rst new file mode 100644 index 0000000000000000000000000000000000000000..aef12e0f005226c6d40d74d0e858a11585339758 --- /dev/null +++ b/doc/source/math/functions.rst @@ -0,0 +1,10 @@ +Functions +========= + +MathFunctions +------------- +.. doxygenfile:: paddle/math/MathFunctions.h + +SIMDFunctions +------------- +.. doxygenfile:: paddle/math/SIMDFunctions.h diff --git a/doc/source/math/index.rst b/doc/source/math/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..2ec16f2b4450c870f9590aea4ad4ca7dc415b75d --- /dev/null +++ b/doc/source/math/index.rst @@ -0,0 +1,10 @@ +Math +==== + +.. 
toctree:: + :maxdepth: 2 + + vector.rst + matrix.rst + functions.rst + utils.rst diff --git a/doc/source/math/matrix.rst b/doc/source/math/matrix.rst new file mode 100644 index 0000000000000000000000000000000000000000..9bb20f618d229e1baea15e26378bf40d7c6e1783 --- /dev/null +++ b/doc/source/math/matrix.rst @@ -0,0 +1,76 @@ +Matrix +====== + +Base +---- + +BaseMatrix Template +``````````````````` +.. doxygenclass:: paddle::BaseMatrixT + :members: + +Matrix +`````` +.. doxygenclass:: paddle::Matrix + :members: + +MatrixOffset +```````````` +.. doxygenclass:: paddle::MatrixOffset + :members: + +CpuMatrix +--------- + +CpuMatrix +````````` +.. doxygenclass:: paddle::CpuMatrix + :members: + +SharedCpuMatrix +``````````````` +.. doxygenclass:: paddle::SharedCpuMatrix + :members: + +GpuMatrix +--------- +.. doxygenclass:: paddle::GpuMatrix + :members: + +CpuSparseMatrix +--------------- + +CpuSparseMatrix +``````````````` +.. doxygenclass:: paddle::CpuSparseMatrix + :members: + +SparseRowCpuMatrix +`````````````````` +.. doxygenclass:: paddle::SparseRowCpuMatrix + :members: + +SparseAutoGrowRowCpuMatrix +`````````````````````````` +.. doxygenclass:: paddle::SparseAutoGrowRowCpuMatrix + :members: + +SparsePrefetchRowCpuMatrix +`````````````````````````` +.. doxygenclass:: paddle::SparsePrefetchRowCpuMatrix + :members: + +SparseRowIdsCpuMatrix +````````````````````` +.. doxygenclass:: paddle::SparseRowIdsCpuMatrix + :members: + +CacheRowCpuMatrix +````````````````` +.. doxygenclass:: paddle::CacheRowCpuMatrix + :members: + +GpuSparseMatrix +--------------- +.. doxygenclass:: paddle::GpuSparseMatrix + :members: diff --git a/doc/source/math/matrix/index.rst b/doc/source/math/matrix/index.rst deleted file mode 100644 index 68410f2a27b68c87087f2c17de351495ac6a6cd0..0000000000000000000000000000000000000000 --- a/doc/source/math/matrix/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Matrix Documents -==================== - -.. toctree:: - :maxdepth: 3 - - matrix.rst diff --git a/doc/source/math/matrix/matrix.rst b/doc/source/math/matrix/matrix.rst deleted file mode 100644 index b12e3934f4705d4a2b7d3d790873701ddfe27d9f..0000000000000000000000000000000000000000 --- a/doc/source/math/matrix/matrix.rst +++ /dev/null @@ -1,20 +0,0 @@ -Matrix -======= - -Base --------- -.. doxygenfile:: paddle/math/BaseMatrix.h - -Sparse Matrix ----------------- -.. doxygenfile:: paddle/math/Matrix.h -.. doxygenfile:: paddle/math/Vector.h -.. doxygenfile:: paddle/math/MathUtils.h -.. doxygenfile:: paddle/math/SparseMatrix.h -.. doxygenfile:: paddle/math/SparseRowMatrix.h -.. doxygenfile:: paddle/math/CpuSparseMatrix.h - -Others ----------- -.. doxygenfile:: paddle/math/MathFunctions.h -.. doxygenfile:: paddle/math/SIMDFunctions.h diff --git a/doc/source/math/utils/utils.rst b/doc/source/math/utils.rst similarity index 62% rename from doc/source/math/utils/utils.rst rename to doc/source/math/utils.rst index 3df721a47b93bce950185f2d6ffe22d4a801af30..55d9961a390c205563a9ae4fbd87ac4ae90fc314 100644 --- a/doc/source/math/utils/utils.rst +++ b/doc/source/math/utils.rst @@ -1,9 +1,18 @@ -Utils -======= +Memory Manager +============== Memory Handle --------------- +------------- .. doxygenfile:: paddle/math/MemoryHandle.h + +Allocator +--------- .. doxygenfile:: paddle/math/Allocator.h + +PoolAllocator +````````````` .. doxygenfile:: paddle/math/PoolAllocator.h + +Storage +------- .. 
doxygenfile:: paddle/math/Storage.h diff --git a/doc/source/math/utils/index.rst b/doc/source/math/utils/index.rst deleted file mode 100644 index e5fe335da29b957706ed52662682d11c425e5908..0000000000000000000000000000000000000000 --- a/doc/source/math/utils/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Utils Documents -==================== - -.. toctree:: - :maxdepth: 3 - - utils.rst diff --git a/doc/source/math/vector.rst b/doc/source/math/vector.rst new file mode 100644 index 0000000000000000000000000000000000000000..07f7062abaf4f30b8967b594f4e16ab881f5414f --- /dev/null +++ b/doc/source/math/vector.rst @@ -0,0 +1,37 @@ +Vector +====== + +BaseVector +`````````` +.. doxygenclass:: paddle::BaseVector + :members: + +Vector Template +``````````````` +.. doxygenclass:: paddle::VectorT + :members: + +CpuVector Template +`````````````````` +.. doxygenclass:: paddle::CpuVectorT + :members: + +GpuVector Template +`````````````````` +.. doxygenclass:: paddle::GpuVectorT + :members: + +ParallelCpuVector Template +`````````````````````````` +.. doxygenclass:: paddle::ParallelCpuVectorT + :members: + +ParallelGpuVector Template +`````````````````````````` +.. doxygenclass:: paddle::ParallelGpuVectorT + :members: + +CpuGpuVector Template +````````````````````` +.. doxygenclass:: paddle::CpuGpuVectorT + :members: diff --git a/doc/source/parameter/index.rst b/doc/source/parameter/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..3bf6948dc3478574d8d125d8461235f8827e4e42 --- /dev/null +++ b/doc/source/parameter/index.rst @@ -0,0 +1,9 @@ +Parameter +========= + +.. toctree:: + :maxdepth: 2 + + parameter.rst + optimizer.rst + updater.rst diff --git a/doc/source/parameter/optimizer.rst b/doc/source/parameter/optimizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..b5b8b850b349d547c9e5508d3ebec3d7e00ea310 --- /dev/null +++ b/doc/source/parameter/optimizer.rst @@ -0,0 +1,22 @@ +Optimizer +========= + +ParameterOptimizer +------------------ +.. doxygenfile:: paddle/parameter/ParameterOptimizer.h + +Regularizer +----------- +.. doxygenfile:: paddle/parameter/Regularizer.h + +FirstOrderOptimizer +------------------- +.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h + +AverageOptimizer +---------------- +.. doxygenfile:: paddle/parameter/AverageOptimizer.h + +OptimizerWithRegularizer +------------------------ +.. doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h diff --git a/doc/source/parameter/optimizer/index.rst b/doc/source/parameter/optimizer/index.rst deleted file mode 100644 index 3338af5608a03ee853e3a5f16d2483b810215514..0000000000000000000000000000000000000000 --- a/doc/source/parameter/optimizer/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Parameter Documents -==================== - -.. toctree:: - :maxdepth: 3 - - optimizer.rst diff --git a/doc/source/parameter/optimizer/optimizer.rst b/doc/source/parameter/optimizer/optimizer.rst deleted file mode 100644 index 3d9e49217eb17541c14d8d64715278e62c99d2b4..0000000000000000000000000000000000000000 --- a/doc/source/parameter/optimizer/optimizer.rst +++ /dev/null @@ -1,7 +0,0 @@ -Optimizer -============ - -.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h -.. doxygenfile:: paddle/parameter/AverageOptimizer.h -.. doxygenfile:: paddle/parameter/ParameterOptimizer.h -.. 
doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h diff --git a/doc/source/parameter/parameter/parameter.rst b/doc/source/parameter/parameter.rst similarity index 66% rename from doc/source/parameter/parameter/parameter.rst rename to doc/source/parameter/parameter.rst index 2b7afdb4093753598d73c686b1dc81b970d199d5..2daa62d4e63b952cd93bba35ee32ce35ce768a0d 100644 --- a/doc/source/parameter/parameter/parameter.rst +++ b/doc/source/parameter/parameter.rst @@ -1,16 +1,12 @@ Parameter -============= - -Weight --------- -.. doxygenfile:: paddle/parameter/Weight.h - -Regularizer ------------- -.. doxygenfile:: paddle/parameter/Regularizer.h +========= Parameter -------------- +--------- .. doxygenfile:: paddle/parameter/Argument.h .. doxygenfile:: paddle/parameter/Parameter.h .. doxygenfile:: paddle/parameter/ParallelParameter.h + +Weight +------ +.. doxygenfile:: paddle/parameter/Weight.h diff --git a/doc/source/parameter/parameter/index.rst b/doc/source/parameter/parameter/index.rst deleted file mode 100644 index e7ed70ec4c87b3613cd8450f1e7fca1fb974afca..0000000000000000000000000000000000000000 --- a/doc/source/parameter/parameter/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Parameter Documents -==================== - -.. toctree:: - :maxdepth: 3 - - parameter.rst diff --git a/doc/source/parameter/update/index.rst b/doc/source/parameter/update/index.rst deleted file mode 100644 index 1bbd73319396e7b8ea32c78e0fe3569919bacf2d..0000000000000000000000000000000000000000 --- a/doc/source/parameter/update/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Parameter Documents -==================== - -.. toctree:: - :maxdepth: 3 - - update.rst diff --git a/doc/source/parameter/update/update.rst b/doc/source/parameter/updater.rst similarity index 75% rename from doc/source/parameter/update/update.rst rename to doc/source/parameter/updater.rst index c417602f0338dbd84ae2bd2ca4eb09330202a0e8..dfa22e8e7d1d6f0713974835de93194d2cc58e6f 100644 --- a/doc/source/parameter/update/update.rst +++ b/doc/source/parameter/updater.rst @@ -1,7 +1,14 @@ -Update -========== +Updater +======= +Base +---- .. doxygenfile:: paddle/parameter/ParameterUpdaterBase.h + +Hook +---- .. doxygenfile:: paddle/parameter/ParameterUpdaterHook.h -.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h +Functions +--------- +.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h diff --git a/doc/source/pserver/client.rst b/doc/source/pserver/client.rst new file mode 100644 index 0000000000000000000000000000000000000000..e5bba0706a1d919104b85e23861ba490a2c828db --- /dev/null +++ b/doc/source/pserver/client.rst @@ -0,0 +1,12 @@ +Client +====== + +BaseClient +---------- +.. doxygenclass:: paddle::BaseClient + :members: + +ParameterClient2 +---------------- +.. doxygenclass:: paddle::ParameterClient2 + :members: diff --git a/doc/source/pserver/client/client.rst b/doc/source/pserver/client/client.rst deleted file mode 100644 index fc7ed90d3dc8beb0baa30d63ccc956fbba2a4e4c..0000000000000000000000000000000000000000 --- a/doc/source/pserver/client/client.rst +++ /dev/null @@ -1,14 +0,0 @@ -Client -========= - -.. doxygenclass:: paddle::BaseClient - :members: - :protected-members: - :private-members: - :undoc-members: - -.. 
doxygenclass:: paddle::ParameterClient2 - :members: - :protected-members: - :private-members: - :undoc-members: diff --git a/doc/source/pserver/client/index.rst b/doc/source/pserver/client/index.rst deleted file mode 100644 index dc924c9ca8e7b9965638fd299dc2f5e78591c91b..0000000000000000000000000000000000000000 --- a/doc/source/pserver/client/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Client Documents -==================== - -.. toctree:: - :maxdepth: 3 - - client.rst diff --git a/doc/source/pserver/index.rst b/doc/source/pserver/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..0031e9476bd063511cc2f0a8c209f35627cf44ba --- /dev/null +++ b/doc/source/pserver/index.rst @@ -0,0 +1,10 @@ +PServer +======= + +.. toctree:: + :maxdepth: 2 + + client.rst + network.rst + server.rst + utils.rst diff --git a/doc/source/pserver/network.rst b/doc/source/pserver/network.rst new file mode 100644 index 0000000000000000000000000000000000000000..7004c9d91fa9f2af11e15791ef682c108761027e --- /dev/null +++ b/doc/source/pserver/network.rst @@ -0,0 +1,27 @@ +Network +======= + +SocketServer +------------ +.. doxygenclass:: paddle::SocketServer + :members: + +SocketWorker +------------ +.. doxygenclass:: paddle::SocketWorker + :members: + +SocketClient +------------ +.. doxygenclass:: paddle::SocketClient + :members: + +SocketChannel +------------- +.. doxygenclass:: paddle::SocketChannel + :members: + +MessageReader +------------- +.. doxygenclass:: paddle::MsgReader + :members: diff --git a/doc/source/pserver/network/index.rst b/doc/source/pserver/network/index.rst deleted file mode 100644 index 2fdf95e17d339d69de8e027d92cbb385e2bd51ec..0000000000000000000000000000000000000000 --- a/doc/source/pserver/network/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Network Documents -==================== - -.. toctree:: - :maxdepth: 3 - - network.rst diff --git a/doc/source/pserver/network/network.rst b/doc/source/pserver/network/network.rst deleted file mode 100644 index e000ff8dbbdc37e9d638d18d20a8ba53e21dd245..0000000000000000000000000000000000000000 --- a/doc/source/pserver/network/network.rst +++ /dev/null @@ -1,42 +0,0 @@ -Network -========== - -Socket Server ----------------- -.. doxygenclass:: paddle::SocketServer - :members: - :protected-members: - :private-members: - :undoc-members: - -Socket Worker ----------------- -.. doxygenclass:: paddle::SocketWorker - :members: - :protected-members: - :private-members: - :undoc-members: - -Socket Client ----------------- -.. doxygenclass:: paddle::SocketClient - :members: - :protected-members: - :private-members: - :undoc-members: - -Socket Channel ---------------- -.. doxygenclass:: paddle::SocketChannel - :members: - :protected-members: - :private-members: - :undoc-members: - -Message Reader ---------------- -.. doxygenclass:: paddle::MsgReader - :members: - :protected-members: - :private-members: - :undoc-members: diff --git a/doc/source/pserver/server.rst b/doc/source/pserver/server.rst new file mode 100644 index 0000000000000000000000000000000000000000..35301acf8ffe3d97e6124c37cf8fe1b43071e14e --- /dev/null +++ b/doc/source/pserver/server.rst @@ -0,0 +1,12 @@ +Server +====== + +ProtoServer +----------- +.. doxygenclass:: paddle::ProtoServer + :members: + +ParameterServer2 +---------------- +.. 
doxygenclass:: paddle::ParameterServer2 + :members: diff --git a/doc/source/pserver/server/index.rst b/doc/source/pserver/server/index.rst deleted file mode 100644 index 09e3530bfeaf56ebbadb1694a69a036813e8970f..0000000000000000000000000000000000000000 --- a/doc/source/pserver/server/index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Server Documents -==================== - -.. toctree:: - :maxdepth: 3 - - server.rst diff --git a/doc/source/pserver/server/server.rst b/doc/source/pserver/server/server.rst deleted file mode 100644 index f3110fdd731d246ce4211d05e32ddd98584bdbb7..0000000000000000000000000000000000000000 --- a/doc/source/pserver/server/server.rst +++ /dev/null @@ -1,14 +0,0 @@ -Server -========== - -.. doxygenclass:: paddle::ProtoServer - :members: - :protected-members: - :private-members: - :undoc-members: - -.. doxygenclass:: paddle::ParameterServer2 - :members: - :protected-members: - :private-members: - :undoc-members: diff --git a/doc/source/trainer/trainer.rst b/doc/source/trainer.rst similarity index 94% rename from doc/source/trainer/trainer.rst rename to doc/source/trainer.rst index 12c24597e7f99cd489204602ae25a89d7b960630..85f1feb4fc941f94e65a6b1d037445d2367f65ec 100644 --- a/doc/source/trainer/trainer.rst +++ b/doc/source/trainer.rst @@ -14,7 +14,7 @@ RemoteParameterUpdater :members: ConcurrentRemoteParameterUpdater ---------------------------------- +-------------------------------- .. doxygenclass:: paddle::ConcurrentRemoteParameterUpdater :members: diff --git a/doc/source/utils/customStackTrace.rst b/doc/source/utils/customStackTrace.rst index a4e6f05a406f33256548fc0ef32bbbf3daff1536..cdc8930739eb4b4d6308ff1fbce170d2977d42e8 100644 --- a/doc/source/utils/customStackTrace.rst +++ b/doc/source/utils/customStackTrace.rst @@ -1,9 +1,4 @@ CustomStackTrace ================ - - -class CustomStackTrace ----------------------- - .. doxygenclass:: paddle::CustomStackTrace :members: diff --git a/doc/source/utils/enum.rst b/doc/source/utils/enum.rst index 17166d35f7cfa63e51058cc5f86165b1e22bbe1e..e0da75afe164f9dab59b862faa7230fc57423e50 100644 --- a/doc/source/utils/enum.rst +++ b/doc/source/utils/enum.rst @@ -1,9 +1,3 @@ -enumeration_wrapper +Enumeration wrapper =================== - - -namespace paddle::enumeration_wrapper -------------------------------------- - .. doxygennamespace:: paddle::enumeration_wrapper - diff --git a/doc/source/utils/index.rst b/doc/source/utils/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..7ddc47d1726f7627852be922d2b769d0752aa799 --- /dev/null +++ b/doc/source/utils/index.rst @@ -0,0 +1,11 @@ +Utils +===== + +.. toctree:: + :maxdepth: 2 + + lock.rst + queue.rst + thread.rst + customStackTrace.rst + enum.rst diff --git a/doc/source/utils/lock.rst b/doc/source/utils/lock.rst index 0b027e403f49fc1720904cf4b502d81e4148e1e3..f011acb9431f0f3dc3b2ba27fcfe71fe6eb07ae9 100644 --- a/doc/source/utils/lock.rst +++ b/doc/source/utils/lock.rst @@ -1,37 +1,32 @@ -Thread -====== +Lock +==== - -class Thread ------------- - -.. doxygenclass:: paddle::Thread +RWLock +------ +.. doxygenclass:: paddle::RWLock :members: - -class ThreadWorker ------------------- - -.. doxygenclass:: paddle::ThreadWorker +ReadLockGuard +------------- +.. doxygenclass:: paddle::ReadLockGuard :members: - -class SyncThreadPool --------------------- - -.. doxygenclass:: paddle::SyncThreadPool +SpinLock +-------- +.. doxygenclass:: paddle::SpinLock :members: - - -class MultiThreadWorker ------------------------ -.. 
doxygenclass:: paddle::MultiThreadWorker +Semaphore +--------- +.. doxygenclass:: paddle::Semaphore :members: - -class AsyncThreadPool ---------------------- +ThreadBarrier +------------- +.. doxygenclass:: paddle::ThreadBarrier + :members: -.. doxygenclass:: paddle::AsyncThreadPool +LockedCondition +--------------- +.. doxygenclass:: paddle::LockedCondition :members: diff --git a/doc/source/utils/queue.rst b/doc/source/utils/queue.rst index 72a464ca67288d0d0e24980d59c3bbc85f111081..98192648e2d61e622c2337d10ba024dd676ee685 100644 --- a/doc/source/utils/queue.rst +++ b/doc/source/utils/queue.rst @@ -1,16 +1,12 @@ Queue ===== - -class Queue ------------- - +Queue +----- .. doxygenclass:: paddle::Queue :members: - -class BlockingQueue -------------------- - +BlockingQueue +------------- .. doxygenclass:: paddle::BlockingQueue :members: diff --git a/doc/source/utils/thread.rst b/doc/source/utils/thread.rst index 2eb67dde6a945cc8e250989f7fc8cefed942950e..23d379a9894e5fc22bc6795a480a53d768e608e6 100644 --- a/doc/source/utils/thread.rst +++ b/doc/source/utils/thread.rst @@ -1,40 +1,27 @@ -Lock -==== +Thread +====== - -class RWLock ------------- - -.. doxygenclass:: paddle::RWLock +Thread +------ +.. doxygenclass:: paddle::Thread :members: -class ReadLockGuard -------------------- - -.. doxygenclass:: paddle::ReadLockGuard +ThreadWorker +------------ +.. doxygenclass:: paddle::ThreadWorker :members: -class SpinLock +SyncThreadPool -------------- - -.. doxygenclass:: paddle::SpinLock +.. doxygenclass:: paddle::SyncThreadPool :members: - -class Semaphore ---------------- - -.. doxygenclass:: paddle::Semaphore - :members: - -class ThreadBarrier -------------------- - -.. doxygenclass:: paddle::ThreadBarrier + +MultiThreadWorker +----------------- +.. doxygenclass:: paddle::MultiThreadWorker :members: -class LockedCondition ---------------------- - -.. doxygenclass:: paddle::LockedCondition +AsyncThreadPool +--------------- +.. doxygenclass:: paddle::AsyncThreadPool :members: - diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/ui/api/trainer_config_helpers/activations.rst index c4e14ed779efb6f6601d2c5fa41764f318c82848..269e6491e7ebe3899c3fb24fca756a393043473b 100644 --- a/doc/ui/api/trainer_config_helpers/activations.rst +++ b/doc/ui/api/trainer_config_helpers/activations.rst @@ -1,3 +1,7 @@ +=========== +Activations +=========== + BaseActivation ============== @@ -32,6 +36,13 @@ LinearActivation .. automodule:: paddle.trainer_config_helpers.activations :members: LinearActivation :noindex: + +LogActivation +================== + +.. automodule:: paddle.trainer_config_helpers.activations + :members: LogActivation + :noindex: SquareActivation ================ @@ -95,4 +106,3 @@ STanhActivation .. automodule:: paddle.trainer_config_helpers.activations :members: STanhActivation :noindex: - diff --git a/doc/ui/api/trainer_config_helpers/activations_index.rst b/doc/ui/api/trainer_config_helpers/activations_index.rst deleted file mode 100644 index 1c0b71ab77eec62859c1d7615f6ebe637f3108ac..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/activations_index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Activations -=========== - -.. 
toctree:: - :maxdepth: 3 - - activations.rst diff --git a/doc/ui/api/trainer_config_helpers/evaluators.rst b/doc/ui/api/trainer_config_helpers/evaluators.rst index 0586c9907e472dd98c5f7e9098251f3bc6b88bab..d6a79c13e2316b0fd3d53eb47960a767bcf8abdb 100644 --- a/doc/ui/api/trainer_config_helpers/evaluators.rst +++ b/doc/ui/api/trainer_config_helpers/evaluators.rst @@ -1,3 +1,7 @@ +========== +Evaluators +========== + Base ==== .. automodule:: paddle.trainer_config_helpers.evaluators diff --git a/doc/ui/api/trainer_config_helpers/evaluators_index.rst b/doc/ui/api/trainer_config_helpers/evaluators_index.rst deleted file mode 100644 index 298de3e1a32d36b9102f5ad64cc1b968f418041b..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/evaluators_index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Evaluators -========== - -.. toctree:: - :maxdepth: 3 - - evaluators.rst diff --git a/doc/ui/api/trainer_config_helpers/index.md b/doc/ui/api/trainer_config_helpers/index.md deleted file mode 100644 index 00fa99bb3fa4c407dc867f91f4c7c495dc4061a1..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/index.md +++ /dev/null @@ -1,10 +0,0 @@ -# Model Config Interface - -* [Optimizer](optimizers_index.rst) -* [Data Source](data_sources.rst) -* [Layers](layers_index.rst) -* [Activations](activations_index.rst) -* [Poolings](poolings_index.rst) -* [Networks](networks_index.rst) -* [Evaluators](evaluators_index.rst) -* [Parameter and Extra Layer Attribute](attrs.rst) diff --git a/doc/ui/api/trainer_config_helpers/index.rst b/doc/ui/api/trainer_config_helpers/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..8395eb75710b3e67ec0c5442f79c999bdacdff42 --- /dev/null +++ b/doc/ui/api/trainer_config_helpers/index.rst @@ -0,0 +1,14 @@ +Model Config Interface +====================== + +.. toctree:: + :maxdepth: 1 + + optimizers.rst + data_sources.rst + layers.rst + activations.rst + poolings.rst + networks.rst + evaluators.rst + attrs.rst diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst index 55f5623b0faef5553064bfc07e4854bed251f623..b487b739a719e9f7118efcc143301da36f7a978e 100644 --- a/doc/ui/api/trainer_config_helpers/layers.rst +++ b/doc/ui/api/trainer_config_helpers/layers.rst @@ -1,3 +1,7 @@ +====== +Layers +====== + Base ====== @@ -46,6 +50,12 @@ conv_operator :members: conv_operator :noindex: +conv_projection +--------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: conv_projection + :noindex: + conv_shift_layer ------------------ .. automodule:: paddle.trainer_config_helpers.layers @@ -71,6 +81,18 @@ img_pool_layer -------------- .. automodule:: paddle.trainer_config_helpers.layers :members: img_pool_layer + :noindex: + +spp_layer +-------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: spp_layer + :noindex: + +maxout_layer +------------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: maxout_layer :noindex: Norm Layer @@ -169,6 +191,12 @@ embedding_layer :members: embedding_layer :noindex: +scaling_projection +------------------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: scaling_projection + :noindex: + dotmul_projection ----------------- .. automodule:: paddle.trainer_config_helpers.layers @@ -248,6 +276,12 @@ expand_layer :members: expand_layer :noindex: +repeat_layer +------------ +.. 
automodule:: paddle.trainer_config_helpers.layers + :members: repeat_layer + :noindex: + Math Layers =========== @@ -269,6 +303,12 @@ interpolation_layer :members: interpolation_layer :noindex: +bilinear_interp_layer +---------------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: bilinear_interp_layer + :noindex: + power_layer ----------- .. automodule:: paddle.trainer_config_helpers.layers @@ -389,6 +429,12 @@ hsigmoid :members: hsigmoid :noindex: +sum_cost +--------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: sum_cost + :noindex: + Check Layer ============ diff --git a/doc/ui/api/trainer_config_helpers/layers_index.rst b/doc/ui/api/trainer_config_helpers/layers_index.rst deleted file mode 100644 index c0daab152148ce769948f600c3101bd79f5a1013..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/layers_index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Layers -====== - -.. toctree:: - :maxdepth: 3 - - layers.rst diff --git a/doc/ui/api/trainer_config_helpers/networks.rst b/doc/ui/api/trainer_config_helpers/networks.rst index 2a15b34eaea0b763f992a7225550e6af747f303c..29c52c5ce3078f1755162dbbdd65a059d8ba9fa4 100644 --- a/doc/ui/api/trainer_config_helpers/networks.rst +++ b/doc/ui/api/trainer_config_helpers/networks.rst @@ -1,3 +1,9 @@ +======== +Networks +======== + +The networks module contains pieces of neural network that combine multiple layers. + NLP === @@ -111,4 +117,3 @@ outputs .. automodule:: paddle.trainer_config_helpers.networks :members: outputs :noindex: - diff --git a/doc/ui/api/trainer_config_helpers/networks_index.rst b/doc/ui/api/trainer_config_helpers/networks_index.rst deleted file mode 100644 index 17bc4dfaa6c4ed3cd5daf0476d0d4c15a2067a22..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/networks_index.rst +++ /dev/null @@ -1,9 +0,0 @@ -Networks -======== - -The networks module contains pieces of neural network that combine multiple layers. - -.. toctree:: - :maxdepth: 3 - - networks.rst diff --git a/doc/ui/api/trainer_config_helpers/optimizers.rst b/doc/ui/api/trainer_config_helpers/optimizers.rst index b487fec64c4ebb5cfbdff1aa101d9b3675776a2c..7ca4e34156e273caf66cc71e6927bfb23bb5235e 100644 --- a/doc/ui/api/trainer_config_helpers/optimizers.rst +++ b/doc/ui/api/trainer_config_helpers/optimizers.rst @@ -1,3 +1,7 @@ +========== +Optimizers +========== + BaseSGDOptimizer ================ .. automodule:: paddle.trainer_config_helpers.optimizers @@ -51,4 +55,3 @@ settings .. automodule:: paddle.trainer_config_helpers.optimizers :members: settings :noindex: - diff --git a/doc/ui/api/trainer_config_helpers/optimizers_index.rst b/doc/ui/api/trainer_config_helpers/optimizers_index.rst deleted file mode 100644 index f39f94f0cd6e1a6c3c25eeceb7820a7fbc070570..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/optimizers_index.rst +++ /dev/null @@ -1,7 +0,0 @@ -Optimizers -========== - -.. toctree:: - :maxdepth: 3 - - optimizers.rst diff --git a/doc/ui/api/trainer_config_helpers/poolings.rst b/doc/ui/api/trainer_config_helpers/poolings.rst index caadec639383aad24ed477d8bdaeaa31c0026bb5..66566809d26f59263597b5286c5b27e0bbc9415a 100644 --- a/doc/ui/api/trainer_config_helpers/poolings.rst +++ b/doc/ui/api/trainer_config_helpers/poolings.rst @@ -1,3 +1,7 @@ +======== +Poolings +======== + BasePoolingType =============== .. automodule:: paddle.trainer_config_helpers.poolings @@ -27,4 +31,3 @@ SquareRootNPooling .. 
automodule:: paddle.trainer_config_helpers.poolings :members: SquareRootNPooling :noindex: - diff --git a/doc/ui/api/trainer_config_helpers/poolings_index.rst b/doc/ui/api/trainer_config_helpers/poolings_index.rst deleted file mode 100644 index 250d3fa69c0dcedfd689b685fe7b47ec71d02fee..0000000000000000000000000000000000000000 --- a/doc/ui/api/trainer_config_helpers/poolings_index.rst +++ /dev/null @@ -1,9 +0,0 @@ -Poolings -======== - -These pooling types are used for sequence input, not for images. - -.. toctree:: - :maxdepth: 3 - - poolings.rst diff --git a/doc/ui/cmd_argument/argument_outline.md b/doc/ui/cmd_argument/argument_outline.md index 98dadc270dcac8cb5c05f3065c98bac78671d7fa..d6cc2c6ed7cc1b9209d56b4348497427efe40ac3 100644 --- a/doc/ui/cmd_argument/argument_outline.md +++ b/doc/ui/cmd_argument/argument_outline.md @@ -183,7 +183,7 @@ It looks like there are a lot of arguments. However, most of them are for develo -GPUgpu_id +GPUgpu_id √√√√ @@ -207,6 +207,11 @@ It looks like there are a lot of arguments. However, most of them are for develo √√√√ + +cudnn_conv_workspace_limit_in_mb +√√√√ + + RNN beam_size diff --git a/doc/ui/cmd_argument/detail_introduction.md b/doc/ui/cmd_argument/detail_introduction.md index 0d0362d022a72b597e78e760893c91df449e5745..07608e5edf740bd3e1242913f1d2d7589ad313aa 100644 --- a/doc/ui/cmd_argument/detail_introduction.md +++ b/doc/ui/cmd_argument/detail_introduction.md @@ -163,6 +163,10 @@ - Choose the path from which to dynamically load the NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH - type: string (default: "", null) +* `--cudnn_conv_workspace_limit_in_mb` + - Specify the maximum workspace size, in MB, that cuDNN convolution algorithms are allowed to use. + - type: int32 (default: 4096, i.e. 4GB) + ## NLP: RNN/LSTM/GRU * `--rnn_use_batch` - Whether to use the batch method for calculation in simple RecurrentLayer. 
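Both of the switches documented above are ordinary runtime arguments of `paddle train`, so they can be combined with the usual configuration flags. A hypothetical invocation (the flag values below are purely illustrative) might look like:

```bash
paddle train --config=trainer_config.py \
             --use_gpu=true \
             --cudnn_conv_workspace_limit_in_mb=1024 \
             --rnn_use_batch=true
```

A smaller workspace limit trades some convolution speed for lower GPU memory usage, which can help on cards with limited free memory.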
diff --git a/doc/ui/predict/predict_sample.py b/doc/ui/predict/predict_sample.py index d55d2c730dece07f068b728d0a75f34c70b817bd..63e8b36d26057d4a87dabb8745de8e13efe2524f 100644 --- a/doc/ui/predict/predict_sample.py +++ b/doc/ui/predict/predict_sample.py @@ -16,82 +16,113 @@ from py_paddle import swig_paddle, DataProviderConverter from paddle.trainer.PyDataProvider2 import dense_vector from paddle.trainer.config_parser import parse_config -TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686, - 0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451, - 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.886275, - 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157, 0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.670588, 0.992157, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.141176, - 0.992157, 0.992157, 0.611765, 0.054902, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157, - 0.529412, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.603922, 0.992157, - 0.992157, 0.992157, 0.603922, 0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157, - 0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157, 0.992157, 0.992157, 0.992157, - 0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0, 0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098, 0.992157, - 0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157, 0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0.070588, 0.992157, 0.992157, 0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471, - 0, 0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, - 0.992157, 0.713725, 0, 0, 0, 0, 0.627451, 0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157, - 0.776471, 0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157, 0.968627, 0.168627, 0, 0, - 0, 0.423529, 0.992157, 0.992157, 0.364706, 0, 0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922, 0.466667, 0.992157, - 0.988235, 0.976471, 0.992157, 0.992157, 0.788235, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275, - 0.882353, 0.364706, 0, 0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392, - 0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569, 0, 0, 0, 0, 0, 0, 0, 0.105882, - 0.733333, 0.976471, 0.811765, 0.713725, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157, - 0.321569, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, - 0.94902, 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.968627, - 0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039, - 0.25098, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], [ - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255, 0.333333, - 0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.027451, 0.223529, 0.776471, - 0.964706, 0.988235, 0.988235, 0.988235, 0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961, 0.87451, - 0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.188235, 0.647059, 0.988235, 0.988235, 0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235, - 0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157, 0.941176, - 0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.039216, 0.639216, 0.933333, 0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137, - 0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235, 0.992157, 0.988235, 0.815686, - 0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333, 0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0.211765, 0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0.698039, - 0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.890196, 0.988235, 0.988235, - 0.745098, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0.2, 0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0, 0, 0, 0, 0.447059, - 0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.988235, 0.988235, 0.988235, - 0.992157, 0.47451, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118, 0.082353, 0, 0, 0, 0, 0, 0, - 0.027451, 0.647059, 0.992157, 0.654902, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725, - 0.329412, 0.376471, 0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294, 0.219608, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235, 0.988235, 0.741176, 0.309804, 0, 0, 0, 0, - 0, 0, 0.529412, 0.988235, 0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157, - 0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157, 0.882353, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023529, 0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235, - 0.988235, 0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, - 0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627, 0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0]]] +TEST_DATA = [[[ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686, 0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451, 0.992157, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.070588, 0.886275, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157, + 0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.670588, 0.992157, + 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0.141176, 0.992157, 0.992157, 0.611765, 0.054902, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157, 0.529412, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0.603922, 0.992157, 0.992157, 0.992157, 0.603922, + 0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157, + 0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157, + 0.992157, 0.992157, 0.992157, 0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0, + 0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098, + 0.992157, 0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157, + 0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157, + 0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471, 0, + 0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0.070588, 0.992157, 0.992157, 0.713725, 0, 0, 0, 0, 0.627451, + 0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157, 0.776471, + 0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157, + 0.968627, 0.168627, 0, 0, 0, 0.423529, 0.992157, 0.992157, 0.364706, 0, + 0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922, + 0.466667, 0.992157, 0.988235, 0.976471, 0.992157, 0.992157, 0.788235, + 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275, 0.882353, 0.364706, 0, + 0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392, + 0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569, + 0, 0, 0, 0, 0, 0, 0, 0.105882, 0.733333, 0.976471, 0.811765, 0.713725, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157, 0.321569, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0.94902, 0.219608, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0.968627, 0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039, 0.25098, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0 +]], [[ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255, + 0.333333, 0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0.027451, 0.223529, 0.776471, 0.964706, 0.988235, 0.988235, 0.988235, + 0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961, + 0.87451, 0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.647059, 0.988235, 0.988235, + 0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235, 0.988235, + 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157, + 0.941176, 0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157, + 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.039216, 0.639216, 0.933333, + 0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137, + 0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235, + 0.992157, 0.988235, 0.815686, 0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333, + 0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.211765, + 0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0, + 0, 0, 0.698039, 0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.188235, 0.890196, 0.988235, 0.988235, 0.745098, 0.047059, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, + 0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0, + 0, 0, 0, 0.447059, 0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.623529, 0.988235, 0.988235, 0.988235, 0.988235, 0.992157, 0.47451, 0, 0, + 0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118, + 0.082353, 0, 0, 0, 0, 0, 0, 0.027451, 0.647059, 0.992157, 0.654902, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725, 0.329412, 0.376471, + 0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294, + 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235, + 0.988235, 0.741176, 0.309804, 0, 0, 0, 0, 0, 0, 0.529412, 0.988235, + 0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157, + 0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157, + 0.882353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0.023529, + 0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235, 0.988235, + 0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627, + 0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0 +]]] def main(): conf = parse_config("./mnist_model/trainer_config.py", "") print conf.data_config.load_data_args - network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) + network = swig_paddle.GradientMachine.createFromConfigProto( + conf.model_config) assert isinstance(network, swig_paddle.GradientMachine) # For code hint. network.loadParameters("./mnist_model/") converter = DataProviderConverter([dense_vector(784)]) diff --git a/doc/user_guide.rst b/doc/user_guide.rst new file mode 100644 index 0000000000000000000000000000000000000000..d4deb3ca5a4523b509ea5082f32be8a315570dea --- /dev/null +++ b/doc/user_guide.rst @@ -0,0 +1,13 @@ +User Guide +========== + +.. toctree:: + :maxdepth: 1 + + demo/quick_start/index_en.md + build/index.rst + build/contribute_to_paddle.md + ui/index.md + ui/api/trainer_config_helpers/index.rst + demo/index.md + cluster/index.md diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.md b/doc_cn/algorithm/rnn/hierarchical-layer.md index 5282bbbcb82d00f5aed7b784d2bd44f9ec33fa42..519653df081d6e7919ada3cbff6aaf4d2a2f6115 100644 --- a/doc_cn/algorithm/rnn/hierarchical-layer.md +++ b/doc_cn/algorithm/rnn/hierarchical-layer.md @@ -1,66 +1,66 @@ -# 支持双层序列作为输入的Layer - -## 概述 - -在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。 - -双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。 - -我们可以按照如下层次定义非序列,单层序列,以及双层序列。 - -+ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型 -+ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息 -+ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列 - - -在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。 -## pooling_layer - -pooling_layer的使用示例如下,详细见配置API。 -```python -seq_pool = pooling_layer(input=layer, - pooling_type=AvgPooling(), - agg_level=AggregateLevel.EACH_SEQUENCE) -``` -- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。 -- `agg_level=AggregateLevel.TIMESTEP`时(默认值): - - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 - - 输入:一个双层序列,或一个单层序列 - - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) -- `agg_level=AggregateLevel.EACH_SEQUENCE`时: - - 作用:一个双层序列经过运算变成一个单层序列 - - 输入:必须是一个双层序列 - - 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值) - -## last_seq 和 first_seq - -last_seq的使用示例如下(first_seq类似),详细见配置API。 -```python -last = last_seq(input=layer, - agg_level=AggregateLevel.EACH_SEQUENCE) -``` -- `agg_level=AggregateLevel.TIMESTEP`时(默认值): - - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 - - 输入:一个双层序列或一个单层序列 - - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 -- `agg_level=AggregateLevel.EACH_SEQUENCE`时: - - 作用:一个双层序列经过运算变成一个单层序列 - - 输入:必须是一个双层序列 - - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 - -## expand_layer - -expand_layer的使用示例如下,详细见配置API。 -```python -expand = expand_layer(input=layer1, - expand_as=layer2, - expand_level=ExpandLevel.FROM_TIMESTEP) -``` -- 
`expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值): - - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列 - - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息 - - 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝 -- `expand_level=ExpandLevel.FROM_SEQUENCE`时: - - 作用:一个单层序列经过运算扩展成一个双层序列 - - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息 - - 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。 \ No newline at end of file +# 支持双层序列作为输入的Layer + +## 概述 + +在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。 + +双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。 + +我们可以按照如下层次定义非序列,单层序列,以及双层序列。 + ++ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型 ++ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息 ++ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列 + + +在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。 +## pooling_layer + +pooling_layer的使用示例如下,详细见配置API。 +```python +seq_pool = pooling_layer(input=layer, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.EACH_SEQUENCE) +``` +- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。 +- `agg_level=AggregateLevel.TIMESTEP`时(默认值): + - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 + - 输入:一个双层序列,或一个单层序列 + - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) +- `agg_level=AggregateLevel.EACH_SEQUENCE`时: + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值) + +## last_seq 和 first_seq + +last_seq的使用示例如下(first_seq类似),详细见配置API。 +```python +last = last_seq(input=layer, + agg_level=AggregateLevel.EACH_SEQUENCE) +``` +- `agg_level=AggregateLevel.TIMESTEP`时(默认值): + - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 + - 输入:一个双层序列或一个单层序列 + - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 +- `agg_level=AggregateLevel.EACH_SEQUENCE`时: + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 + +## expand_layer + +expand_layer的使用示例如下,详细见配置API。 +```python +expand = expand_layer(input=layer1, + expand_as=layer2, + expand_level=ExpandLevel.FROM_TIMESTEP) +``` +- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值): + - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列 + - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息 + - 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝 +- `expand_level=ExpandLevel.FROM_SEQUENCE`时: + - 作用:一个单层序列经过运算扩展成一个双层序列 + - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息 + - 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。 diff --git a/doc_cn/algorithm/rnn/hierarchical-rnn.md b/doc_cn/algorithm/rnn/hierarchical-rnn.md index 4a85cf336146ef368b04c13fdc74f39ee7a361d3..c184a34e85a571e98e88c14ef653356fdd555a19 100644 --- a/doc_cn/algorithm/rnn/hierarchical-rnn.md +++ b/doc_cn/algorithm/rnn/hierarchical-rnn.md @@ -1,403 +1,403 @@ -# 双层RNN配置与示例 - -我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中,通过多组语义相同的单双层RNN配置,讲解如何使用双层RNN。 - -## 示例1:双进双出,subseq间无memory - -配置:单层RNN(`sequence_layer_group`)和双层RNN(`sequence_nest_layer_group`),语义完全相同。 - -### 读取双层序列的方法 - -首先,我们看一下单双层序列的不同数据组织形式(您也可以采用别的组织形式): - -- 单层序列的数据(`Sequence/tour_train_wdseg`)如下,一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。 - -```text -2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 -2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 
就 第一天 给 了 一次性杯子 * -2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 -2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 -2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . -2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 -2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! -2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 -2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * -2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 -``` - -- 双层序列的数据(`Sequence/tour_train_wdseg.nest`)如下,一共有4个样本。样本间用空行分开,代表不同的双层序列,序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。 - -```text -2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 -2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * - -2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 -2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 -2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . - -2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 -2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! - -2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 -2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * -2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 -``` - -其次,我们看一下单双层序列的不同dataprovider(见`sequenceGen.py`): - -- 单层序列的dataprovider如下: - - word_slot是integer_value_sequence类型,代表单层序列。 - - label是integer_value类型,代表一个向量。 - -```python -def hook(settings, dict_file, **kwargs): - settings.word_dict = dict_file - settings.input_types = [integer_value_sequence(len(settings.word_dict)), - integer_value(3)] - -@provider(init_hook=hook) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - label, comment = line.strip().split('\t') - label = int(''.join(label.split())) - words = comment.split() - word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] - yield word_slot, label -``` - -- 双层序列的dataprovider如下: - - word_slot是integer_value_sub_sequence类型,代表双层序列。 - - label是integer_value_sequence类型,代表单层序列,即一个子句一个label。注意:也可以为integer_value类型,代表一个向量,即一个句子一个label。通常根据任务需求进行不同设置。 - - 关于dataprovider中input_types的详细用法,参见PyDataProvider2。 - -```python -def hook2(settings, dict_file, **kwargs): - settings.word_dict = dict_file - settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), - integer_value_sequence(3)] - -@provider(init_hook=hook2) -def process2(settings, file_name): - with open(file_name) as fdata: - label_list = [] - word_slot_list = [] - for line in fdata: - if (len(line)) > 1: - label,comment = line.strip().split('\t') - label = int(''.join(label.split())) - words = comment.split() - word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] - label_list.append(label) - word_slot_list.append(word_slot) - else: - yield word_slot_list, label_list - label_list = [] - word_slot_list = [] -``` - -### 模型中的配置 - -首先,我们看一下单层序列的配置(见`sequence_layer_group.conf`)。注意:batchsize=5表示一次过5句单层序列,因此2个batch就可以完成1个pass。 - -```python -settings(batch_size=5) - -data = data_layer(name="word", size=dict_dim) - -emb = embedding_layer(input=data, size=word_dim) - -# (lstm_input + lstm) is equal to lstmemory -with mixed_layer(size=hidden_dim*4) as lstm_input: - lstm_input += full_matrix_projection(input=emb) - -lstm = 
lstmemory_group(input=lstm_input, - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) - -lstm_last = last_seq(input=lstm) - -with mixed_layer(size=label_dim, - act=SoftmaxActivation(), - bias_attr=True) as output: - output += full_matrix_projection(input=lstm_last) - -outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) - -``` -其次,我们看一下语义相同的双层序列配置(见`sequence_nest_layer_group.conf`),并对其详细分析: - -- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知,2句双层序列和5句单层序列的数据完全一样。 -- data_layer和embedding_layer不关心数据是否是序列格式,因此两个配置在这两层上的输出是一样的。 -- lstmemory: - - 单层序列过了一个mixed_layer和lstmemory_group。 - - 双层序列在同样的mixed_layer和lstmemory_group外,直接加了一层group。由于这个外层group里面没有memory,表示subseq间不存在联系,即起到的作用仅仅是把双层seq拆成单层,因此双层序列过完lstmemory的输出和单层的一样。 -- last_seq: - - 单层序列直接取了最后一个元素 - - 双层序列首先(last_seq层)取了每个subseq的最后一个元素,将其拼接成一个新的单层序列;接着(expand_layer层)将其扩展成一个新的双层序列,其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量;最后(average_layer层)取了每个subseq的平均值。 - - 分析得出:第一个last_seq后,每个subseq的最后一个元素就等于单层序列的最后一个元素,而expand_layer和average_layer后,依然保持每个subseq最后一个元素的值不变(这两层仅是为了展示它们的用法,实际中并不需要)。因此单双层序列的输出是一样旳。 - -```python -settings(batch_size=2) - -data = data_layer(name="word", size=dict_dim) - -emb_group = embedding_layer(input=data, size=word_dim) - -# (lstm_input + lstm) is equal to lstmemory -def lstm_group(lstm_group_input): - with mixed_layer(size=hidden_dim*4) as group_input: - group_input += full_matrix_projection(input=lstm_group_input) - - lstm_output = lstmemory_group(input=group_input, - name="lstm_group", - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) - return lstm_output - -lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group), - step=lstm_group, - name="lstm_nest_group") -# hasSubseq ->(seqlastins) seq -lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE) - -# seq ->(expand) hasSubseq -lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE) - -# hasSubseq ->(average) seq -lstm_average = pooling_layer(input=lstm_expand, - pooling_type=AvgPooling(), - agg_level=AggregateLevel.EACH_SEQUENCE) - -with mixed_layer(size=label_dim, - act=SoftmaxActivation(), - bias_attr=True) as output: - output += full_matrix_projection(input=lstm_average) - -outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) -``` -## 示例2:双进双出,subseq间有memory - -配置:单层RNN(`sequence_rnn.conf`),双层RNN(`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`),语义完全相同。 - -### 读取双层序列的方法 - -我们看一下单双层序列的不同数据组织形式和dataprovider(见`rnn_data_provider.py`) -```python -data = [ - [[[1, 3, 2], [4, 5, 2]], 0], - [[[0, 2], [2, 5], [0, 1, 2]], 1], -] - -@provider(input_types=[integer_value_sub_sequence(10), - integer_value(3)]) -def process_subseq(settings, file_name): - for d in data: - yield d - -@provider(input_types=[integer_value_sequence(10), - integer_value(3)]) -def process_seq(settings, file_name): - for d in data: - seq = [] -``` -- 单层序列:有两句,分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。 -- 双层序列:有两句,分别为[[1,3,2],[4,5,2]](2个子句)和[[0,2],[2,5],[0,1,2]](3个子句)。 -- 单双层序列的label都分别是0和1 - -### 模型中的配置 - -我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。 - -- 单层序列:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 - -```python -def step(y): - mem = memory(name="rnn_state", size=hidden_dim) - return fc_layer(input=[y, mem], - 
size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="rnn_state") - -out = recurrent_group(step=step, input=emb) -``` -- 双层序列,外层memory是一个元素: - - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 - - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。 - -```python -def outer_step(x): - outer_mem = memory(name="outer_rnn_state", size=hidden_dim) - def inner_step(y): - inner_mem = memory(name="inner_rnn_state", - size=hidden_dim, - boot_layer=outer_mem) - return fc_layer(input=[y, inner_mem], - size=hidden_dim, - act=TanhActivation(), - bias_attr=True, - name="inner_rnn_state") - - inner_rnn_output = recurrent_group( - step=inner_step, - input=x) - last = last_seq(input=inner_rnn_output, name="outer_rnn_state") - - return inner_rnn_output - -out = recurrent_group(step=outer_step, input=SubsequenceInput(emb)) -``` -- 双层序列,外层memory是单层序列: - - 由于外层每个时间步返回的是一个子句,这些子句的长度往往不等长。因此当外层有is_seq=True的memory时,内层是**无法直接使用**它的,即内层memory的boot_layer不能链接外层的这个memory。 - - 如果内层memory想**间接使用**这个外层memory,只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下,外层memory必须有boot_layer,否则在第0个时间步时,由于外层memory没有任何seq信息,因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。 - -## 示例3:双进双出,输入不等长 - -**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input,用targetInlink表示。参考配置:单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`),双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`) - -### 读取双层序列的方法 - -我们看一下单双层序列的数据组织形式和dataprovider(见`rnn_data_provider.py`) -```python -data2 = [ - [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], - [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], -] - -@provider(input_types=[integer_value_sub_sequence(10), - integer_value_sub_sequence(10), - integer_value(2)], - should_shuffle=False) -def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider - for d in data2: - yield d - - -@provider(input_types=[integer_value_sequence(10), - integer_value_sequence(10), - integer_value(2)], - should_shuffle=False) -def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider - for d in data2: - words1=reduce(lambda x,y: x+y, d[0]) - words2=reduce(lambda x,y: x+y, d[1]) - yield words1, words2, d[2] -``` - -data2 中有两个样本,每个样本有两个特征, 记fea1, fea2。 - -- 单层序列:两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]] -- 双层序列:两个样本分别为 - - **样本1**:[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句,fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]] - - **样本2**:[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句, fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。
- - **注意**:每个样本中,各特征的子句数目需要相等。这里说的“双进双出,输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本,时刻i=2, fea1[2]=[4, 5, 2],fea2[2]=[3, 1],3≠2。 -- 单双层序列中,两个样本的label都分别是0和1 - -### 模型中的配置 - -单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`)和双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)两个模型配置达到的效果完全一样,区别只在于输入为单层还是双层序列,现在我们来看它们内部分别是如何实现的。 - -- 单层序列: - - 过了一个简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全连接,功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里,两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列,最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 - - 注意到这里recurrent_group输入的每个样本中,fea1和fea2的长度都分别相等,这并非偶然,而是因为recurrent_group要求输入为单层序列时,所有输入的长度都必须相等。 - -```python -def step(x1, x2): - def calrnn(y): - mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim) - out = fc_layer(input = [y, mem], - size = hidden_dim, - act = TanhActivation(), - bias_attr = True, - name = 'rnn_state_' + y.name) - return out - - encoder1 = calrnn(x1) - encoder2 = calrnn(x2) - return [encoder1, encoder2] - -encoder1_rep, encoder2_rep = recurrent_group( - name="stepout", - step=step, - input=[emb1, emb2]) - -encoder1_last = last_seq(input = encoder1_rep) -encoder1_expandlast = expand_layer(input = encoder1_last, - expand_as = encoder2_rep) -context = mixed_layer(input = [identity_projection(encoder1_expandlast), - identity_projection(encoder2_rep)], - size = hidden_dim) -``` -- 双层序列: - - 双层RNN中,对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2),其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是,此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。 - - 函数`outer_step`中可以分别处理这两个特征,但我们需要用targetInlink指定recurrent_group的输出的格式(各子句长度)只能和其中一个保持一致,如这里选择了和emb2的长度一致。 - - 最后,依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 - -```python -def outer_step(x1, x2): - outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim) - outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim) - def inner_step1(y): - inner_mem = memory(name = 'inner_rnn_state_' + y.name, - size = hidden_dim, - boot_layer = outer_mem1) - out = fc_layer(input = [y, inner_mem], - size = hidden_dim, - act = TanhActivation(), - bias_attr = True, - name = 'inner_rnn_state_' + y.name) - return out - - def inner_step2(y): - inner_mem = memory(name = 'inner_rnn_state_' + y.name, - size = hidden_dim, - boot_layer = outer_mem2) - out = fc_layer(input = [y, inner_mem], - size = hidden_dim, - act = TanhActivation(), - bias_attr = True, - name = 'inner_rnn_state_' + y.name) - return out - - encoder1 = recurrent_group( - step = inner_step1, - name = 'inner1', - input = x1) - - encoder2 = recurrent_group( - step = inner_step2, - name = 'inner2', - input = x2) - - sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1') - sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2') - - encoder1_expand = expand_layer(input = sentence_last_state1, - expand_as = encoder2) - - return [encoder1_expand, encoder2] - -encoder1_rep, encoder2_rep = recurrent_group( - name="outer", - step=outer_step, - input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], - targetInlink=emb2) - -encoder1_last = last_seq(input = encoder1_rep) -encoder1_expandlast = expand_layer(input = encoder1_last, - expand_as = encoder2_rep) -context = mixed_layer(input = [identity_projection(encoder1_expandlast), - identity_projection(encoder2_rep)], - size = hidden_dim) -``` - -## 示例4:beam_search的生成 - -TBD \ No newline at end of file +# 双层RNN配置与示例 + 
+我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中,通过多组语义相同的单双层RNN配置,讲解如何使用双层RNN。 + +## 示例1:双进双出,subseq间无memory + +配置:单层RNN(`sequence_layer_group`)和双层RNN(`sequence_nest_layer_group`),语义完全相同。 + +### 读取双层序列的方法 + +首先,我们看一下单双层序列的不同数据组织形式(您也可以采用别的组织形式): + +- 单层序列的数据(`Sequence/tour_train_wdseg`)如下,一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。 + +```text +2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 +2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * +2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 +2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 +2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . +2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 +2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! +2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 +2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * +2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 +``` + +- 双层序列的数据(`Sequence/tour_train_wdseg.nest`)如下,一共有4个样本。样本间用空行分开,代表不同的双层序列,序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。 + +```text +2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 +2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * + +2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 +2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 +2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . + +2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 +2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! + +2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 +2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * +2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 +``` + +其次,我们看一下单双层序列的不同dataprovider(见`sequenceGen.py`): + +- 单层序列的dataprovider如下: + - word_slot是integer_value_sequence类型,代表单层序列。 + - label是integer_value类型,代表一个向量。 + +```python +def hook(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sequence(len(settings.word_dict)), + integer_value(3)] + +@provider(init_hook=hook) +def process(settings, file_name): + with open(file_name, 'r') as fdata: + for line in fdata: + label, comment = line.strip().split('\t') + label = int(''.join(label.split())) + words = comment.split() + word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + yield word_slot, label +``` + +- 双层序列的dataprovider如下: + - word_slot是integer_value_sub_sequence类型,代表双层序列。 + - label是integer_value_sequence类型,代表单层序列,即一个子句一个label。注意:也可以为integer_value类型,代表一个向量,即一个句子一个label。通常根据任务需求进行不同设置。 + - 关于dataprovider中input_types的详细用法,参见PyDataProvider2。 + +```python +def hook2(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), + integer_value_sequence(3)] + +@provider(init_hook=hook2) +def process2(settings, file_name): + with open(file_name) as fdata: + label_list = [] + word_slot_list = [] + for line in fdata: + if (len(line)) > 1: + label,comment = line.strip().split('\t') + label = int(''.join(label.split())) + words = comment.split() + word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + label_list.append(label) + word_slot_list.append(word_slot) + else: + 
yield word_slot_list, label_list + label_list = [] + word_slot_list = [] +``` + +### 模型中的配置 + +首先,我们看一下单层序列的配置(见`sequence_layer_group.conf`)。注意:batchsize=5表示一次过5句单层序列,因此2个batch就可以完成1个pass。 + +```python +settings(batch_size=5) + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +with mixed_layer(size=hidden_dim*4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory_group(input=lstm_input, + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + +lstm_last = last_seq(input=lstm) + +with mixed_layer(size=label_dim, + act=SoftmaxActivation(), + bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) + +``` +其次,我们看一下语义相同的双层序列配置(见`sequence_nest_layer_group.conf`),并对其详细分析: + +- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知,2句双层序列和5句单层序列的数据完全一样。 +- data_layer和embedding_layer不关心数据是否是序列格式,因此两个配置在这两层上的输出是一样的。 +- lstmemory: + - 单层序列过了一个mixed_layer和lstmemory_group。 + - 双层序列在同样的mixed_layer和lstmemory_group外,直接加了一层group。由于这个外层group里面没有memory,表示subseq间不存在联系,即起到的作用仅仅是把双层seq拆成单层,因此双层序列过完lstmemory的输出和单层的一样。 +- last_seq: + - 单层序列直接取了最后一个元素 + - 双层序列首先(last_seq层)取了每个subseq的最后一个元素,将其拼接成一个新的单层序列;接着(expand_layer层)将其扩展成一个新的双层序列,其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量;最后(average_layer层)取了每个subseq的平均值。 + - 分析得出:第一个last_seq后,每个subseq的最后一个元素就等于单层序列的最后一个元素,而expand_layer和average_layer后,依然保持每个subseq最后一个元素的值不变(这两层仅是为了展示它们的用法,实际中并不需要)。因此单双层序列的输出是一样旳。 + +```python +settings(batch_size=2) + +data = data_layer(name="word", size=dict_dim) + +emb_group = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +def lstm_group(lstm_group_input): + with mixed_layer(size=hidden_dim*4) as group_input: + group_input += full_matrix_projection(input=lstm_group_input) + + lstm_output = lstmemory_group(input=group_input, + name="lstm_group", + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + return lstm_output + +lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group), + step=lstm_group, + name="lstm_nest_group") +# hasSubseq ->(seqlastins) seq +lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE) + +# seq ->(expand) hasSubseq +lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE) + +# hasSubseq ->(average) seq +lstm_average = pooling_layer(input=lstm_expand, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.EACH_SEQUENCE) + +with mixed_layer(size=label_dim, + act=SoftmaxActivation(), + bias_attr=True) as output: + output += full_matrix_projection(input=lstm_average) + +outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) +``` +## 示例2:双进双出,subseq间有memory + +配置:单层RNN(`sequence_rnn.conf`),双层RNN(`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`),语义完全相同。 + +### 读取双层序列的方法 + +我们看一下单双层序列的不同数据组织形式和dataprovider(见`rnn_data_provider.py`) +```python +data = [ + [[[1, 3, 2], [4, 5, 2]], 0], + [[[0, 2], [2, 5], [0, 1, 2]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value(3)]) +def process_subseq(settings, file_name): + for d in data: + yield d + 
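+# 单层序列的dataprovider:与上面的process_subseq对应,将同一个样本的各个子句
+# 拼接成一个单层序列后再yield(即下文的[1,3,2,4,5,2]和[0,2,2,5,0,1,2])。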
+@provider(input_types=[integer_value_sequence(10), + integer_value(3)]) +def process_seq(settings, file_name): + for d in data: + seq = [] +``` +- 单层序列:有两句,分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。 +- 双层序列:有两句,分别为[[1,3,2],[4,5,2]](2个子句)和[[0,2],[2,5],[0,1,2]](3个子句)。 +- 单双层序列的label都分别是0和1 + +### 模型中的配置 + +我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。 + +- 单层序列:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 + +```python +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + return fc_layer(input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + +out = recurrent_group(step=step, input=emb) +``` +- 双层序列,外层memory是一个元素: + - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 + - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。 + +```python +def outer_step(x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y): + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + return fc_layer(input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + + inner_rnn_output = recurrent_group( + step=inner_step, + input=x) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + return inner_rnn_output + +out = recurrent_group(step=outer_step, input=SubsequenceInput(emb)) +``` +- 双层序列,外层memory是单层序列: + - 由于外层每个时间步返回的是一个子句,这些子句的长度往往不等长。因此当外层有is_seq=True的memory时,内层是**无法直接使用**它的,即内层memory的boot_layer不能链接外层的这个memory。 + - 如果内层memory想**间接使用**这个外层memory,只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下,外层memory必须有boot_layer,否则在第0个时间步时,由于外层memory没有任何seq信息,因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。 + +## 示例3:双进双出,输入不等长 + +**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input,用targetInlink表示。参考配置:单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`),双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`) + +### 读取双层序列的方法 + +我们看一下单双层序列的数据组织形式和dataprovider(见`rnn_data_provider.py`) +```python +data2 = [ + [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], + [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value_sub_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider + for d in data2: + yield d + + +@provider(input_types=[integer_value_sequence(10), + integer_value_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider + for d in data2: + words1=reduce(lambda x,y: x+y, d[0]) + words2=reduce(lambda x,y: x+y, d[1]) + yield words1, words2, d[2] +``` + +data2 中有两个样本,每个样本有两个特征, 记fea1, fea2。 + +- 单层序列:两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]] +- 双层序列:两个样本分别为 + - **样本1**:[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句,fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]] + - **样本2**:[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句, fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。
+ - **注意**:每个样本中,各特征的子句数目需要相等。这里说的“双进双出,输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本,时刻i=2, fea1[2]=[4, 5, 2],fea2[2]=[3, 1],3≠2。 +- 单双层序列中,两个样本的label都分别是0和1 + +### 模型中的配置 + +单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`)和双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)两个模型配置达到的效果完全一样,区别只在于输入为单层还是双层序列,现在我们来看它们内部分别是如何实现的。 + +- 单层序列: + - 过了一个简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全连接,功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里,两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列,最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 + - 注意到这里recurrent_group输入的每个样本中,fea1和fea2的长度都分别相等,这并非偶然,而是因为recurrent_group要求输入为单层序列时,所有输入的长度都必须相等。 + +```python +def step(x1, x2): + def calrnn(y): + mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim) + out = fc_layer(input = [y, mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'rnn_state_' + y.name) + return out + + encoder1 = calrnn(x1) + encoder2 = calrnn(x2) + return [encoder1, encoder2] + +encoder1_rep, encoder2_rep = recurrent_group( + name="stepout", + step=step, + input=[emb1, emb2]) + +encoder1_last = last_seq(input = encoder1_rep) +encoder1_expandlast = expand_layer(input = encoder1_last, + expand_as = encoder2_rep) +context = mixed_layer(input = [identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep)], + size = hidden_dim) +``` +- 双层序列: + - 双层RNN中,对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2),其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是,此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。 + - 函数`outer_step`中可以分别处理这两个特征,但我们需要用targetInlink指定recurrent_group的输出的格式(各子句长度)只能和其中一个保持一致,如这里选择了和emb2的长度一致。 + - 最后,依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 + +```python +def outer_step(x1, x2): + outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim) + outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim) + def inner_step1(y): + inner_mem = memory(name = 'inner_rnn_state_' + y.name, + size = hidden_dim, + boot_layer = outer_mem1) + out = fc_layer(input = [y, inner_mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'inner_rnn_state_' + y.name) + return out + + def inner_step2(y): + inner_mem = memory(name = 'inner_rnn_state_' + y.name, + size = hidden_dim, + boot_layer = outer_mem2) + out = fc_layer(input = [y, inner_mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'inner_rnn_state_' + y.name) + return out + + encoder1 = recurrent_group( + step = inner_step1, + name = 'inner1', + input = x1) + + encoder2 = recurrent_group( + step = inner_step2, + name = 'inner2', + input = x2) + + sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1') + sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2') + + encoder1_expand = expand_layer(input = sentence_last_state1, + expand_as = encoder2) + + return [encoder1_expand, encoder2] + +encoder1_rep, encoder2_rep = recurrent_group( + name="outer", + step=outer_step, + input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], + targetInlink=emb2) + +encoder1_last = last_seq(input = encoder1_rep) +encoder1_expandlast = expand_layer(input = encoder1_last, + expand_as = encoder2_rep) +context = mixed_layer(input = [identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep)], + size = hidden_dim) +``` + +## 示例4:beam_search的生成 + +TBD diff --git a/doc_cn/algorithm/rnn/rnn-tutorial.md b/doc_cn/algorithm/rnn/rnn-tutorial.md 
index 7a553054c80392946ba5b16cc31bcaea18cfc977..9e488b0d51956e86f9fb76f450fdb438f596e239 100644 --- a/doc_cn/algorithm/rnn/rnn-tutorial.md +++ b/doc_cn/algorithm/rnn/rnn-tutorial.md @@ -93,4 +93,4 @@ memory只能在`recurrent_group`中定义和使用。memory不能独立存在, 使用`beam_search`需要遵循以下约定: - 单层RNN:从一个word生成下一个word。 -- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。 \ No newline at end of file +- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。 diff --git a/doc_cn/build_and_install/cmake/cblas_settings.csv b/doc_cn/build_and_install/cmake/cblas_settings.csv index d804c0a662cb652dbefb0d09fb18538308c20aec..a6356baf16a0d3d2499e39d2055d8ee878dcaef2 100644 --- a/doc_cn/build_and_install/cmake/cblas_settings.csv +++ b/doc_cn/build_and_install/cmake/cblas_settings.csv @@ -1,4 +1,5 @@ -MKL_ROOT,mkl的路径,在${MKL_ROOT}/include下需要包含mkl.h,在${MKL_ROOT}/lib目录下需要包含 mkl_core,mkl_sequential和mkl_intel_lp64三个库 -ATLAS_ROOT,ATLAS库的路径,在${ATLAS_ROOT}/include下需要包含cblas.h,而在${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库 -OPENBLAS_ROOT,在${OPENBLAS_ROOT}/include下需要包含cblas.h,而在${OPENBLAS_ROOT}/lib下需要包含openblas库 -REFERENCE_CBLAS_ROOT,在${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,在${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库 \ No newline at end of file +编译选项,描述,注意 +MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。 +ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。 +OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。 +REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。 \ No newline at end of file diff --git a/doc_cn/build_and_install/cmake/compile_options.csv b/doc_cn/build_and_install/cmake/compile_options.csv index 0b8015aaee4d7b9068cb4a8de5d9967569e37f0c..12b45eebb2822d77447fa1bc754360605971dcab 100644 --- a/doc_cn/build_and_install/cmake/compile_options.csv +++ b/doc_cn/build_and_install/cmake/compile_options.csv @@ -1,15 +1,14 @@ -选项,说明,默认值 -WITH_GPU,是否编译GPU支持。,是否寻找到cuda工具链 -WITH_DOUBLE,是否使用双精度浮点数。,否 -WITH_DSO,是否使用运行时动态加载cuda动态库,而非静态加载cuda动态库。,是 -WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制,是 -WITH_PYTHON,是否内嵌python解释器。可以方便嵌入式工作。,是 -WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 -WITH_RDMA,是否开启RDMA支持,否 -WITH_GLOG,是否使用GLOG,如果不使用则会使用一个简化版的日志实现。可以方便嵌入式工作。,取决于是否寻找到GLOG -WITH_GFLAGS,是否使用GFLAGS,如果不使用则会使用一个简化版的命令行参数解析。可以方便嵌入式工作。,取决于是否寻找到GFLAGS -WITH_TIMER,是否开启计时功能开启计时功能会导致运行略慢,打印的日志变多。但是方便调试和benchmark,否 -WITH_TESTING,是否开启单元测试,取决于是否寻找到gtest -WITH_DOC,是否编译英文文档,否 -WITH_DOC_CN,是否编译中文文档,否 -WITH_SWIG_PY,是否编译python的swig接口,python的swig接口可以方便进行预测和定制化训练,取决于是否找到swig +选项,说明,默认值 +WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链 +WITH_DOUBLE,是否使用双精度浮点数。,否 +WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是 +WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是 +WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是 +WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 +WITH_RDMA,是否开启RDMA,否 +WITH_GLOG,是否开启GLOG。如果不开启,则会使用一个简化版的日志,同时方便今后的嵌入式移植工作。,取决于是否寻找到GLOG +WITH_GFLAGS,是否使用GFLAGS。如果不开启,则会使用一个简化版的命令行参数解析器,同时方便今后的嵌入式移植工作。,取决于是否寻找到GFLAGS +WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否 +WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST +WITH_DOC,是否编译中英文文档,否 +WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG \ No newline at end of file diff --git a/doc_cn/build_and_install/cmake/compile_options.rst b/doc_cn/build_and_install/cmake/compile_options.rst index bb5b18a073803662774cb6b7bcbdbafe3ad51112..f345ead2bf851bdad7be2fb8185d16fd2a318a66 100644 --- 
a/doc_cn/build_and_install/cmake/compile_options.rst +++ b/doc_cn/build_and_install/cmake/compile_options.rst @@ -1,62 +1,43 @@ -设置PaddlePaddle的编译选项 -========================== - -PaddlePaddle的编译选项可以在调用cmake的时候设置。cmake是一个跨平台的编译脚本,调用 -cmake可以将cmake项目文件,生成各个平台的makefile。详细的cmake使用方法可以参考 -`cmake的官方文档 `_ 。 - -PaddlePaddle的编译选项是可以控制PaddlePaddle生成CPU/GPU版本二进制,链接何种blas等等。所有的 -编译选项列表如下 - -PaddlePaddle的编译选项 ----------------------- - -bool型的编译选项 -++++++++++++++++ -设置下列编译选项时,可以在cmake的命令行设置。使用 -D命令即可。例如 -:code:`cmake -D WITH_GPU=OFF` - -.. csv-table:: PaddlePaddle的bool型编译选项 - :widths: 1, 7, 2 - :file: compile_options.csv - -blas相关的编译选项 -++++++++++++++++++ - -PaddlePaddle可以使用 `MKL `_ , -`Atlas `_ , -`OpenBlas `_ 和 -`refference Blas `_ ,任意一种cblas实现。 -通过编译时指定路径来实现引用各种blas。 - -cmake编译时会首先在系统路径(/usr/lib\:/usr/local/lib)中寻找这些blas的实现。同时 -也会读取相关路径变量来进行搜索。路径变量为\: - - -.. csv-table:: PaddlePaddle的cblas编译选项 - :widths: 1, 9 - :header: "编译选项", "描述" - :file: cblas_settings.csv - -这些变量均可以使用 -D命令指定。例如 :code:`cmake -D MKL_ROOT=/opt/mkl/`。这些变 -量也可以通过调用cmake命令前通过环境变量指定。例如 - -.. code-block:: bash - - export MKL_ROOT=/opt/mkl - cmake - -需要注意的是,这些变量只在第一次cmake的时候有效。如果在第一次cmake之后想要重新设 -置这些变量,推荐清理( :code:`rm -rf` )掉编译目录后,再指定。 - -cuda/cudnn相关的编译选项 -++++++++++++++++++++++++ - -PaddlePaddle可以使用 cudnn v2之后的任何一个cudnn版本来编译运行。但需要注意的是编译和 -运行使用的cudnn尽量是同一个版本。推荐使用最新版本的cudnn v5.1。 - -在cmake配置时可以使用 :code:`CUDNN_ROOT` 来配置CUDNN的安装路径。使用的命令也是 --D,例如 :code:`cmake -D CUDNN_ROOT=/opt/cudnnv5` 。 - -需要注意的是,这些变量只在第一次cmake的时候有效。如果在第一次cmake之后想要重新设 -置这些变量,推荐清理( :code:`rm -rf` )掉编译目录后,再指定。 +PaddlePaddle的编译选项 +====================== + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 `_ 。 + +Bool型的编译选项 +---------------- +用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: Bool型的编译选项 + :widths: 1, 7, 2 + :file: compile_options.csv + +BLAS/CUDA/Cudnn的编译选项 +-------------------------- +BLAS ++++++ + +PaddlePaddle支持以下任意一种BLAS库:`MKL `_ ,`ATLAS `_ ,`OpenBLAS `_ 和 `REFERENCE BLAS `_ 。 + +.. csv-table:: BLAS路径相关的编译选项 + :widths: 1, 2, 7 + :file: cblas_settings.csv + +CUDA/Cudnn ++++++++++++ + +PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但请尽量保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。 + +编译选项的设置 +++++++++++++++ + +PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. 
-DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5 + +注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。 \ No newline at end of file diff --git a/doc_cn/build_and_install/install/paddle_version.txt b/doc_cn/build_and_install/install/paddle_version.txt index 7b2bfd2b1b3a9850665d118e424fd0cf6c24a062..a80873303fd0d05d963482629000d76260185ef6 100644 --- a/doc_cn/build_and_install/install/paddle_version.txt +++ b/doc_cn/build_and_install/install/paddle_version.txt @@ -8,4 +8,4 @@ PaddlePaddle 0.8.0b1, compiled with with_gflags: ON with_metric_learning: with_timer: OFF - with_predict_sdk: \ No newline at end of file + with_predict_sdk: diff --git a/doc_cn/concepts/nn.rst b/doc_cn/concepts/nn.rst new file mode 100644 index 0000000000000000000000000000000000000000..f4d2cf490d14761f4b9f6a308180c5e8015cbecb --- /dev/null +++ b/doc_cn/concepts/nn.rst @@ -0,0 +1,3 @@ +TBD + +目前正在书写中。敬请期待。 \ No newline at end of file diff --git a/doc_cn/concepts/program_concepts.rst b/doc_cn/concepts/program_concepts.rst new file mode 100644 index 0000000000000000000000000000000000000000..af5bbdac260afce0a032461ab913d05bc2f55929 --- /dev/null +++ b/doc_cn/concepts/program_concepts.rst @@ -0,0 +1,4 @@ +TBD +### + +目前正在书写中。敬请期待。 \ No newline at end of file diff --git a/doc_cn/concepts/pserver_topology.dot b/doc_cn/concepts/pserver_topology.dot new file mode 100644 index 0000000000000000000000000000000000000000..9ff658b8495030f322d4f553f3bf72ddf8d3a578 --- /dev/null +++ b/doc_cn/concepts/pserver_topology.dot @@ -0,0 +1,68 @@ +graph pp_topology { + rankdir=BT; + subgraph cluster_node0 { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "机器0" + + pserver0 [label="Parameter \n Server 0"] + trainer0 [label="Trainer 0"] + } + subgraph cluster_node1 { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "机器1" + + pserver1 [label="Parameter \n Server 1"] + trainer1 [label="Trainer 1"] + } + + subgraph cluster_node2 { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "机器2" + + pserver2 [label="Parameter \n Server 2"] + trainer2 [label="Trainer 2"] + } + + subgraph cluster_node3 { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "机器3" + + pserver3 [label="Parameter \n Server 3"] + trainer3 [label="Trainer 3"] + } + + data [label="数据", shape=hexagon] + + trainer0 -- pserver0 + trainer0 -- pserver1 + trainer0 -- pserver2 + trainer0 -- pserver3 + + trainer1 -- pserver0 + trainer1 -- pserver1 + trainer1 -- pserver2 + trainer1 -- pserver3 + + trainer2 -- pserver0 + trainer2 -- pserver1 + trainer2 -- pserver2 + trainer2 -- pserver3 + + trainer3 -- pserver0 + trainer3 -- pserver1 + trainer3 -- pserver2 + trainer3 -- pserver3 + + data -- trainer0 + data -- trainer1 + data -- trainer2 + data -- trainer3 +} diff --git a/doc_cn/concepts/trainer_config.py b/doc_cn/concepts/trainer_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3eccbd7bc11f4865130286de718d1be74e4d1722 --- /dev/null +++ b/doc_cn/concepts/trainer_config.py @@ -0,0 +1,29 @@ +from paddle.trainer_config_helpers import * + +define_py_data_sources2( + train_list='train.list', + test_list='test.list', + module='provider', + obj='process') +settings( + batch_size=128, + learning_rate=1e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(0.5)) + +img = data_layer(name='pixel', size=28 * 28) + +hidden1 = simple_img_conv_pool( + input=img, filter_size=3, 
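+    # 3x3卷积产生32个特征图,随后接3x3池化;num_channel=1表示输入为单通道的28x28图像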
num_filters=32, pool_size=3, num_channel=1) + +hidden2 = fc_layer( + input=hidden1, + size=200, + act=TanhActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) +predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation()) + +outputs( + classification_cost( + input=predict, label=data_layer( + name='label', size=10))) diff --git a/doc_cn/concepts/use_concepts.rst b/doc_cn/concepts/use_concepts.rst new file mode 100644 index 0000000000000000000000000000000000000000..67e98edabc0c2a4ecdf8d7993f8dd66b9365a05d --- /dev/null +++ b/doc_cn/concepts/use_concepts.rst @@ -0,0 +1,191 @@ +######################### +PaddlePaddle 基本使用概念 +######################### + +PaddlePaddle是一个神经网络学习框架。其单机进程为 :code:`paddle train`。 单机的所有设备使用,均在单机进程内调度完成。 而多机辅助进程 :code:`paddle pserver` 负责联合多个单机进程进行通信,进而充分利用集群的计算资源。 PaddlePaddle同时以 :code:`swig api` 的形式,提供训练结果模型预测的方法和自定义训练流程。 + +下面我们会分别介绍主要进程 :code:`paddle train` 中的一些概念。这些概念会对如何使用PaddlePaddle有一定的帮助。 了解这些概念的前提是,读者已经了解 `基本的神经网络/机器学习原理和概念 `_ 。同时,如果想要了解PaddlePaddle实现中的一些概念,请参考 `PaddlePaddle 编程中的基本概念 `_ 。 + +.. contents:: + +PaddlePaddle 的进程模型 +======================= + +PaddlePaddle进程内嵌了一个 :code:`python` 解释器。 这个 :code:`python` 解释器负责解析用户定义的神经网络配置,和解析用户数据,并将用户数据传入给 PaddlePaddle。 + +.. graphviz:: + + digraph pp_process { + rankdir=LR; + config_file [label="用户神经网络配置"]; + subgraph cluster_pp { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "PaddlePaddle C++"; + py [label="Python解释器"]; + } + data_provider [label="用户数据解析"]; + config_file -> py; + py -> data_provider [dir="back"]; + } + +所以,PaddlePaddle单机训练进程,:code:`paddle train` , 对于用户的主要接口语言为 python。 主要需要用户配置的两个文件为 :code:`DataProvider` 和训练文件 :code:`TrainerConfig` 。 + + +DataProvider +============ + +DataProvider是 :code:`paddle train` 的数据提供器。 它负责将用户的原始数据转换成 PaddlePaddle 可以识别的数据类型。每当 PaddlePaddle 需要新的数据训练时,都会调用 DataProvider 返回数据。 当所有数据读取完一轮后,DataProvider 便返回空数据通知 PaddlePaddle。PaddlePaddle负责在下一轮训练开始前,将DataProvider重置。 + +需要注意的是,DataProvider在PaddlePaddle中是被训练逻辑调用的关系, 而不是新的数据驱动训练。并且所有的 :code:`shuffle` , 和一些随机化的噪声添加,都应该在 DataProvider 阶段完成。 + +为了方便用户使用自己的数据格式, PaddlePaddle 提供了 `PyDataProvider`_ 来处理数据。 并且在这个Provider中,PaddlePaddle的 C++ 部分接管了如何shuffle,处理 batch,GPU/CPU通信,双缓冲,异步读取等问题。 用户可以参考 `PyDataProvider`_ 的相关文档,继续深入了解 DataProvider 的使用。 + + +训练文件 +======== + +训练文件是PaddlePaddle中配置神经网络结构、学习优化算法、数据传入方式的地方。 训练文件是一个python文件,使用命令行参数 :code:`--config` 传给 paddle 的主程序。 例如\: + +.. code-block:: bash + + paddle train --config=trainer_config.py + +一个典型简单的训练文件可能为 + +.. 
literalinclude:: trainer_config.py
+ :linenos:
+
+下面我们详细地介绍一下训练文件中各个模块的概念。
+
+
+trainer_config_helpers
+----------------------
+
+PaddlePaddle的配置文件与PaddlePaddle C++端通信的最基础协议是 :code:`protobuf` 。为了避免用户直接书写比较难写的 protobuf string,我们提供了一组helpers来生成这个protobuf包。所以在文件的开始,需要import这些helpers函数。
+
+需要注意的是,这个 :code:`paddle.trainer_config_helpers` 包是标准的python包,这意味着用户可以选择自己喜欢的 :code:`ide` 或者编辑器来编写Paddle的配置文件,这个python包注释文档比较完善,并且考虑了IDE的代码提示与类型注释。
+
+data_sources
+------------
+
+data_sources配置神经网络的数据源。这里使用的函数是 :code:`define_py_data_sources2` ,这个函数定义了使用 `PyDataProvider`_ 作为数据源。而后缀 :code:`2` 是Paddle历史遗留问题,因为Paddle之前使用的 PyDataProvider 性能较差,所以完全重构了一个新的 `PyDataProvider`_ 。
+
+data_sources里面的 train_list 和 test_list 指定的是训练文件列表和测试文件列表。如果传入一个字符串,则是指一个训练列表文件,这个训练列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list,则会默认生成一个 list 文件,再传入给 train.list 或者 test.list 。
+
+而 :code:`module` 和 :code:`obj` 指定了 DataProvider 的模块名和函数名。
+
+更具体的使用,请参考 `PyDataProvider`_ 。
+
+settings
+--------
+
+`settings`_ 是神经网络训练算法相关的设置项,包括学习率、batch_size、优化算法、正则方法等等。具体的使用方法请参考 `settings`_ 文档。
+
+网络配置
+--------
+
+上述训练文件中余下的部分均是神经网络配置。第一行定义了一个名字叫 "pixel" 的 :code:`data_layer` 。每一个layer返回的都是一个 :code:`LayerOutput` 对象,这里第一层的输出对象是 :code:`img` 。然后这个对象传输给了另一个 layer 函数,
+:code:`simple_img_conv_pool` 。:code:`simple_img_conv_pool` 是一个组合层,
+包括了图像的卷积 (convolution) 和池化(pooling),
+并继续接了一个全连接层( :code:`fc_layer` ),然后再接了一个Softmax的全连接层。
+
+最终,网络配置输出了 :code:`classification_cost` 。标记网络输出的函数为
+:code:`outputs` 。网络的输出是神经网络的优化目标,神经网络训练的时候,实际上就是
+要最小化这个输出。
+
+在神经网络进行预测的时候,实际上网络的输出也是通过 :code:`outputs` 标记。
+
+
+Layer、Projection、Operator
+===========================
+
+PaddlePaddle的网络基本上是基于Layer来配置的。所谓的Layer即是神经网络的某一层,
+而神经网络的某一层,一般是封装了许多复杂操作的操作集合。比如最简单的
+:code:`fc_layer` ,就包括了矩阵乘法、多输入的求和与activation。
+
+.. code-block:: python
+
+    data = data_layer(name='data', size=200)
+    out = fc_layer(input=data, size=200, act=TanhActivation())
+
+而对于更灵活的配置需求,这样基于Layer的配置可能不够灵活。于是 PaddlePaddle 提供
+了基于 Projection 或者 Operator 的配置。使用Projection和Operator需要与
+:code:`mixed_layer` 配合使用。 :code:`mixed_layer` 是将layer中的元素累加求和,
+并且做一个 :code:`activation` ,而这个layer具体如何计算,是交由内部的Projection
+和 Operator 定义。Projection是指含有可学习参数的操作,而Operator不含有可学习的
+参数,输入全是其他Layer的输出。
+
+
+例如,和 :code:`fc_layer` 同样功能的 :code:`mixed_layer` 。
+
+.. code-block:: python
+
+    data = data_layer(name='data', size=200)
+    with mixed_layer(size=200) as out:
+        out += full_matrix_projection(input=data)
+
+PaddlePaddle可以使用 mixed layer 配置出非常复杂的网络,甚至可以直接配置一个完整的LSTM。
+用户可以参考 `mixed_layer`_ 的相关文档进行配置。
+
+如何利用单机的所有GPU或所有CPU核心
+==================================
+
+PaddlePaddle的单机进程 :code:`paddle train` 可以充分利用一台计算机上所有的GPU资
+源或者CPU。
+
+如果要使用机器上多块GPU,使用如下命令即可\:
+
+.. code-block:: bash
+
+    paddle train --use_gpu=true --trainer_count=4  # use 4 gpu card, 0, 1, 2, 3
+
+如果要使用机器上多块CPU,使用如下命令即可\:
+
+.. code-block:: bash
+
+    paddle train --trainer_count=4  # use 4 cpu cores.
+
+对于其他GPU选择的情况,例如选择第0、2号GPU显卡,则可以使用 :code:`CUDA_VISIBLE_DEVICES` 环境变量来选择部分的显卡。具体可以参考链接 `masking-gpu`_ 。可以使用的命令为
+
+.. code-block:: bash
+
+    env CUDA_VISIBLE_DEVICES=0,2 paddle train --use_gpu=true --trainer_count=2
+
+如何利用多台机器的计算资源训练神经网络
+======================================
+
+PaddlePaddle多机使用的经典方法是通过 :code:`Parameter Server` 来对多机的 :code:`paddle train` 进行同步。而多机训练神经网络,首先要将数据切分到不同的机器上。切分数据文件的工具,PaddlePaddle的开源实现中并没有提供;但是切分数据并不是一件非常复杂的事情,也不是神经网络实现的重点。
+
+多机训练过程中,经典的拓扑结构如下\:
+
+.. graphviz:: pserver_topology.dot
+
+图中每个灰色方块是一台机器,在每个机器中,先去启动一个 :code:`paddle pserver` 进程,并确定整体的端口号。可能的参数是\:
+
+.. 
code-block:: bash + + paddle pserver --port=5000 --num_gradient_servers=4 --nics='eth0' + +这里说明系统的 :code:`paddle pserver` 的起始端口是 :code:`5000` ,并且有四个训练进程(:code:`gradient_servers`,Paddle同时将 :code:`paddle train` 进程称作 :code:`GradientServer` 。因为其为负责提供Gradient的进程)。 而对于训练进程的话,则需要在 :code:`paddle pserver` 启动之后,再在各个节点上运行如下命令\: + +.. code-block:: bash + + paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=... + +对于简单的多机协同使用上述方式即可。同时,pserver/train 通常在高级情况下,还有两个参数需要设置,他们是 + +* --ports_num\: 一个 pserver进程共绑定多少个端口用来做稠密更新。默认是1 +* --ports_num_for_sparse\: 一个pserver进程共绑定多少端口用来做稀疏更新,默认是0 + +使用手工指定端口数量,是因为Paddle的网络通信中,使用了 :code:`int32` 作为消息长度,比较容易在大模型下溢出。所以,在 :code:`paddle pserver` 进程中可以启动多个子线程去接受 trainer 的数据,这样单个子线程的长度就不会溢出了。但是这个值不可以调的过大,因为增加这个值,还是对性能,尤其是内存占用有一定的开销的,另外稀疏更新的端口如果太大的话,很容易某一个参数服务器没有分配到任何参数。 + +详细的说明可以参考,使用 `集群训练Paddle`_ 。 + + +.. _PyDataProvider: ../ui/data_provider/pydataprovider2.html +.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings +.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer +.. _masking-gpu: http://www.acceleware.com/blog/cudavisibledevices-masking-gpus +.. _集群训练Paddle: ../cluster/index.html diff --git a/doc_cn/demo/index.rst b/doc_cn/demo/index.rst index 71f54bc18fbb5b1b8cdd0e6cbee2ee028c0af218..e15e839f93d4ac0d455e49fd8b1cde8bf60a29ac 100644 --- a/doc_cn/demo/index.rst +++ b/doc_cn/demo/index.rst @@ -9,7 +9,7 @@ 自然语言处理 '''''''''''' -* `情感分析 <../../doc/demo/sentiment_analysis/index.html>`_ +* `情感分析 `_ * `文本生成 <../../doc/demo/text_generation/index.html>`_ * `词性标注 <../../doc/demo/semantic_role_labeling/index.html>`_ diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md index aa6b66ca8c02411016420bf9d99c5e1b4e3cefdd..4d9b24ba851a7aaaeb0d79bfbeb0703b8878b77f 100644 --- a/doc_cn/demo/quick_start/index.md +++ b/doc_cn/demo/quick_start/index.md @@ -134,9 +134,8 @@ define_py_data_sources2(train_list='data/train.list', * obj="process": 指定生成数据的函数 * args={"dictionary": word_dict}: 额外的参数,这里指定词典 -更详细用例请参考文档Python Use Case, -数据格式和详细文档请参考 -PyDataProviderWrapper。 +更详细数据格式和用例请参考 +PyDataProvider2。 ## 网络结构(Network Architecture) 本节我们将专注于网络结构的介绍。 diff --git a/doc_cn/demo/sentiment_analysis/index.rst b/doc_cn/demo/sentiment_analysis/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..82400b2459ebcaf89ff5e884edfe721b9ec01d7f --- /dev/null +++ b/doc_cn/demo/sentiment_analysis/index.rst @@ -0,0 +1,8 @@ +情感分析教程 +=========================== + +.. 
toctree::
+ :maxdepth: 3
+ :glob:
+
+ Training Locally 
\ No newline at end of file
diff --git a/doc_cn/demo/sentiment_analysis/sentiment_analysis.md b/doc_cn/demo/sentiment_analysis/sentiment_analysis.md
new file mode 100644
index 0000000000000000000000000000000000000000..b70f2d59675615c26b29932cdf99d728bb206148
--- /dev/null
+++ b/doc_cn/demo/sentiment_analysis/sentiment_analysis.md
@@ -0,0 +1,324 @@
+# 情感分析教程
+
+情感分析有许多应用场景。一个基本的应用场景是区分给定文本的褒贬两极性,给定的文本可以是一个文档、句子,或者是一个小的文本片段。一个简单的例子如:把用户在购物网站、旅游网站、团购网站(亚马逊、天猫、淘宝等)上发表的评论分成正面评论和负面评论两类。
+
+情感分析也常用于基于大量评论和个人博客来监控社会媒体。例如,研究人员分析了几个关于消费者信心和政治观点的调查,结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。
+
+另一方面,抓取产品的用户评论并分析他们的情感,有助于理解用户对不同公司、不同产品,甚至不同竞争对手产品的偏好。
+
+本教程将指导您完成长短期记忆(LSTM)网络的训练过程,以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)(有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf))的句子的情感。此数据集包含电影评论及其相关联的类别标签,即正面和负面。
+
+## 数据准备
+
+### IMDB 数据介绍
+
+训练模型之前,我们需要预处理数据并构建一个字典。首先,你可以使用下面的脚本下载 IMDB 数据集和[Moses](http://www.statmt.org/moses/)工具,后者是一个基于统计的机器翻译系统。我们提供了一个数据预处理脚本,它不仅能够处理IMDB数据,还能处理其他用户自定义的数据。为了使用提前编写的脚本,需要将标记的训练和测试样本移动到另一个路径,这已经在 `get_imdb.sh` 中完成。
+
+```
+cd demo/sentiment/data
+./get_imdb.sh
+```
+如果数据获取成功,你将在目录```./demo/sentiment/data```中看到下面的文件:
+
+```
+aclImdb get_imdb.sh imdb mosesdecoder-master
+```
+
+* aclImdb: 从外部网站上下载的原始数据集。
+* imdb: 仅包含训练和测试数据集。
+* mosesdecoder-master: Moses 工具。
+
+IMDB数据集包含25,000个已标注过的极性鲜明的电影评论用于训练,25,000个用于测试。负面评论的得分小于等于4,正面评论的得分大于等于7,满分10分。运行完脚本 `./get_imdb.sh` 后,我们可以看到在目录 `aclImdb` 中的数据集的结构如下:
+
+```
+imdbEr.txt imdb.vocab README test train
+```
+* train: 训练数据集。
+* test : 测试数据集。
+* imdb.vocab: 字典文件。
+* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。
+* README: 数据说明文档。
+
+测试集和训练集目录包含下面的文件:
+
+```
+labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt
+```
+
+* pos: 正面评价样本,包含12,500个txt文件,每个文件是一个电影评论。
+* neg: 负面评价样本,包含12,500个txt文件,每个文件是一个电影评论。
+* unsup: 未标记的评价样本,包含50,000个txt文件。
+* urls_xx.txt: 每个评论的网址。
+* xxBow.feat: 用于统计词频的Bow模型特征。
+
+### IMDB 数据准备
+
+在这个例子中,我们只使用已经标注过的训练集和测试集,且默认在训练集上构建字典,而不使用IMDB数据集中的imdb.vocab作为字典。训练集已经做了随机打乱排序而测试集没有。Moses 工具中的脚本 `tokenizer.perl` 用于切分单词和标点符号。执行下面的命令就可以预处理数据。
+
+```
+cd demo/sentiment/
+./preprocess.sh
+```
+preprocess.sh:
+
+```
+data_dir="./data/imdb"
+python preprocess.py -i $data_dir
+```
+
+* data_dir: 输入数据所在目录。
+* preprocess.py: 预处理脚本。
+
+运行成功后目录 `demo/sentiment/data/pre-imdb` 结构如下:
+
+```
+dict.txt labels.list test.list test_part_000 train.list train_part_000
+```
+* test\_part\_000 和 train\_part\_000: 所有标记的测试集和训练集,训练集已经随机打乱。
+* train.list 和 test.list: 训练集和测试集文件列表。
+* dict.txt: 利用训练集生成的字典。
+* labels.list: neg 0, pos 1,含义:标签0表示负面的评论,标签1表示正面的评论。
+
+### 用户自定义数据预处理
+
+如果你要执行其它的用情感分析来分类文本的任务,可以按如下的结构来准备数据。我们提供了脚本来构建字典和预处理数据,所以你只需按下面的结构来组织数据就行了。
+
+```
+dataset
+|----train
+|    |----class1
+|    |    |----text_files
+|    |----class2
+|    |    |----text_files
+|    |    ...
+|----test
+|    |----class1
+|    |    |----text_files
+|    |----class2
+|    |    |----text_files
+|    |    ...
+```
+* dataset: 一级目录。
+* train, test: 二级目录。
+* class1,class2,...: 三级目录。
+* text_files: 文本格式的实例文件。
+
+所有同目录下的文本实例文件都是同级别的。每个文本文件包含一个或者多个实例,每一行表示一个实例。为了充分地随机打乱训练集,在预处理含有多行数据的文本文件时参数设置稍有不同,执行 `preprocess.sh` 脚本时需要加上 `-m True` 参数。tokenizer.perl 默认用来切分单词和标点符号,如果你不需要这个操作,在运行 `preprocess.sh` 时加上 `-t False` 参数即可。
+
+## 训练模型
+
+在这项任务中,我们使用了循环神经网络(RNN)的 LSTM 架构来训练情感分析模型。引入LSTM模型主要是为了克服梯度消失的问题。LSTM网络类似于具有隐藏层的标准循环神经网络,但是隐藏层中的每个普通节点被一个记忆单元替换。每个记忆单元包含四个主要的元素:输入门、具有自循环连接的神经元、忘记门和输出门。更多的细节可以在文献[4]中找到。LSTM架构的最大优点是它可以在长时间间隔内记忆信息,而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内,存储在记忆单元区块的历史信息被更新,用来迭代地学习以合理序列呈现的单词。
+
+
![LSTM](../../../doc/demo/sentiment_analysis/lstm.png)
+
图表 1. LSTM [3]
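+
+为帮助理解上述记忆单元的内部计算,下面用numpy给出单步LSTM的示意实现(仅为说明原理的草图,权重的组织方式为本文假设,并非PaddlePaddle的内部实现):
+
+```python
+import numpy as np
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def lstm_step(x, h_prev, c_prev, W, U, b):
+    # x: 当前词向量; h_prev/c_prev: 上一时间步的隐状态和记忆单元
+    # W(4H x D)、U(4H x H)、b(4H)按行堆叠四组门的参数(假设的组织方式)
+    H = h_prev.size
+    z = W.dot(x) + U.dot(h_prev) + b
+    i = sigmoid(z[0:H])          # 输入门:控制写入多少新信息
+    f = sigmoid(z[H:2 * H])      # 忘记门:控制保留多少历史信息
+    o = sigmoid(z[2 * H:3 * H])  # 输出门:控制输出多少记忆内容
+    g = np.tanh(z[3 * H:4 * H])  # 候选记忆,即具有自循环连接的神经元
+    c = f * c_prev + i * g       # 更新记忆单元
+    h = o * np.tanh(c)           # 计算当前隐状态
+    return h, c
+```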
+
+情感分析是自然语言理解中最典型的问题之一。它的目的是预测在一个序列中表达的情感态度。通常,仅仅是一些关键词,如形容词和副词,在预测序列或段落的情感中起主要作用。然而有些评论的上下文非常长,例如 IMDB 的数据集。我们之所以使用LSTM来执行这个任务,是因为其改进的设计和所具有的门机制。首先,它能够从词级到具有可变上下文长度的上下文级别来总结表示。第二,它可以在句子级别利用可扩展的上下文,而大多数方法只是利用n-gram级别的知识。第三,它直接学习段落表示,而不是组合上下文级别信息。
+
+在本演示中,我们提供两个网络,即双向LSTM和三层堆叠LSTM。
+
+#### 双向LSTM
+
+图2是双向LSTM网络,后面接全连接层和softmax层。
+
+
![BiLSTM](../../../doc/demo/sentiment_analysis/bi_lstm.jpg)
+
图 2. Bidirectional-LSTM
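+
+双向LSTM的要点是:一个LSTM按正序读入序列,另一个按逆序读入,两个方向的隐状态拼接后送入全连接层和softmax层。下面是一个numpy示意(沿用上文的 lstm_step 函数;维度与随机初始化均为演示用的假设):
+
+```python
+import numpy as np
+
+D, H, C = 8, 16, 2      # 词向量维度、隐层大小、类别数(假设值)
+rng = np.random.RandomState(0)
+
+def init_params():
+    return rng.randn(4 * H, D) * 0.1, rng.randn(4 * H, H) * 0.1, np.zeros(4 * H)
+
+Wf, Uf, bf = init_params()              # 正向LSTM参数
+Wb, Ub, bb = init_params()              # 反向LSTM参数
+xs = [rng.randn(D) for _ in range(5)]   # 一条长度为5的输入序列(词向量)
+
+h_f = c_f = h_b = c_b = np.zeros(H)
+for x in xs:                            # 正向扫描
+    h_f, c_f = lstm_step(x, h_f, c_f, Wf, Uf, bf)
+for x in reversed(xs):                  # 反向扫描
+    h_b, c_b = lstm_step(x, h_b, c_b, Wb, Ub, bb)
+
+feat = np.concatenate([h_f, h_b])       # 拼接两个方向的最终隐状态
+logits = rng.randn(C, 2 * H).dot(feat)  # 全连接层(示意)
+prob = np.exp(logits) / np.exp(logits).sum()  # softmax得到正/负面概率
+```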
+ +#### Stacked-LSTM +图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来,连接三个LSTM隐藏层,并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后,使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。 + +
![StackedLSTM](../../../doc/demo/sentiment_analysis/stacked_lstm.jpg)
+
图 3. Stacked-LSTM for sentiment analysis
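+
+其中"提取隐藏LSTM层所有时间步长的最大词向量"即按维度做max-over-time池化。给定最后一层LSTM各时间步的隐状态,其numpy示意如下(维度与随机数均为演示用的假设):
+
+```python
+import numpy as np
+
+T, H, C = 5, 16, 2                  # 时间步数、隐层大小、类别数(假设值)
+rng = np.random.RandomState(1)
+hidden_states = rng.randn(T, H)     # 最后一层LSTM每个时间步的隐状态
+
+seq_repr = hidden_states.max(axis=0)    # 逐维取时间轴上的最大值,得到定长的序列表示
+logits = rng.randn(C, H).dot(seq_repr)  # 具有softmax激活的全连接前馈层
+prob = np.exp(logits) / np.exp(logits).sum()
+print(prob)                             # 正/负面类别的预测概率
+```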
+
+**配置**
+
+进入 `demo/sentiment` 目录,`trainer_config.py` 是一个配置文件的例子,其中包含算法和网络配置。第一行从 `sentiment_net.py` 中导入预定义的网络。
+
+trainer_config.py:
+
+```python
+from sentiment_net import *
+
+data_dir = "./data/pre-imdb"
+# whether this config is used for test
+is_test = get_config_arg('is_test', bool, False)
+# whether this config is used for prediction
+is_predict = get_config_arg('is_predict', bool, False)
+dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
+
+################## Algorithm Config #####################
+
+settings(
+  batch_size=128,
+  learning_rate=2e-3,
+  learning_method=AdamOptimizer(),
+  regularization=L2Regularization(8e-4),
+  gradient_clipping_threshold=25
+)
+
+#################### Network Config ######################
+stacked_lstm_net(dict_dim, class_dim=class_dim,
+                 stacked_num=3, is_predict=is_predict)
+#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
+```
+
+* **数据定义**:
+  * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。
+  * 定义训练数据和测试数据提供者,这里使用了PaddlePaddle的Python接口来加载数据。想了解更多细节可以参考PyDataProvider部分的文档。
+
+* **算法配置**:
+  * 使用随机梯度下降(sgd)类的 adam 优化算法。
+  * 设置batch size大小为128。
+  * 设置全局学习率为2e-3。
+  * 设置L2正则化系数为8e-4,梯度截断阈值为25。
+* **网络配置**:
+  * dict_dim: 获取字典维度。
+  * class_dim: 设置类别数,IMDB有两个标签,即正面评价标签和负面评价标签。
+  * `stacked_lstm_net`: 预定义网络如图3所示,默认情况下使用此网络。
+  * `bidirectional_lstm_net`: 预定义网络,如图2所示。
+
+**训练**
+
+首先安装PaddlePaddle。然后使用下面的脚本 `train.sh` 来开启本地的训练。
+
+```
+cd demo/sentiment/
+./train.sh
+```
+
+train.sh:
+
+```
+config=trainer_config.py
+output=./model_output
+paddle train --config=$config \
+             --save_dir=$output \
+             --job=train \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --num_passes=10 \
+             --log_period=20 \
+             --dot_period=20 \
+             --show_parameter_stats_period=100 \
+             --test_all_data_in_one_period=1 \
+             2>&1 | tee 'train.log'
+```
+
+* \--config=$config: 设置网络配置。
+* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。
+* \--job=train: 设置工作模式为训练。
+* \--use\_gpu=false: 使用CPU训练,如果你安装GPU版本的PaddlePaddle,并想使用GPU来训练,设置为true。
+* \--trainer\_count=4: 设置线程数(或GPU个数)。
+* \--num\_passes=10: 设置pass数,PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。
+* \--log\_period=20: 每20个batch打印一次日志。
+* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。
+* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。
+
+如果运行成功,输出日志保存在路径 `demo/sentiment/train.log` 中,模型保存在目录 `demo/sentiment/model_output/` 中。输出日志说明如下:
+
+```
+Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875 CurrentEval: classification_error_evaluator=0.36875
+...
+Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
+Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
+```
+- Batch=xx: 表示训练了xx个Batch。
+- samples=xx: 表示训练了xx个样本。
+- AvgCost=xx: 从第0个batch到当前batch的平均损失。
+- CurrentCost=xx: 最近 log_period 个batch的平均损失。
+- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误率。
+- CurrentEval: classification\_error\_evaluator: 最近 log_period 个batch的分类错误率。
+- Pass=0: 完整遍历一次所有训练数据称为一个pass,Pass=0表示第一个pass。
+
+默认情况下,我们使用 `stacked_lstm_net` 网络,当传递相同的样本数时,它的收敛速度比 `bidirectional_lstm_net` 快。如果要使用双向LSTM,只需删除最后一行中的注释并把 `stacked_lstm_net` 注释掉。
+
+## 测试模型
+
+测试模型是指使用训练出的模型评估已标记的验证集。
+
+```
+cd demo/sentiment
+./test.sh
+```
+
+test.sh:
+
+```bash
+function get_best_pass() {
+  cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
+  sed -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
+  sort | head -n 1
+}
+
+log=train.log
+LOG=`get_best_pass $log`
+LOG=(${LOG})
+evaluate_pass="model_output/pass-${LOG[1]}"
+
+echo 'evaluating from pass '$evaluate_pass
+
+model_list=./model.list
+touch $model_list | echo $evaluate_pass > $model_list
+net_conf=trainer_config.py
+paddle train --config=$net_conf \
+             --model_list=$model_list \
+             --job=test \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --config_args=is_test=1 \
+             2>&1 | tee 'test.log'
+```
+
+函数 `get_best_pass` 依据分类错误率获得最佳模型进行测试。在本示例中,我们默认使用IMDB的测试数据集作为验证。与训练不同,它需要在这里指定 `--job=test` 和模型路径,即 `--model_list=$model_list`。如果运行成功,日志将保存在 `demo/sentiment/test.log` 中。例如,在我们的测试中,最好的模型是 `model_output/pass-00002`,分类误差是0.115645,如下:
+
+```
+Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
+```
+
+## 预测
+
+`predict.py` 脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的Python API。预测IMDB的未标记评论的一个实例如下:
+
+```
+cd demo/sentiment
+./predict.sh
+```
+predict.sh:
+
+```
+#Note the default model is pass-00002, you should make sure the model path
+#exists or change the model path.
+model=model_output/pass-00002/
+config=trainer_config.py
+label=data/pre-imdb/labels.list
+python predict.py \
+    -n $config \
+    -w $model \
+    -b $label \
+    -d data/pre-imdb/dict.txt \
+    -i data/aclImdb/test/pos/10014_7.txt
+```
+
+* `predict.py`: 预测接口脚本。
+* -n $config: 设置网络配置。
+* -w $model: 设置模型路径。
+* -b $label: 设置标签类别字典,这个字典是整数标签和字符串标签的一个对应。
+* -d data/pre-imdb/dict.txt: 设置字典文件。
+* -i data/aclImdb/test/pos/10014_7.txt: 设置一个要预测的示例文件。
+
+注意应该确保默认模型路径 `model_output/pass-00002` 存在或更改为其它模型路径。
+
+本示例的预测结果:
+
+```
+Loading parameters from model_output/pass-00002/
+./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
+```
+我们真诚地感谢您的关注,并欢迎您来参与贡献。
+
+## 参考文档
+[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010.
+[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.
+[3] Alex Graves, Marcus Liwicki, Santiago Fernández, Roman Bertolami, Horst Bunke, and Jürgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.
+[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019.
+[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015.
diff --git a/doc_cn/faq/index.rst b/doc_cn/faq/index.rst index 283607957ce63099a61d220478728654e993fe9a..3eb0e10ae2228740cd384270db5070e367f7007b 100644 --- a/doc_cn/faq/index.rst +++ b/doc_cn/faq/index.rst @@ -166,4 +166,51 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字 这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 - +7. *-cp27mu-linux_x86_64.whl is not a supported wheel on this platform. +----------------------------------------------------------------------- + +出现这个问题的主要原因是,系统编译wheel包的时候,使用的 :code:`wheel` 包是最新的, +而系统中的 :code:`pip` 包比较老。具体的解决方法是,更新 :code:`pip` 包并重新编译PaddlePaddle。 +更新 :code:`pip` 包的方法是\: + +.. code-block:: bash + + pip install --upgrade pip + +8. python相关的单元测试都过不了 +-------------------------------- + +如果出现以下python相关的单元测试都过不了的情况: + +.. code-block:: bash + + 24 - test_PyDataProvider (Failed) + 26 - test_RecurrentGradientMachine (Failed) + 27 - test_NetworkCompare (Failed) + 28 - test_PyDataProvider2 (Failed) + 32 - test_Prediction (Failed) + 33 - test_Compare (Failed) + 34 - test_Trainer (Failed) + 35 - test_TrainerOnePass (Failed) + 36 - test_CompareTwoNets (Failed) + 37 - test_CompareTwoOpts (Failed) + 38 - test_CompareSparse (Failed) + 39 - test_recurrent_machine_generation (Failed) + 40 - test_PyDataProviderWrapper (Failed) + 41 - test_config_parser (Failed) + 42 - test_swig_api (Failed) + 43 - layers_test (Failed) + +并且查询PaddlePaddle单元测试的日志,提示: + +.. code-block:: bash + + paddle package is already in your PYTHONPATH. But unittest need a clean environment. + Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'. + +解决办法是:卸载paddle包 :code:`pip uninstall paddle`。 + +原因是:单元测试使用了一个旧版本的python包,而没有测试到代码中实际修改的python包。即单元测试需要一个干净的环境: + +* 如果paddle包已经在python的site-packages里面了,那么单元测试时使用的paddle包,就是site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。 +* 即便设置了 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。 \ No newline at end of file diff --git a/doc_cn/faq/reduce_min_pool_size.py b/doc_cn/faq/reduce_min_pool_size.py index 2811b134b66b1ec55903d89e3f38a0cef8c9ef8d..5715397cc11e18246b8522fcc5b4f05780c9a0a7 100644 --- a/doc_cn/faq/reduce_min_pool_size.py +++ b/doc_cn/faq/reduce_min_pool_size.py @@ -3,4 +3,4 @@ def process(settings, filename): os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before. with open('%s.shuf' % filename, 'r') as f: for line in f: - yield get_sample_from_line(line) \ No newline at end of file + yield get_sample_from_line(line) diff --git a/doc_cn/faq/word2vec_config.py b/doc_cn/faq/word2vec_config.py index e347252476eab670abfa2cf2dc126d96b6e04857..866b40c3d4c96c1213b3f716f29b14dd38763edb 100644 --- a/doc_cn/faq/word2vec_config.py +++ b/doc_cn/faq/word2vec_config.py @@ -1,8 +1,12 @@ -... # the settings and define data provider is omitted. -DICT_DIM=3000 # dictionary dimension. -word_ids=data_layer('word_ids', size=DICT_DIM) +... # the settings and define data provider is omitted. +DICT_DIM = 3000 # dictionary dimension. 
+word_ids = data_layer('word_ids', size=DICT_DIM)
-emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
+emb = embedding_layer(
+    input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
 emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
 predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
-outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM)))
\ No newline at end of file
+outputs(
+    classification_cost(
+        input=predict, label=data_layer(
+            'label', size=DICT_DIM)))
diff --git a/doc_cn/faq/word2vec_dataprovider.py b/doc_cn/faq/word2vec_dataprovider.py
index a0a39080cece90c6c4096bba4396bfa91b3ef759..ec2753a7d01d7dd4d804c3bed0bac1be9c3fb3d3 100644
--- a/doc_cn/faq/word2vec_dataprovider.py
+++ b/doc_cn/faq/word2vec_dataprovider.py
@@ -1,8 +1,10 @@
-DICT_DIM=3000
+DICT_DIM = 3000
+
+
 @provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
 def process(settings, filename):
-  with open(filename) as f:
-    # yield word ids to predict inner word id
-    # such as [28, 29, 10, 4], 4
-    # It means the sentance is 28, 29, 4, 10, 4.
-    yield read_next_from_file(f)
\ No newline at end of file
+    with open(filename) as f:
+        # yield word ids to predict inner word id
+        # such as [28, 29, 10, 4], 4
+        # It means the sentence is 28, 29, 4, 10, 4.
+        yield read_next_from_file(f)
diff --git a/doc_cn/howto/how_to_write_docs/index.rst b/doc_cn/howto/how_to_write_docs/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a1f983b3405fa40f436885e40fca2ebbb4695491
--- /dev/null
+++ b/doc_cn/howto/how_to_write_docs/index.rst
@@ -0,0 +1,54 @@
+###############################
+如何贡献/修改PaddlePaddle的文档
+###############################
+
+PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
+
+
+如何构建PaddlePaddle的文档
+==========================
+
+PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。构建PaddlePaddle文档需要准备的环境相对较复杂,所以我们推荐基于Docker来构建PaddlePaddle的文档。
+
+
+使用Docker构建PaddlePaddle的文档
+--------------------------------
+
+使用Docker构建PaddlePaddle的文档,需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 `_ 。安装好Docker之后可以使用源码目录下的脚本构建文档,即
+
+.. code-block:: bash
+
+   cd TO_YOUR_PADDLE_CLONE_PATH
+   cd paddle/scripts/tools/build_docs
+   bash build_docs.sh
+
+编译完成后,该目录下会生成如下两个子目录\:
+
+* doc 英文文档目录
+* doc_cn 中文文档目录
+
+打开浏览器访问对应目录下的index.html即可访问本地文档。
+
+.. code-block:: bash
+
+   open doc_cn/index.html
+
+
+直接构建PaddlePaddle的文档
+--------------------------
+
+TBD
+
+如何书写PaddlePaddle的文档
+==========================
+
+TBD
+
+如何更新www.paddlepaddle.org文档
+================================
+
+TBD
+
+
+.. _cmake: https://cmake.org/
+.. 
_sphinx: http://www.sphinx-doc.org/en/1.4.8/
\ No newline at end of file
diff --git a/doc_cn/index.rst b/doc_cn/index.rst
index d2d50fbdb47f27ad5ad8d22215a9f0993145430f..f1398206fddffca77f583c195e00034e55932639 100644
--- a/doc_cn/index.rst
+++ b/doc_cn/index.rst
@@ -3,8 +3,9 @@ PaddlePaddle文档
 使用指南
 --------
-
+* `介绍 `_
 * `快速入门 `_
+* `基本使用概念 `_
 * `编译与安装 `_
 * `用户接口 `_
 * `使用示例 `_
@@ -14,6 +15,7 @@ PaddlePaddle文档
 开发指南
 --------
 * `新写Layer <../doc/dev/new_layer/index.html>`_
+* `如何贡献文档 `_
 
 算法教程
 --------
diff --git a/doc_cn/introduction/index.md b/doc_cn/introduction/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..164cb7d4943dfbfcc00a2df7329ae2a877b2d703
--- /dev/null
+++ b/doc_cn/introduction/index.md
@@ -0,0 +1,105 @@
+# 简介
+
+PaddlePaddle 是起源于百度的开源深度学习平台。它是简单易用的:你可以通过简单的十数行配置搭建经典的神经网络模型;它也是高效强大的:PaddlePaddle可以支撑复杂集群环境下超大模型的训练,令你受益于深度学习的前沿成果。在百度内部,已经有大量产品线使用了基于PaddlePaddle的深度学习技术。
+
+这份简短的介绍将向你展示如何利用PaddlePaddle解决一个经典的学习问题。
+
+## 1. 一个经典的任务
+
+让我们从一个基础问题开始:单变量的线性回归。问题假定观测到了一批二维空间上的点 `(x, y)`,并且已知 `x` 和 `y` 之间存在着某种线性关系,我们的目标是通过观测数据还原这个线性关系。作为一个简单基础的模型,线性回归却有着广泛的应用场景。比如可以想象一个资产定价的简化场景,其中 `x` 对应于房屋的大小,`y` 对应于房屋价格。我们可以通过观察市场上房屋的情况获得二者之间的关系,从而为新房屋的定价提供参考。
+
+
+## 2. 准备数据
+
+假设变量 `X` 和 `Y` 的真实关系为: `Y = 2X + 0.3`,这里展示如何使用观测数据还原这一线性关系。如下Python代码将随机产生2000个观测点,它们将被用作PaddlePaddle的输入。产生PaddlePaddle的输入数据和写一段普通的Python脚本几乎一样,你唯一需要增加的就是定义输入数据的类型。
+
+```python
+# -*- coding:utf-8 -*-
+# dataprovider.py
+from paddle.trainer.PyDataProvider2 import *
+import random
+
+# 定义输入数据的类型: 2个浮点数
+@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+def process(settings, input_file):
+    for i in xrange(2000):
+        x = random.random()
+        yield [x], [2*x+0.3]
+```
+
+## 3. 训练模型
+
+为了还原 `Y = 2X + 0.3`,我们先从一条随机的直线 `Y' = wX + b` 开始,然后利用观测数据调整 `w` 和 `b`,使得 `Y'` 和 `Y` 的差距不断减小,最终趋于相同。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。
+
+在PaddlePaddle里,该模型的网络配置如下。
+
+```python
+# -*- coding:utf-8 -*-
+# trainer_config.py
+from paddle.trainer_config_helpers import *
+
+# 1. 定义数据来源,调用上面的process函数获得观测数据
+data_file = 'empty.list'
+with open(data_file, 'w') as f: f.writelines(' ')
+define_py_data_sources2(train_list=data_file, test_list=None,
+        module='dataprovider', obj='process', args={})
+
+# 2. 学习算法。控制如何改变模型参数 w 和 b
+settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+# 3. 
神经网络配置 +x = data_layer(name='x', size=1) +y = data_layer(name='y', size=1) +# 线性计算单元: y_predict = wx + b +y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) +# 损失计算,度量 y_predict 和真实 y 之间的差距 +cost = regression_cost(input=y_predict, label=y) +outputs(cost) +``` +这段简短的配置展示了PaddlePaddle的基本用法: + +- 首先,第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的`process`函数)进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件,所以放一个空列表(`empty.list`)即可。 + +- 第二部分主要是选择学习算法,它定义了模型参数如何改变。PaddlePaddle提供了很多优秀的学习算法,但这里使用一个简单的基于momentum的算法就足够了,它每次读取12个数据进行计算和模型更新。 + +- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络单元(Layer),所以很多时候你需要做的只是声明正确的网络单元并把它们拼接起来。这里使用了三种网络单元: + - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到下游的其它单元。这里数据层有两个,分别对应于变量 `X` 和 `Y`。 + - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以挖掘复杂的数据关系。 + - **回归损失层**:回归损失层 `regression_cost`是众多损失函数层的一种,它们在训练过程作为网络的出口,用来计算模型的表现,并指导模型参数的改变。 + +这样定义了网络结构并保存为`trainer_config.py`之后,运行训练命令即可: + ``` + paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 + ``` + +PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加损失函数的输出在不断的减小,这意味着模型在不断的改进,直到逼近真实解:` Y = 2X + 0.3 ` + +## 4. 模型检验 + +训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用模型对另外一组数据进行预测,然后评价预测的效果。但在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。 + +PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。 + +```python +import numpy as np +import os + +def load(file_name): + with open(file_name, 'rb') as f: + f.read(16) # skip header for float type. + return np.fromfile(f, dtype=np.float32) + +print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) +# w=1.999743, b=0.300137 +``` +
![](./parameters.png)
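+
+图中的收敛曲线可以用类似下面的脚本画出(示意脚本,假设训练已生成 output/pass-00000 到 output/pass-00029 目录,绘图使用matplotlib):
+
+```python
+import numpy as np
+import matplotlib.pyplot as plt
+
+def load(file_name):
+    with open(file_name, 'rb') as f:
+        f.read(16)  # skip header for float type.
+        return np.fromfile(f, dtype=np.float32)
+
+# 读出每个pass保存下来的 w 和 b,观察其随训练的变化
+ws = [load('output/pass-%05d/w' % i)[0] for i in range(30)]
+bs = [load('output/pass-%05d/b' % i)[0] for i in range(30)]
+plt.plot(ws, label='w (true value: 2.0)')
+plt.plot(bs, label='b (true value: 0.3)')
+plt.xlabel('pass')
+plt.ylabel('parameter value')
+plt.legend()
+plt.savefig('parameters.png')
+```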
+ +从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型重合。 + +这样,我们就完成了对单变量线性回归问题的解决:将数据输入PaddlePaddle,训练模型,最后验证结果。 + +## 5. 推荐后续阅读 + +- 安装/编译:PaddlePaddle的安装与编译文档。 +- 快速入门 :使用商品评论分类任务,系统性的介绍如何一步步改进,最终得到产品级的深度模型。 +- 示例:各种实用案例,涵盖图像、文本、推荐等多个领域。 diff --git a/doc_cn/introduction/parameters.png b/doc_cn/introduction/parameters.png new file mode 100644 index 0000000000000000000000000000000000000000..2ec67480951e21f0400bce1c34b3108dcd65c18c Binary files /dev/null and b/doc_cn/introduction/parameters.png differ diff --git a/doc_cn/ui/data_provider/mnist_config.py b/doc_cn/ui/data_provider/mnist_config.py index 7ba344338c374a7f9e7e4faa804e2e124577c0be..39becff03b08f5e75b8503aaf01e782d2b0fb3be 100644 --- a/doc_cn/ui/data_provider/mnist_config.py +++ b/doc_cn/ui/data_provider/mnist_config.py @@ -1,8 +1,9 @@ from paddle.trainer_config_helpers import * -define_py_data_sources2(train_list='train.list', - test_list=None, - module='mnist_provider', - obj='process') +define_py_data_sources2( + train_list='train.list', + test_list=None, + module='mnist_provider', + obj='process') img = data_layer(name='pixel', size=784) label = data_layer(name='label', size=10) diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc_cn/ui/data_provider/mnist_provider.dict.py index bf13b56372b56a1e810fad159cd51371ef46c468..2ba0b126a0d6239f84950e130410aaaa6e1f24cd 100644 --- a/doc_cn/ui/data_provider/mnist_provider.dict.py +++ b/doc_cn/ui/data_provider/mnist_provider.dict.py @@ -2,10 +2,9 @@ from paddle.trainer.PyDataProvider2 import * # Define a py data provider -@provider(input_types={ - 'pixel': dense_vector(28 * 28), - 'label': integer_value(10) -}) +@provider( + input_types={'pixel': dense_vector(28 * 28), + 'label': integer_value(10)}) def process(settings, filename): # settings is not used currently. f = open(filename, 'r') # open one of training file diff --git a/doc_cn/ui/data_provider/mnist_provider.py b/doc_cn/ui/data_provider/mnist_provider.py index 92f1915c1072562a174a62b436de8f5b39dab2d4..8b828641d55735e67ca634107d5b239150649651 100644 --- a/doc_cn/ui/data_provider/mnist_provider.py +++ b/doc_cn/ui/data_provider/mnist_provider.py @@ -2,10 +2,7 @@ from paddle.trainer.PyDataProvider2 import * # Define a py data provider -@provider(input_types=[ - dense_vector(28 * 28), - integer_value(10) -]) +@provider(input_types=[dense_vector(28 * 28), integer_value(10)]) def process(settings, filename): # settings is not used currently. f = open(filename, 'r') # open one of training file diff --git a/doc_cn/ui/data_provider/sentimental_config.py b/doc_cn/ui/data_provider/sentimental_config.py index 051f75e32b5c0b1f36d27a54c42db94a4682ce7b..7ce71608a2372b2484ae40ccf01f0621728ddef2 100644 --- a/doc_cn/ui/data_provider/sentimental_config.py +++ b/doc_cn/ui/data_provider/sentimental_config.py @@ -3,9 +3,12 @@ from paddle.trainer_config_helpers import * dictionary = dict() ... # read dictionary from outside -define_py_data_sources2(train_list='train.list', test_list=None, - module='sentimental_provider', obj='process', - # above codes same as mnist sample. - args={ # pass to provider. - 'dictionary': dictionary - }) +define_py_data_sources2( + train_list='train.list', + test_list=None, + module='sentimental_provider', + obj='process', + # above codes same as mnist sample. + args={ # pass to provider. 
+ 'dictionary': dictionary + }) diff --git a/doc_cn/ui/data_provider/sentimental_provider.py b/doc_cn/ui/data_provider/sentimental_provider.py index bda37d7722a0bb98c2c681c790bb308c0e146515..0fb0bb88e95a230f01f18b78ebb37b659c3768f1 100644 --- a/doc_cn/ui/data_provider/sentimental_provider.py +++ b/doc_cn/ui/data_provider/sentimental_provider.py @@ -12,7 +12,8 @@ def on_init(settings, dictionary, **kwargs): # The text is a sequence of integer values, and each value is a word id. # The whole sequence is the sentences that we want to predict its # sentimental. - integer_value(len(dictionary), seq_type=SequenceType), # text input + integer_value( + len(dictionary), seq_type=SequenceType), # text input # label positive/negative integer_value(2) diff --git a/paddle/.common_test_util.sh b/paddle/.common_test_util.sh index dec22e45619fb5d393be96e929a7e301bf266224..dc1525061590808e3cc9c7b606aca5d5d9195a3a 100644 --- a/paddle/.common_test_util.sh +++ b/paddle/.common_test_util.sh @@ -117,4 +117,4 @@ set_port() fi done -} \ No newline at end of file +} diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index cae0f64400a7e618bffb4f7fc6a044011baf04d4..fb3af8ea92feed96a9669bfb29ef7353a256c308 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -17,5 +17,3 @@ endif() if(WITH_SWIG_PY) add_subdirectory(api) endif() - - diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp index 8f73e7626042c9b138625ec9db599fdc2e42cc9b..b539374cd4aa5a9510cdb728c1b22edf65a9f880 100644 --- a/paddle/api/Arguments.cpp +++ b/paddle/api/Arguments.cpp @@ -12,29 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" #include "paddle/parameter/Argument.h" -struct ArgumentsPrivate { - std::vector outputs; - - inline paddle::Argument& getArg(size_t idx) throw(RangeError) { - if (idx < outputs.size()) { - return outputs[idx]; - } else { - RangeError e; - throw e; - } - } - - template - std::shared_ptr& cast(void* rawPtr) const { - return *(std::shared_ptr*)(rawPtr); - } -}; - size_t Arguments::getSlotNum() const { return m->outputs.size(); } Arguments* Arguments::createArguments(size_t slotNum) { @@ -129,7 +111,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx, } void Arguments::setSlotSubSequenceStartPositions( - size_t idx, IVector *vec) throw(RangeError) { + size_t idx, IVector* vec) throw(RangeError) { auto& a = m->getArg(idx); auto& v = m->cast(vec->getSharedPtr()); a.subSequenceStartPositions = std::make_shared(v); diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index fe0da763514a65911b30f42159c6fce7057d18a6..9b2d122a09adabd766014a9d21a167eec5b2de32 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -40,6 +40,8 @@ configure_file( generate_python_api(python_swig_sources) +file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) + # TODO(yuyang18) : make wheel name calculated by cmake add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel @@ -55,6 +57,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp paddle_trainer paddle_api paddle_cuda + ${PY_PADDLE_PYTHON_FILES} ) install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp index c5ee784a0bda09cd7a2b857d98431bec67afcae4..bc40d871d180a6bfe21200c866181dc161f5f078 100644 --- 
a/paddle/api/ConfigParser.cpp +++ b/paddle/api/ConfigParser.cpp @@ -12,19 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" #include "paddle/trainer/Trainer.h" -struct TrainerConfigPrivate { - std::shared_ptr conf; - TrainerConfigPrivate() : conf(std::make_shared()) {} -}; - -struct ModelConfigPrivate { - std::shared_ptr conf; -}; - struct ParameterConfigPrivate { paddle::ParameterPtr parameter; paddle::ParameterConfig config; @@ -39,19 +30,6 @@ struct ParameterConfigPrivate { } }; -struct OptimizationConfigPrivate { - std::shared_ptr trainer_config; - paddle::OptimizationConfig config; - - paddle::OptimizationConfig& getConfig() { - if (trainer_config != nullptr) { - return *trainer_config->mutable_opt_config(); - } else { - return config; - } - } -}; - TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {} TrainerConfig::~TrainerConfig() { delete m; } @@ -59,10 +37,18 @@ TrainerConfig::~TrainerConfig() { delete m; } TrainerConfig* TrainerConfig::createFromTrainerConfigFile( const std::string& confPath) { LOG(INFO) << "load trainer config from " << confPath; - paddle::TrainerConfigHelper helper(confPath); - //! TODO(yuyang18): Make TrainerConfigPrivate to TrainerConfigHelper + auto conf = std::make_shared(confPath); auto retv = new TrainerConfig(); - *retv->m->conf = helper.getConfig(); + retv->m->conf = conf; + return retv; +} + +TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) { + auto retv = new TrainerConfig(); + paddle::TrainerConfig trainerConfigProto; + auto conf = std::make_shared(trainerConfigProto); + CHECK(conf->getMutableConfig().ParseFromString(str)); + retv->m->conf = conf; return retv; } @@ -76,10 +62,6 @@ ModelConfig* TrainerConfig::getModelConfig() const { return retv; } -void* ModelConfig::getPaddleModelConfig() const { - return m->conf->mutable_model_config(); -} - ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {} ParameterConfig::~ParameterConfig() { @@ -132,8 +114,6 @@ OptimizationConfig* TrainerConfig::getOptimizationConfig() const { return opt_config; } -void* OptimizationConfig::getRawPtr() { return &m->getConfig(); } - OptimizationConfig* OptimizationConfig::createFromProtoString( const std::string& str) { auto conf = new OptimizationConfig(); diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp index 6f1d63575a80f3011cc678df897d54d602edfb3b..9a4846d80980e23e97f89b6134e15af71207ae6b 100644 --- a/paddle/api/GradientMachine.cpp +++ b/paddle/api/GradientMachine.cpp @@ -12,32 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "PaddleAPI.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" +#include "PaddleAPIPrivate.h" + #include "paddle/gserver/gradientmachines/NeuralNetwork.h" #include "Internal.h" std::vector GradientMachine::defaultParamTypes = { PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}; -struct GradientMachinePrivate { - std::shared_ptr machine; - - template - inline T& cast(void* ptr) { - return *(T*)(ptr); - } -}; - GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {} GradientMachine::~GradientMachine() { delete m; } GradientMachine* GradientMachine::createFromPaddleModelPtr( - void* confPtr, GradientMatchineCreateMode mode, + const void* confPtr, + GradientMatchineCreateMode mode, const std::vector& types) { - auto& conf = *(paddle::ModelConfig*)(confPtr); + auto& conf = *(const paddle::ModelConfig*)(confPtr); std::vector realTypes; staticCastVector(&realTypes, types); auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes); @@ -52,7 +44,8 @@ GradientMachine* GradientMachine::createFromPaddleModelPtr( } GradientMachine* GradientMachine::createByConfigProtoStr( - const std::string& protoStr, GradientMatchineCreateMode mode, + const std::string& protoStr, + GradientMatchineCreateMode mode, const std::vector& types) { paddle::ModelConfig conf; conf.ParseFromString(protoStr); @@ -64,13 +57,15 @@ GradientMachine* GradientMachine::createByConfigProtoStr( } GradientMachine* GradientMachine::createByModelConfig( - ModelConfig* conf, GradientMatchineCreateMode mode, + ModelConfig* conf, + GradientMatchineCreateMode mode, const std::vector& types) { - auto confPtr = (paddle::ModelConfig*)conf->getPaddleModelConfig(); + auto confPtr = &conf->m->conf->getModelConfig(); return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types); } -void GradientMachine::forward(const Arguments& inArgs, Arguments* outArgs, +void GradientMachine::forward(const Arguments& inArgs, + Arguments* outArgs, PassType passType) { auto& in = m->cast>(inArgs.getInternalArgumentsPtr()); @@ -107,7 +102,8 @@ void GradientMachine::backward(const UpdateCallback& callback) { } void GradientMachine::forwardBackward(const Arguments& inArgs, - Arguments* outArgs, PassType passType, + Arguments* outArgs, + PassType passType, const UpdateCallback& callback) { auto& in = m->cast>(inArgs.getInternalArgumentsPtr()); @@ -137,7 +133,7 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) { void GradientMachine::randParameters() { m->machine->randParameters(); } Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const - throw(UnsupportError) { + throw(UnsupportError) { auto nn = std::dynamic_pointer_cast(m->machine); if (nn) { auto mat = nn->getLayerOutput(layerName); @@ -148,8 +144,11 @@ Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const } SequenceGenerator* GradientMachine::asSequenceGenerator( - const std::vector& dict, size_t begin_id, size_t end_id, - size_t max_length, size_t beam_size) { + const std::vector& dict, + size_t begin_id, + size_t end_id, + size_t max_length, + size_t beam_size) { SequenceGenerator* r = SequenceGenerator::createByGradientMachineSharedPtr(&m->machine); r->setDict(dict); diff --git a/paddle/api/Internal.h b/paddle/api/Internal.h index b990f650be9fa401898a8c6d10c21d9c90eb728a..66a13bc603ed5098997f168d3f527160ac3822ef 100644 --- a/paddle/api/Internal.h +++ b/paddle/api/Internal.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "PaddleAPI.h" @@ -23,7 +22,8 @@ limitations under the License. */ template void staticCastVector(std::vector* dest, const std::vector& src) { dest->resize(src.size()); - std::transform(src.begin(), src.end(), dest->begin(), [](T1 t){ - return static_cast(t); - }); + std::transform(src.begin(), + src.end(), + dest->begin(), + [](T1 t) { return static_cast(t); }); } diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp index 6a79f83495a56907fec9d3f77b581eddd3a8baeb..f257ee65aa4a12dfcd1914ddbf0e16461a9b128c 100644 --- a/paddle/api/Matrix.cpp +++ b/paddle/api/Matrix.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PaddleAPI.h" #include "paddle/math/Matrix.h" #include "paddle/math/SparseMatrix.h" @@ -44,15 +43,35 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) { return m; } -Matrix* Matrix::createDense(const std::vector& data, size_t height, - size_t width, bool useGpu) { +Matrix* Matrix::createDense(const std::vector& data, + size_t height, + size_t width, + bool useGpu) { auto m = new Matrix(); m->m->mat = paddle::Matrix::create(height, width, useGpu); m->m->mat->copyFrom(data.data(), data.size()); return m; } -Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2, +Matrix* Matrix::createDenseFromNumpy(float* data, + int dim1, + int dim2, + bool copy, + bool useGpu) throw(UnsupportError) { + if (useGpu) { + /// Gpu mode only supports copy=True + if (!copy) { + throw UnsupportError("Gpu mode only supports copy=True"); + } + return Matrix::createGpuDenseFromNumpy(data, dim1, dim2); + } else { + return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy); + } +} + +Matrix* Matrix::createCpuDenseFromNumpy(float* data, + int dim1, + int dim2, bool copy) { auto m = new Matrix(); if (copy) { @@ -71,12 +90,20 @@ Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) { return m; } -Matrix* Matrix::createSparse(size_t height, size_t width, size_t nnz, - bool isNonVal, bool isTrans, bool useGpu) { +Matrix* Matrix::createSparse(size_t height, + size_t width, + size_t nnz, + bool isNonVal, + bool isTrans, + bool useGpu) { auto m = new Matrix(); m->m->mat = paddle::Matrix::createSparseMatrix( - height, width, nnz, isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE, - isTrans, useGpu); + height, + width, + nnz, + isNonVal ? 
paddle::NO_VALUE : paddle::FLOAT_VALUE, + isTrans, + useGpu); return m; } @@ -207,7 +234,8 @@ FloatArray Matrix::getData() const { } void Matrix::sparseCopyFrom( - const std::vector& rows, const std::vector& cols, + const std::vector& rows, + const std::vector& cols, const std::vector& vals) throw(UnsupportError) { auto cpuSparseMat = std::dynamic_pointer_cast(m->mat); @@ -226,7 +254,8 @@ void Matrix::sparseCopyFrom( void* Matrix::getSharedPtr() const { return &m->mat; } -void Matrix::toNumpyMatInplace(float** view_data, int* dim1, +void Matrix::toNumpyMatInplace(float** view_data, + int* dim1, int* dim2) throw(UnsupportError) { auto cpuMat = std::dynamic_pointer_cast(m->mat); if (cpuMat) { @@ -237,7 +266,8 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1, throw UnsupportError(); } } -void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, +void Matrix::copyToNumpyMat(float** view_m_data, + int* dim1, int* dim2) throw(UnsupportError) { static_assert(sizeof(paddle::real) == sizeof(float), "Currently PaddleAPI only support for single " @@ -255,8 +285,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, } else if (auto gpuMat = dynamic_cast(m->mat.get())) { auto src = gpuMat->getData(); auto dest = *view_m_data; - hl_memcpy_device2host(dest, src, - sizeof(paddle::real) * (*dim1) * (*dim2)); + hl_memcpy_device2host( + dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); } else { LOG(WARNING) << "Unexpected Situation"; throw UnsupportError(); @@ -264,7 +294,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, } } -void Matrix::copyFromNumpyMat(float* data, int dim1, +void Matrix::copyFromNumpyMat(float* data, + int dim1, int dim2) throw(UnsupportError, RangeError) { if (isSparse()) { throw UnsupportError(); diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.swig index a09f24ce1ccf5d026bf9431255c258483854b74b..6a0fbc537d9345f2221ab65d90733f4696be6880 100644 --- a/paddle/api/Paddle.swig +++ b/paddle/api/Paddle.swig @@ -4,6 +4,13 @@ #define SWIG_FILE_WITH_INIT #include "api/PaddleAPI.h" %} + +%include "exception.i" +%typemap(throws) UnsupportError %{ + SWIG_exception(SWIG_RuntimeError, $1.what()); + SWIG_fail; +%} + %include "std_vector.i" %include "std_pair.i" #ifdef SWIGPYTHON @@ -133,14 +140,21 @@ namespace std { %newobject Matrix::createZero; %newobject Matrix::createSparse; %newobject Matrix::createDense; +%newobject Matrix::createDenseFromNumpy; +%newobject Matrix::createCpuDenseFromNumpy; +%newobject Matrix::createGpuDenseFromNumpy; %newobject Vector::createZero; %newobject Vector::create; +%newobject Vector::createVectorFromNumpy; %newobject Vector::createCpuVectorFromNumpy; %newobject Vector::createGpuVectorFromNumpy; %newobject IVector::createZero; %newobject IVector::create; +%newobject IVector::createVectorFromNumpy; +%newobject IVector::createCpuVectorFromNumpy; +%newobject IVector::createGpuVectorFromNumpy; %newobject Trainer::createByCommandLine; -%newobject Trainer::getNetworkOutput; +%newobject Trainer::getForwardOutput; %newobject Trainer::getLayerOutput; %newobject Arguments::getSlotValue; %newobject Arguments::getSlotIds; diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index b3140617af188b6a80067d9dbd312bd9e9155adf..c07facdb1292b34ac31247160a4347ea359e718b 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include #include #include +#include #include #include "paddle/utils/GlobalConstants.h" #include "paddle/utils/TypeDefs.h" @@ -42,6 +42,12 @@ using namespace paddle::enumeration_wrapper; // NOLINT */ void initPaddle(int argc, char** argv); +/// Return FLAGS_use_gpu +bool isUsingGpu(); + +/// Set the Flags_use_gpu to the given parameter +void setUseGpu(bool useGpu); + /// Return true if this py_paddle is compiled in GPU Version bool isGpuVersion(); @@ -52,7 +58,11 @@ class IOError {}; class RangeError {}; /// Not support Error, such as access GPU memory directly, etc. -class UnsupportError {}; +class UnsupportError : public std::runtime_error { +public: + UnsupportError() : std::runtime_error(" "){}; + UnsupportError(const std::string& message) : std::runtime_error(message){}; +}; /// This type will map to python's list of float. struct FloatArray { @@ -101,7 +111,9 @@ public: /** * Create A Matrix with height,width, which is filled by zero. */ - static Matrix* createZero(size_t height, size_t width, bool useGpu = false); + static Matrix* createZero(size_t height, + size_t width, + bool useGpu = isUsingGpu()); /** * Create Sparse Matrix. @@ -112,9 +124,12 @@ public: * * @note the default sparse type is SPARSE_CSR. */ - static Matrix* createSparse(size_t height, size_t width, size_t nnz, - bool isNonVal = true, bool trans = false, - bool useGpu = false); + static Matrix* createSparse(size_t height, + size_t width, + size_t nnz, + bool isNonVal = true, + bool trans = false, + bool useGpu = isUsingGpu()); /** * Create Dense Matrix. @@ -122,8 +137,17 @@ public: * @param data list of float should be passed in python. * @note the value will be copy into a new matrix. */ - static Matrix* createDense(const std::vector& data, size_t height, - size_t width, bool useGpu = false); + static Matrix* createDense(const std::vector& data, + size_t height, + size_t width, + bool useGpu = isUsingGpu()); + + static Matrix* createDenseFromNumpy( + float* data, + int dim1, + int dim2, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); /** * Create Cpu Dense Matrix from numpy matrix, dtype=float32 @@ -134,7 +158,9 @@ public: * @param copy true if copy into a new matrix, false will create * matrix inplace. */ - static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2, + static Matrix* createCpuDenseFromNumpy(float* data, + int dim1, + int dim2, bool copy = false); /// Create Gpu Dense Matrix from numpy matrix, dtype=float32 @@ -154,11 +180,13 @@ public: * numpy_mat = m.toNumpyMat() * @endcode */ - void toNumpyMatInplace(float** view_data, int* dim1, + void toNumpyMatInplace(float** view_data, + int* dim1, int* dim2) throw(UnsupportError); /// Copy To numpy mat. - void copyToNumpyMat(float** view_m_data, int* dim1, + void copyToNumpyMat(float** view_m_data, + int* dim1, int* dim2) throw(UnsupportError); /// Copy From Numpy Mat @@ -221,21 +249,28 @@ public: ~Vector(); /// Create Vector filled with zero. - static Vector* createZero(size_t sz, bool useGpu = false); + static Vector* createZero(size_t sz, bool useGpu = isUsingGpu()); /** * Create Vector from list of float. * * It will create a new vector, and copy data into it. 
*/ - static Vector* create(const std::vector& data, bool useGpu = false); - + static Vector* create(const std::vector& data, + bool useGpu = isUsingGpu()); + + static Vector* createVectorFromNumpy( + float* data, + int dim, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); /** * Create Cpu Vector from numpy array, which dtype=float32 * * If copy is false, it will create vector inplace. */ - static Vector* createCpuVectorFromNumpy(float* data, int dim, + static Vector* createCpuVectorFromNumpy(float* data, + int dim, bool copy = false); /// Create Gpu Vector from numpy array, which dtype=float32 @@ -259,6 +294,9 @@ public: /// Return is GPU vector or not. bool isGpu() const; + /// Return a list of float, the memory is alloced and copied. + FloatArray getData() const; + /// __len__ in python size_t getSize() const; @@ -279,25 +317,33 @@ class IVector { public: /// Create IVector filled with zero - static IVector* createZero(size_t sz, bool useGpu = false); + static IVector* createZero(size_t sz, bool useGpu = isUsingGpu()); /** * Create IVector from list of int. * It will create a new vector, and copy data into it. */ - static IVector* create(const std::vector& data, bool useGpu = false); + static IVector* create(const std::vector& data, + bool useGpu = isUsingGpu()); + + static IVector* createVectorFromNumpy( + int* data, + int dim, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); /** * Create Cpu IVector from numpy array, which dtype=int32 * * If copy is false, it will create vector inplace */ - static IVector* createCpuVectorFromNumpy(int* data, int dim, + static IVector* createCpuVectorFromNumpy(int* data, + int dim, bool copy = false); /** * Create Gpu IVector from numpy array, which dtype=int32 */ - static IVector* createGpuVectorFromNumy(int* data, int dim); + static IVector* createGpuVectorFromNumpy(int* data, int dim); /// Cast to numpy array inplace. 
void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError); @@ -446,7 +492,6 @@ struct OptimizationConfigPrivate; class OptimizationConfig { DISABLE_COPY_AND_ASSIGN(OptimizationConfig); OptimizationConfig(); - void* getRawPtr(); public: static OptimizationConfig* createFromProtoString(const std::string& str); @@ -462,6 +507,7 @@ private: friend class TrainerConfig; friend class ParameterOptimizer; + friend class Trainer; }; struct ParameterPrivate; @@ -515,8 +561,6 @@ public: virtual ~ModelConfig(); private: - void* getPaddleModelConfig() const; - ModelConfigPrivate* m; friend class TrainerConfig; friend struct TrainerConfigPrivate; @@ -539,6 +583,7 @@ public: static TrainerConfig* createFromTrainerConfigFile( const std::string& configPath); + static TrainerConfig* createFromProtoString(const std::string& str); ModelConfig* getModelConfig() const; @@ -546,6 +591,7 @@ public: private: TrainerConfigPrivate* m; + friend class Trainer; }; /** @@ -576,7 +622,8 @@ class ParameterTraverseCallback { public: ~ParameterTraverseCallback(); - void apply(const std::vector& vecs, const ParameterConfig& config, + void apply(const std::vector& vecs, + const ParameterConfig& config, size_t sparseId); private: @@ -609,7 +656,8 @@ public: void finishBatch(); - void update(const std::vector& vecs, const ParameterConfig& conf, + void update(const std::vector& vecs, + const ParameterConfig& conf, size_t sparseId = NO_SPARSE_ID); std::vector getParameterTypes() const; @@ -649,7 +697,8 @@ public: * model config by TrainerConfig */ static GradientMachine* createByModelConfig( - ModelConfig* conf, GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, + ModelConfig* conf, + GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, const std::vector& parameterTypes = defaultParamTypes); /** @@ -672,7 +721,8 @@ public: /** * Combine forward/backward */ - void forwardBackward(const Arguments& inArgs, Arguments* outArgs, + void forwardBackward(const Arguments& inArgs, + Arguments* outArgs, PassType passType, const UpdateCallback& callback = UpdateCallback()); @@ -693,18 +743,22 @@ public: */ SequenceGenerator* asSequenceGenerator( const std::vector& dict = std::vector(), - size_t begin_id = 0UL, size_t end_id = 0UL, size_t max_length = 100UL, + size_t begin_id = 0UL, + size_t end_id = 0UL, + size_t max_length = 100UL, size_t beam_size = -1UL); private: GradientMachinePrivate* m; static GradientMachine* createFromPaddleModelPtr( - void* confPtr, GradientMatchineCreateMode mode, + const void* confPtr, + GradientMatchineCreateMode mode, const std::vector& types); // Not to use c++ 11 init-list, so we use static var as function default arg. static std::vector defaultParamTypes; + friend class Trainer; }; struct TrainerPrivate; @@ -712,6 +766,7 @@ class Trainer { private: TrainerPrivate* m; Trainer(); + Trainer(TrainerConfig* optConfig, GradientMachine* gm); DISABLE_COPY_AND_ASSIGN(Trainer); public: @@ -720,38 +775,42 @@ public: /// Create A Trainer By TrainerConfig. using paddle command line. static Trainer* createByCommandLine() throw(IOError); - /// Start Train. + static Trainer* create(TrainerConfig* optConfig, + GradientMachine* gm) throw(IOError); + + /// Start training void startTrain(); + + /// Finish training void finishTrain(); - /// Start Pass. + /// Start a pass. 
   void startTrainPass();
-  void finishTrainPass();
-  void setBatchSize(size_t batchSize);
+  /// Finish a pass
+  void finishTrainPass();

   /**
    * Train one batch,
    *
-   * @param batchSize -1 wiil use command line or batch size set before,
-   *                  otherwise use this batchSize for train.
-   *
    * @return true if all batch finished.
    */
-  bool trainOneBatch(size_t batchSize = -1UL);
+  bool trainOneBatch(size_t batchSize);

-  bool prepareBatchData(size_t batchSize = -1UL);
+  void trainOneDataBatch(size_t batchSize, const Arguments& args);

-  void finishTrainOneBatch();
+  void startTestPeriod();
+  void testOneDataBatch(size_t batchSize, const Arguments& args);
+  void finishTestPeriod();

-  void forwardOneBatch() throw(UnsupportError);
+  void forwardOneBatch(size_t batchSize);

-  Arguments* getNetworkOutput();
+  Arguments* getForwardOutput();

   Matrix* getLayerOutput(const std::string& layerName);
 };

-/// The N-Best results generated from one input sequence.
+/// the N-Best results generated from one input sequence.
 class ISequenceResults {
 public:
   virtual ~ISequenceResults();
diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ffeff6a9726c7445db36c7c1bec7c74825884a0
--- /dev/null
+++ b/paddle/api/PaddleAPIPrivate.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/trainer/TrainerConfigHelper.h"
+
+#pragma once
+
+struct GradientMachinePrivate {
+  std::shared_ptr<paddle::GradientMachine> machine;
+
+  template <typename T>
+  inline T& cast(void* ptr) {
+    return *(T*)(ptr);
+  }
+};
+
+struct OptimizationConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> trainer_config;
+  paddle::OptimizationConfig config;
+
+  const paddle::OptimizationConfig& getConfig() {
+    if (trainer_config != nullptr) {
+      return trainer_config->getOptConfig();
+    } else {
+      return config;
+    }
+  }
+};
+
+struct TrainerConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> conf;
+  TrainerConfigPrivate() {}
+};
+
+struct ModelConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> conf;
+};
+
+struct ArgumentsPrivate {
+  std::vector<paddle::Argument> outputs;
+
+  inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
+    if (idx < outputs.size()) {
+      return outputs[idx];
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+
+  template <typename T>
+  std::shared_ptr<T>& cast(void* rawPtr) const {
+    return *(std::shared_ptr<T>*)(rawPtr);
+  }
+};
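
A condensed ordering sketch for the reworked Trainer surface above (Python;
the data-loading helper here is hypothetical, and the full flow is exercised
by the new testTrainer.py later in this patch):

    trainer.startTrain()
    trainer.startTrainPass()
    while True:
        data, at_end = load_batch(batch_size)        # hypothetical helper
        if at_end:
            break
        trainer.trainOneDataBatch(batch_size, data)  # explicit data feeding
    trainer.finishTrainPass()

    trainer.startTestPeriod()
    trainer.testOneDataBatch(batch_size, data)       # same Arguments path
    trainer.finishTestPeriod()
    trainer.finishTrain()
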
*/ - #include "PaddleAPI.h" #include "paddle/parameter/Parameter.h" diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp index e087defc6043c18123909549ed63f630708d48eb..21d031e4bcb897eb693e5cff56bc77a637dc6bd2 100644 --- a/paddle/api/ParameterOptimizer.cpp +++ b/paddle/api/ParameterOptimizer.cpp @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" #include "paddle/parameter/ParameterOptimizer.h" #include "Internal.h" #include @@ -31,17 +31,21 @@ struct ParameterTraverseCallbackPrivate { const paddle::ParameterOptimizer::TraverseCallback& callback) : callback(callback) {} - void apply(const std::vector& vecs, const ParameterConfig& conf, + void apply(const std::vector& vecs, + const ParameterConfig& conf, size_t sparseId) { std::vector real_vecs; real_vecs.resize(vecs.size()); - std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) { - if (v) { - return *(paddle::VectorPtr*)(v->getSharedPtr()); - } else { - return paddle::VectorPtr(); - } - }); + std::transform(vecs.begin(), + vecs.end(), + real_vecs.begin(), + [](Vector* v) { + if (v) { + return *(paddle::VectorPtr*)(v->getSharedPtr()); + } else { + return paddle::VectorPtr(); + } + }); paddle::ParameterConfig& real_conf = *(paddle::ParameterConfig*)(const_cast(conf) @@ -60,10 +64,9 @@ ParameterOptimizer::~ParameterOptimizer() { ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) { CHECK(config != nullptr); - auto opt_config_ptr = (paddle::OptimizationConfig*)config->getRawPtr(); auto retOptimizer = new ParameterOptimizer(); retOptimizer->m->optimizer.reset( - paddle::ParameterOptimizer::create(*opt_config_ptr, false)); + paddle::ParameterOptimizer::create(config->m->getConfig(), false)); return retOptimizer; } @@ -86,10 +89,12 @@ void ParameterOptimizer::startBatch(size_t numSamplesProcessed) { void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); } void ParameterOptimizer::update(const std::vector& vecs, - const ParameterConfig& conf, size_t sparseId) { - ParameterTraverseCallbackPrivate invoker([&]( - const paddle::VectorPtr _vecs[], const paddle::ParameterConfig& config, - size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); }); + const ParameterConfig& conf, + size_t sparseId) { + ParameterTraverseCallbackPrivate invoker( + [&](const paddle::VectorPtr _vecs[], + const paddle::ParameterConfig& config, + size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); }); invoker.apply(vecs, conf, sparseId); } @@ -116,8 +121,9 @@ void ParameterTraverseCallback::apply(const std::vector& vecs, ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal( const ParameterConfig& config) const { - auto& param_config = *(paddle::ParameterConfig*)const_cast( - config).getRawPtr(); + auto& param_config = + *(paddle::ParameterConfig*)const_cast(config) + .getRawPtr(); auto callback = m->optimizer->needSpecialTraversal(param_config); if (callback) { auto retCallback = new ParameterTraverseCallback(); diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp index 9d353ccc8e281e72a207ba19f45517fd256d6df2..d51be78d45902967107f4bf0af995958faed931a 100644 --- a/paddle/api/SequenceGenerator.cpp +++ b/paddle/api/SequenceGenerator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 9d353ccc8e281e72a207ba19f45517fd256d6df2..d51be78d45902967107f4bf0af995958faed931a 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "PaddleAPI.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/parameter/Argument.h"
@@ -42,8 +41,10 @@ struct Path {
 // position
 static void findNBest(paddle::GradientMachine* gradMachine,
                       std::vector<paddle::Argument>& inArgs,
-                      std::vector<Path>& finalPaths, size_t bos_id,
-                      size_t eos_id, size_t max_length) {
+                      std::vector<Path>& finalPaths,
+                      size_t bos_id,
+                      size_t eos_id,
+                      size_t max_length) {
   std::vector<Path> paths;
   Path emptyPath;
   paths.push_back(emptyPath);
@@ -166,7 +167,8 @@ public:
     if (id < getSize()) {
       Path& p = (*path_)[id];
       std::ostringstream sout;
-      std::transform(p.ids.begin(), p.ids.end(),
+      std::transform(p.ids.begin(),
+                     p.ids.end(),
                      std::ostream_iterator<std::string>(sout, split ? " " : ""),
                      [&](int id) { return (*dict_)[id]; });
       return sout.str();
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index 95b578c8db9fdc12707c4dd7aac5a403862b47d8..7a6aa69fb652313748b1fa787847ffd74fda7a22 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"

 #include <stdlib.h>
 #include <memory>
@@ -30,31 +31,17 @@ P_DECLARE_string(config);
 P_DECLARE_string(init_model_path);
 P_DECLARE_int32(start_pass);

-struct TrainPassContext {
-  int64_t batchId;
-  int32_t batchSize;
-  real avgTestCost;
-  int64_t numAvgTests;
-  int passInnerId;
-  paddle::DataBatch data;
-  std::vector<paddle::Argument> forwardOutput;
-};
-
 struct TrainerPrivate : public paddle::Trainer {
-  void startTrain();
-  void finishTrain();
-
-  void startTrainPass();
-  void finishTrainPass();
-
-  bool _trainOneBatch();
-
-  bool _prepareBatchData();
-  void _forwardOneBatch() throw(UnsupportError);
-
+  bool _trainOneBatch(size_t batchSize);
+  bool forwardOneBatch(size_t batchSize);
+  void forwardOneDataBatch(const std::vector<paddle::Argument>& inArgs);
+  void setBatchSize(size_t batchSize);
+  std::vector<paddle::Argument>& getForwardOutput();
+
+  void startTestPeriod();
+  void finishTestPeriod();
+  void testOneDataBatch(const paddle::DataBatch& dataBatch);
   TrainerPrivate() : paddle::Trainer() {}
-
-  TrainPassContext trainPassContext;
 };

 Trainer::Trainer() : m(new TrainerPrivate()) {
@@ -75,109 +62,115 @@ Trainer* Trainer::createByCommandLine() throw(IOError) {
   }
 }

-void Trainer::startTrain() { m->startTrain(); }
+Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
+    : m(new TrainerPrivate()) {
+  m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
+}

-void TrainerPrivate::startTrain() {
-  srand(this->config_->getConfig().start_pass() + 1);
-  this->dataProvider_->reset();
-  this->trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
+Trainer* Trainer::create(TrainerConfig* config,
+                         GradientMachine* gm) throw(IOError) {
+  auto retv = new Trainer(config, gm);
+  if (retv->m->getConfig().IsInitialized()) {
+    return retv;
+  } else {
+    retv->m->getConfig().CheckInitialized();
+    throw IOError();
+  }
 }

-void Trainer::finishTrain() { m->finishTrain(); }
+void Trainer::startTrain() { m->startTrain(); }

-void TrainerPrivate::finishTrain() {
-  this->trainerInternal_.getGradientMachine()->finish();
-}
+void Trainer::finishTrain() { m->finishTrain(); }

 void Trainer::startTrainPass() { m->startTrainPass(); }

-void TrainerPrivate::startTrainPass() {
-  this->stats_.reset();
-  this->trainPassContext.batchId = 0;
-  this->trainPassContext.batchSize = this->config_->getOptConfig().batch_size();
-  this->trainPassContext.avgTestCost = 0;
-  this->trainPassContext.numAvgTests = 0;
-  this->trainPassContext.passInnerId = 0;
-  this->trainerInternal_.getParameterUpdater()->startPass();
-  this->evaluator_->start();
-}
-
 void Trainer::finishTrainPass() { m->finishTrainPass(); }

-void TrainerPrivate::finishTrainPass() {
-  this->trainerInternal_.getGradientMachine()->onPassEnd();
-  this->trainerInternal_.getParameterUpdater()->finishPass();
-  evaluator_->finish();
+void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) {
+  paddle::DataBatch dataBatch;
+  dataBatch.getStreams() = inArgs.m->outputs;
+  dataBatch.setSize(batchSize);
+  m->trainOneDataBatch(dataBatch);
 }

-void Trainer::setBatchSize(size_t batchSize) {
-  this->m->trainPassContext.batchSize = batchSize;
+bool Trainer::trainOneBatch(size_t batchSize) {
+  return m->_trainOneBatch(batchSize);
 }

-bool Trainer::trainOneBatch(size_t batchSize) {
-  if (batchSize == -1UL) {
-    this->setBatchSize(batchSize);
+bool TrainerPrivate::_trainOneBatch(size_t batchSize) {
+  paddle::DataBatch dataBatch;
+  CHECK(dataProvider_) << "data_provider is not specified";
+  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+  if (num == 0) {
+    return false;
   }
-  return m->_trainOneBatch();
+  trainOneDataBatch(dataBatch);
+  return false;
 }

-bool TrainerPrivate::_trainOneBatch() {
-  if (this->_prepareBatchData()) {
-    return true;
+void TrainerPrivate::startTestPeriod() {
+  if (!tester_) {
+    createTester();
   }
-  this->trainerInternal_.trainOneBatch(this->trainPassContext.batchId,
-                                       this->trainPassContext.data);
-  return false;
+  tester_->startTestPeriod();
+}
+
+void Trainer::startTestPeriod() { m->startTestPeriod(); }
+
+void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) {
+  tester_->testOneDataBatch(dataBatch, &forwardOutput_);
 }

+void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
+  paddle::DataBatch dataBatch;
+  dataBatch.getStreams() = args.m->outputs;
+  dataBatch.setSize(batchSize);
+  m->testOneDataBatch(dataBatch);
+}
+
+void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
+void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
+
 Matrix* Trainer::getLayerOutput(const std::string& layerName) {
   auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
-      this->m->getGradientMachine());
+          this->m->getGradientMachine());
   CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
   auto m = nn->getLayerOutput(layerName);
   return Matrix::createByPaddleMatrixPtr(&m);
 }
-bool Trainer::prepareBatchData(size_t batchSize) {
-  if (batchSize != -1UL) {
-    this->setBatchSize(batchSize);
-  }
-  return this->m->_prepareBatchData();
-}
-
-bool TrainerPrivate::_prepareBatchData() {
-  int num = dataProvider_->getNextBatch(this->trainPassContext.batchSize,
-                                        &this->trainPassContext.data);
-  return num == 0;
+void Trainer::forwardOneBatch(size_t batchSize) {
+  m->forwardOneBatch(batchSize);
 }

-void Trainer::finishTrainOneBatch() { ++m->trainPassContext.batchId; }
-
-void Trainer::forwardOneBatch() throw(UnsupportError) { m->_forwardOneBatch(); }
-
-void TrainerPrivate::_forwardOneBatch() throw(UnsupportError) {
-  auto& dataBatch = this->trainPassContext.data;
-
-  int64_t actualBatchSize = dataBatch.getSize();
-  if (actualBatchSize == 0) {
-    return;
+bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
+  CHECK(dataProvider_) << "data_provider is not specified";
+  paddle::DataBatch dataBatch;
+  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+  if (num == 0) {
+    return false;
   }
-  const std::vector<paddle::Argument>& inArgs = dataBatch.getStreams();
-  std::vector<paddle::Argument>& outArgs = this->trainPassContext.forwardOutput;
-  outArgs.clear();
-  paddle::PassType passType =
-      this->trainerInternal_.getParameterUpdater()->startBatch(actualBatchSize);
+  forwardOneDataBatch(dataBatch.getStreams());
+  return true;
+}
+
+void TrainerPrivate::forwardOneDataBatch(
+    const std::vector<paddle::Argument>& inArgs) {
+  std::vector<paddle::Argument>& outArgs = forwardOutput_;

   if (config_->getOptConfig().use_sparse_remote_updater()) {
-    this->trainerInternal_.getGradientMachine()->prefetch(inArgs);
-    this->trainerInternal_.getParameterUpdater()->getParametersRemote();
+    trainerInternal_.getGradientMachine()->prefetch(inArgs);
+    trainerInternal_.getParameterUpdater()->getParametersRemote();
   }
-  this->trainerInternal_.getGradientMachine()->forward(
-      inArgs, &outArgs, passType);
+  trainerInternal_.getGradientMachine()->forward(
+      inArgs, &outArgs, paddle::PASS_TEST);
+}
+
+Arguments* Trainer::getForwardOutput() {
+  return Arguments::createByPaddleArgumentVector(&m->getForwardOutput());
 }

-Arguments* Trainer::getNetworkOutput() {
-  return Arguments::createByPaddleArgumentVector(
-      &m->trainPassContext.forwardOutput);
+std::vector<paddle::Argument>& TrainerPrivate::getForwardOutput() {
+  return forwardOutput_;
 }
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index 8a6741078f2f19d8c3cb081f129447d6fc5801c9..1bba1df2e1c0a2d3cd2d8307ed3a0d784bb949b4 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -37,10 +37,16 @@ FloatArray::FloatArray(const float* b, const size_t l)
 IntArray::IntArray(const int* b, const size_t l, bool f)
     : buf(b), length(l), needFree(f) {}

-IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l,
+IntWithFloatArray::IntWithFloatArray(const float* v,
+                                     const int* i,
+                                     size_t l,
                                      bool f)
     : valBuf(v), idxBuf(i), length(l), needFree(f) {}

+bool isUsingGpu() { return FLAGS_use_gpu; }
+
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
+
 bool isGpuVersion() {
 #ifdef PADDLE_ONLY_CPU
   return false;
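
The new isUsingGpu()/setUseGpu() pair simply wraps the use_gpu gflag, which is
what the useGpu = isUsingGpu() default arguments elsewhere in this patch pick
up. A short sketch of the intended gating pattern (mirroring the updated
tests; the test entry points are hypothetical):

    swig_paddle.initPaddle("--use_gpu=0")
    run_cpu_tests()                      # hypothetical helper
    if swig_paddle.isGpuVersion():
        swig_paddle.setUseGpu(True)      # flips the default device for
        run_gpu_tests()                  # every subsequent create* call
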
*/ - #include "PaddleAPI.h" #include "paddle/math/Vector.h" @@ -39,6 +38,21 @@ IVector* IVector::create(const std::vector& data, bool useGpu) { return v; } +IVector* IVector::createVectorFromNumpy(int* data, + int dim, + bool copy, + bool useGpu) throw(UnsupportError) { + if (useGpu) { + /// if use gpu only copy=true is supported + if (!copy) { + throw UnsupportError("Gpu mode only supports copy=True"); + } + return IVector::createGpuVectorFromNumpy(data, dim); + } else { + return IVector::createCpuVectorFromNumpy(data, dim, copy); + } +} + IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) { auto v = new IVector(); if (copy) { @@ -50,7 +64,7 @@ IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) { return v; } -IVector* IVector::createGpuVectorFromNumy(int* data, int dim) { +IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) { auto v = new IVector(); v->m->vec = paddle::IVector::create(dim, true); v->m->vec->copyFrom(data, dim); @@ -124,8 +138,8 @@ void IVector::copyToNumpyArray(int** view_m_data, int* dim1) { if (auto cpuVec = dynamic_cast(m->vec.get())) { std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1)); } else if (auto gpuVec = dynamic_cast(m->vec.get())) { - hl_memcpy_device2host(*view_m_data, gpuVec->getData(), - sizeof(int) * (*dim1)); + hl_memcpy_device2host( + *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1)); } else { LOG(INFO) << "Unexpected situation"; } @@ -188,12 +202,27 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) { } } +Vector* Vector::createVectorFromNumpy(float* data, + int dim, + bool copy, + bool useGpu) throw(UnsupportError) { + if (useGpu) { + /// if use gpu only copy=True is supported + if (!copy) { + throw UnsupportError("Gpu mode only supports copy=True"); + } + return Vector::createGpuVectorFromNumpy(data, dim); + } else { + return Vector::createCpuVectorFromNumpy(data, dim, copy); + } +} + Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) { CHECK_GT(dim, 0); auto retVec = new Vector(); if (copy) { retVec->m->vec = paddle::Vector::create((size_t)dim, false); - return retVec; + retVec->m->vec->copyFrom(data, dim); } else { retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false); } @@ -225,8 +254,8 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) { if (auto cpuVec = dynamic_cast(m->vec.get())) { std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1)); } else if (auto gpuVec = dynamic_cast(m->vec.get())) { - hl_memcpy_device2host(*view_m_data, gpuVec->getData(), - sizeof(float) * (*dim1)); + hl_memcpy_device2host( + *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1)); } else { LOG(INFO) << "Unexpected situation"; } @@ -237,6 +266,21 @@ void Vector::copyFromNumpyArray(float* data, int dim) { m->vec->copyFrom(data, dim); } +FloatArray Vector::getData() const { + if (this->isGpu()) { + float* src = m->vec->getData(); + size_t len = m->vec->getSize(); + float* dest = new float[len]; + hl_memcpy_device2host(dest, src, len * sizeof(float)); + FloatArray ret_val(dest, len); + ret_val.needFree = true; + return ret_val; + } else { + FloatArray ret_val(m->vec->getData(), m->vec->getSize()); + return ret_val; + } +} + bool Vector::isGpu() const { return std::dynamic_pointer_cast(m->vec) != nullptr; } diff --git a/paddle/api/__init__.py b/paddle/api/__init__.py index 7f9e87eee6037666b86420fba194624859d356b3..c90af2ee000d46a032984ee23559e7e99b49ddad 100644 --- a/paddle/api/__init__.py +++ b/paddle/api/__init__.py @@ -11,4 
@@ -11,4 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
index 6531e5ccb3dba39315c7e35191ea1bdf0504d220..a2352250c31efa7ee3c4c8338d95dce5a5b9a511 100644
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -1,6 +1,7 @@
 PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
 WITH_GPU="@WITH_GPU@"
 PROTOBUF_LIB="@PROTOBUF_LIBRARY@"
+ZLIB_LIB="@ZLIB_LIBRARIES@"
 CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
 CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"

@@ -15,3 +16,4 @@ GFLAGS_LOCATION="@GFLAGS_LOCATION@"
 CBLAS_LIBRARIES="@CBLAS_LIBS@"

 CUDA_LIBRARIES="@CUDA_LIBRARIES@"
+WITH_COVERALLS="@ON_COVERALLS@"
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
index bc1afc5898e829bc271b62b702b3743bf7eb782b..ebe00798e8b7169ecbbef53e287ab4b78334bcf9 100644
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -29,7 +29,10 @@ try:
     whole_start = ""
     whole_end = ""

-    LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"]
+    LIB_DIRS = [
+        "math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver",
+        "trainer"
+    ]
     PARENT_LIB_DIRS = ['proto']

     class PaddleLDFlag(object):
@@ -38,6 +41,7 @@ try:
             self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
             self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
             self.protolib = PROTOBUF_LIB
+            self.zlib = ZLIB_LIB
             self.thread = CMAKE_THREAD_LIB
             self.dl_libs = CMAKE_DL_LIBS
             self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
@@ -47,25 +51,27 @@ try:

             self.glog_libs = LIBGLOG_LIBRARY
             self.with_gflags = PaddleLDFlag.cmake_bool(WITH_GFLAGS)
+            self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
             self.gflags_libs = GFLAGS_LIBRARIES
             self.gflags_location = GFLAGS_LOCATION
             self.cblas_libs = CBLAS_LIBRARIES
             self.curt = CUDA_LIBRARIES

         def ldflag_str(self):
-            return " ".join([self.libs_dir_str(),
-                             self.parent_dir_str(),
-                             self.libs_str()])
+            return " ".join(
+                [self.libs_dir_str(), self.parent_dir_str(), self.libs_str()])

         def libs_dir_str(self):
             libdirs = LIB_DIRS
-            return " ".join(map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x),
-                            libdirs))
+            return " ".join(
+                map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x),
+                    libdirs))

         def parent_dir_str(self):
             libdirs = PARENT_LIB_DIRS
-            return " ".join(map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x),
-                            libdirs))
+            return " ".join(
+                map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x),
+                    libdirs))

         def libs_str(self):
             libs = [
@@ -82,6 +88,7 @@ try:
                 "-lpaddle_cuda",
                 "-lpaddle_api",
                 self.normalize_flag(self.protolib),
+                self.normalize_flag(self.zlib),
                 self.normalize_flag(self.thread),
                 self.normalize_flag(self.dl_libs),
                 self.normalize_flag(self.cblas_libs),
@@ -95,6 +102,8 @@ try:
                 libs.append(self.normalize_flag(self.gflags_libs))
             if self.with_gpu:
                 libs.append(self.normalize_flag(self.curt))
+            if self.with_coverage:
+                libs.append("-fprofile-arcs")
             return " ".join(filter(lambda l: len(l) != 0, libs))

         def normalize_flag(self, cmake_flag):
@@ -108,10 +117,10 @@ try:
                 return cmake_flag
             elif cmake_flag.startswith("-l"):  # normal link command
                 return cmake_flag
-            elif cmake_flag in ["gflags-shared",
-                                "gflags-static",
-                                "gflags_nothreads-shared",
-                                "gflags_nothreads-static"]: # special for gflags
+            elif cmake_flag in [
+                    "gflags-shared", "gflags-static", "gflags_nothreads-shared",
"gflags_nothreads-static" + ]: # special for gflags assert PaddleLDFlag.cmake_bool(self.gflags_location) return self.gflags_location elif len(cmake_flag) != 0: @@ -127,12 +136,22 @@ try: :type cmake_str: str :rtype: bool """ - if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith("-NOTFOUND"): + if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith( + "-NOTFOUND"): return False else: return True + def c_flag(self): + if self.with_coverage: + return ["-fprofile-arcs", "-ftest-coverage", "-O0", "-g"] + else: + return None except ImportError: + class PaddleLDFlag(object): def ldflag_str(self): pass + + def c_flag(self): + pass diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt index c4c26e6c03fdff51696f75f4d6a522cff60e7cca..08a0fe96a004d38b81d0bac881da1faeb52685f4 100644 --- a/paddle/api/test/CMakeLists.txt +++ b/paddle/api/test/CMakeLists.txt @@ -1,2 +1,2 @@ add_test(NAME test_swig_api - COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh) \ No newline at end of file + COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh) diff --git a/paddle/api/test/run_tests.sh b/paddle/api/test/run_tests.sh index 1fc6fd5a8c185301612655d9971082203fe647dc..a4814f98f89c2e24195074369bc897b8b4bd2d9b 100755 --- a/paddle/api/test/run_tests.sh +++ b/paddle/api/test/run_tests.sh @@ -30,7 +30,7 @@ source .test_env/bin/activate pip --timeout 600 install ../../dist/*.whl -test_list="testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py" +test_list="testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py" export PYTHONPATH=$PWD/../../../python/ diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py index daedd2409effccba27ff6818fc2603d3e1665bde..70fb169fd5c43d5768e67ad8e4c62a9f4d302eaf 100644 --- a/paddle/api/test/testArguments.py +++ b/paddle/api/test/testArguments.py @@ -32,7 +32,7 @@ class TestArguments(unittest.TestCase): iv = args.getSlotIds(0) assert isinstance(iv, swig_paddle.IVector) np_arr = iv.toNumpyArrayInplace() - self.assertEqual(np_arr.shape, (6,)) + self.assertEqual(np_arr.shape, (6, )) if __name__ == '__main__': diff --git a/paddle/api/test/testGradientMachine.py b/paddle/api/test/testGradientMachine.py index 59b36a012a239730a1d0a5b239a3ba69f0cee1fb..e12613fbb8a66545dd3ad20d59b0b951e86e8683 100644 --- a/paddle/api/test/testGradientMachine.py +++ b/paddle/api/test/testGradientMachine.py @@ -30,8 +30,8 @@ class TestGradientMachine(unittest.TestCase): self.assertIsNotNone(model_config) machine = swig_paddle.GradientMachine.createByModelConfig( model_config, swig_paddle.CREATE_MODE_NORMAL, - swig_paddle.ParameterOptimizer.create( - opt_config).getParameterTypes()) + swig_paddle.ParameterOptimizer.create(opt_config).getParameterTypes( + )) self.assertIsNotNone(machine) ipt, _ = util.loadMNISTTrainData() output = swig_paddle.Arguments.createArguments(0) @@ -43,7 +43,7 @@ class TestGradientMachine(unittest.TestCase): assert isinstance(param, swig_paddle.Parameter) val = param.getBuf(swig_paddle.PARAMETER_VALUE) assert isinstance(val, swig_paddle.Vector) - arr = numpy.full((len(val),), 0.1, dtype="float32") + arr = numpy.full((len(val), ), 0.1, dtype="float32") val.copyFromNumpyArray(arr) param_config = param.getConfig().toProto() assert isinstance(param_config, diff --git a/paddle/api/test/testMatrix.py b/paddle/api/test/testMatrix.py index 2216ef30a58b0d97bba210bf0edee02a18264076..0432345edd659f13bddb1b99f62622c5ea64a4cb 100644 --- a/paddle/api/test/testMatrix.py +++ 
+++ b/paddle/api/test/testMatrix.py
@@ -42,7 +42,7 @@ class TestMatrix(unittest.TestCase):
         self.assertEqual(m.getSparseRowCols(2), [])

     def test_sparse_value(self):
-        m = swig_paddle.Matrix.createSparse(3, 3, 6, False)
+        m = swig_paddle.Matrix.createSparse(3, 3, 6, False, False, False)
         self.assertIsNotNone(m)
         m.sparseCopyFrom([0, 2, 3, 3], [0, 1, 2], [7.3, 4.2, 3.2])

@@ -66,10 +66,11 @@ class TestMatrix(unittest.TestCase):
         self.assertIsNotNone(m)
         self.assertTrue(abs(m.get(1, 1) - 0.5) < 1e-5)

-    def test_numpy(self):
+    def test_numpyCpu(self):
         numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
         m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat)
-        self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
+        self.assertEqual((int(m.getHeight()), int(m.getWidth())),
+                         numpy_mat.shape)

         # the numpy matrix and paddle matrix shared the same memory.
         numpy_mat[0, 1] = 342.23

@@ -99,8 +100,20 @@ class TestMatrix(unittest.TestCase):
         for a, e in zip(gpu_m.getData(), [1.0, 3.23, 3.0, 4.0, 5.0, 6.0]):
             self.assertAlmostEqual(a, e)

+    def test_numpy(self):
+        numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
+        m = swig_paddle.Matrix.createDenseFromNumpy(numpy_mat)
+        self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
+        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
+        for a, e in zip(m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
+            self.assertAlmostEqual(a, e)

 if __name__ == "__main__":
     swig_paddle.initPaddle("--use_gpu=0")
-    unittest.main()
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestMatrix)
+    unittest.TextTestRunner().run(suite)
+    if swig_paddle.isGpuVersion():
+        swig_paddle.setUseGpu(True)
+        unittest.main()
diff --git a/paddle/api/test/testTrain.py b/paddle/api/test/testTrain.py
index 7f79c2701e9ed2e8c618be076d684c7793a8ad42..a3ba4eaaa69b39b75e7ece3095b6f236c1248d41 100644
--- a/paddle/api/test/testTrain.py
+++ b/paddle/api/test/testTrain.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from py_paddle import swig_paddle, DataProviderWrapperConverter
+from py_paddle import swig_paddle
 import paddle.trainer.config_parser
-from paddle.trainer.PyDataProviderWrapper import DenseSlot, IndexSlot
 import numpy
 import util

@@ -99,7 +98,8 @@ def main():
         cost_vec = outArgs.getSlotValue(0)
         assert isinstance(cost_vec, swig_paddle.Matrix)
         cost_vec = cost_vec.copyToNumpyMat()
-        print 'Finish Batch', batch_id, 'with cost ', cost_vec.sum() / batch_size
+        print 'Finish Batch', batch_id, 'with cost ', cost_vec.sum(
+        ) / batch_size
         batch_id += 1

     for optimizer in optimizers:
diff --git a/paddle/api/test/testTrainConfig.py b/paddle/api/test/testTrainConfig.py
index 22148e31915da0c21609fe0694274cfaee4b3950..77e0cd37d566d2571fada76b9948a9b0616ad044 100644
--- a/paddle/api/test/testTrainConfig.py
+++ b/paddle/api/test/testTrainConfig.py
@@ -1,9 +1,6 @@
 from paddle.trainer_config_helpers import *

-settings(
-    batch_size=100,
-    learning_method=AdamOptimizer()
-)
+settings(batch_size=100, learning_method=AdamOptimizer())

 din = data_layer(name='input', size=784)
diff --git a/paddle/api/test/testTrainer.py b/paddle/api/test/testTrainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..edd5a2da5785c405b46c2559ee93837ac68d7c3a
--- /dev/null
+++ b/paddle/api/test/testTrainer.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.config_parser import parse_config
+from paddle.trainer.config_parser import logger
+from py_paddle import swig_paddle
+import util
+
+
+def main():
+    trainer_config = parse_config("./testTrainConfig.py", "")
+    model = swig_paddle.GradientMachine.createFromConfigProto(
+        trainer_config.model_config)
+    trainer = swig_paddle.Trainer.create(trainer_config, model)
+    trainer.startTrain()
+    for train_pass in xrange(2):
+        trainer.startTrainPass()
+        num = 0
+        cost = 0
+        while True:  # Train one batch
+            batch_size = 1000
+            data, atEnd = util.loadMNISTTrainData(batch_size)
+            if atEnd:
+                break
+            trainer.trainOneDataBatch(batch_size, data)
+            outs = trainer.getForwardOutput()
+            cost += sum(outs[0]['value'])
+            num += batch_size
+        trainer.finishTrainPass()
+        logger.info('train cost=%f' % (cost / num))
+
+        trainer.startTestPeriod()
+        num = 0
+        cost = 0
+        while True:  # Test one batch
+            batch_size = 1000
+            data, atEnd = util.loadMNISTTrainData(batch_size)
+            if atEnd:
+                break
+            trainer.testOneDataBatch(batch_size, data)
+            outs = trainer.getForwardOutput()
+            cost += sum(outs[0]['value'])
+            num += batch_size
+        trainer.finishTestPeriod()
+        logger.info('test cost=%f' % (cost / num))
+
+    trainer.finishTrain()
+
+
+if __name__ == '__main__':
+    swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1")
+    main()
diff --git a/paddle/api/test/testVector.py b/paddle/api/test/testVector.py
index f5b5d0e32e4208e7becb9755d1aed131f52ff146..48aaa1d73da9e6c207ad5fa2be14a531267bd901 100644
--- a/paddle/api/test/testVector.py
+++ b/paddle/api/test/testVector.py
@@ -20,20 +20,28 @@ import unittest

 class TestIVector(unittest.TestCase):
     def test_createZero(self):
-        m = swig_paddle.IVector.createZero(10)
+        m = swig_paddle.IVector.createZero(10, False)
         self.assertIsNotNone(m)
         for i in xrange(10):
             self.assertEqual(m[i], 0)
             m[i] = i
             self.assertEqual(m[i], i)
+
+        m = swig_paddle.IVector.createZero(10)
+        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
+        self.assertEqual(m.getData(), [0]*10)

     def test_create(self):
-        m = swig_paddle.IVector.create(range(10))
+        m = swig_paddle.IVector.create(range(10), False)
         self.assertIsNotNone(m)
         for i in xrange(10):
             self.assertEqual(m[i], i)
+
+        m = swig_paddle.IVector.create(range(10))
+        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
+        self.assertEqual(m.getData(), range(10))

-    def test_numpy(self):
+    def test_cpu_numpy(self):
         vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
         iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec)
         self.assertEqual(vec.shape[0], int(iv.__len__()))
@@ -61,25 +69,43 @@ class TestIVector(unittest.TestCase):
         expect_vec = range(0, 10)
         expect_vec[4] = 7
         self.assertEqual(vec.getData(), expect_vec)
+
+    def test_numpy(self):
+        vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
+        iv = swig_paddle.IVector.createVectorFromNumpy(vec)
+        self.assertEqual(iv.isGpu(), swig_paddle.isUsingGpu())
+        self.assertEqual(iv.getData(), list(vec))

 class TestVector(unittest.TestCase):
     def testCreateZero(self):
-        v = swig_paddle.Vector.createZero(10)
+        v = swig_paddle.Vector.createZero(10, False)
         self.assertIsNotNone(v)
         for i in xrange(len(v)):
             self.assertTrue(util.doubleEqual(v[i], 0))
             v[i] = i
             self.assertTrue(util.doubleEqual(v[i], i))
+
+        v = swig_paddle.Vector.createZero(10)
+        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
+        self.assertEqual(v.getData(), [0]*10)

     def testCreate(self):
-        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)])
+        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)], False)
         self.assertIsNotNone(v)
         for i in xrange(len(v)):
             self.assertTrue(util.doubleEqual(v[i], i / 100.0))
         self.assertEqual(100, len(v))
+
+        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)])
+        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
+        self.assertEqual(100, len(v))
+        vdata = v.getData()
+        for i in xrange(len(v)):
+            self.assertTrue(util.doubleEqual(vdata[i], i / 100.0))

-    def testNumpy(self):
+    def testCpuNumpy(self):
         numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
         vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr)
         assert isinstance(vec, swig_paddle.Vector)
@@ -102,9 +128,18 @@ class TestVector(unittest.TestCase):
         for i in xrange(1, len(numpy_3)):
             util.doubleEqual(numpy_3[i], vec[i])

+    def testNumpy(self):
+        numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
+        vec = swig_paddle.Vector.createVectorFromNumpy(numpy_arr)
+        self.assertEqual(vec.isGpu(), swig_paddle.isUsingGpu())
+        vecData = vec.getData()
+        for n, v in zip(numpy_arr, vecData):
+            self.assertTrue(util.doubleEqual(n, v))
+
     def testCopyFromNumpy(self):
-        vec = swig_paddle.Vector.createZero(1)
+        vec = swig_paddle.Vector.createZero(1, False)
         arr = np.array([1.3, 3.2, 2.4], dtype="float32")
         vec.copyFromNumpyArray(arr)
         for i in xrange(len(vec)):
@@ -112,5 +147,9 @@ class TestVector(unittest.TestCase):

 if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=1" if swig_paddle.isGpuVersion() else "--use_gpu=0")
-    unittest.main()
+    swig_paddle.initPaddle("--use_gpu=0")
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestVector)
+    unittest.TextTestRunner().run(suite)
+    if swig_paddle.isGpuVersion():
+        swig_paddle.setUseGpu(True)
+        unittest.main()
\ No newline at end of file
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index cdb730bb3cec7a32fa42cf4c6738d575b76c6032..11dbfb54b268774405ade1e532bef9a0e8c7ada9 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -81,5 +81,8 @@ else()
     add_library(paddle_cuda ${CUDA_SOURCES})
 endif()

-add_style_check_target(paddle_cuda ${CUDA_SOURCES})
-add_style_check_target(paddle_cuda ${CUDA_HEADERS})
+add_style_check_target(paddle_cuda
+                       ${CUDA_SOURCES}
+                       ${CUDA_HEADERS}
+                       ${CUDA_DSO_SOURCES}
+                       ${CUDA_CXX_WITH_GPU_SOURCES})
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index c8aabc7844cd48d7ebdd0077684f9efa50f941a2..03e15b2223a50625c6999f6b081ae984e76b182b 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #ifndef HL_ACTIVATION_FUNCTIONS_H_
 #define HL_ACTIVATION_FUNCTIONS_H_

@@ -21,11 +20,8 @@ limitations under the License. */

 /**
  * Active functions: sigmoid, relu, tanh and linear.
  */
-#define HPPL_ACTIVE_FUNCTION {hppl::sigmoid, \
-                              hppl::relu, \
-                              hppl::tanh, \
-                              hppl::linear \
-                            }
+#define HPPL_ACTIVE_FUNCTION \
+  { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }

 namespace hppl {

@@ -42,18 +38,18 @@ public:

 #ifdef __NVCC__
 namespace gpu {
-static __device__ Active<real>::forward forward[]  = HPPL_ACTIVE_FUNCTION;
+static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
 }
 #else
 namespace cpu {
-static Active<real>::forward forward[]  = HPPL_ACTIVE_FUNCTION;
+static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
 }

 #ifdef __AVX__
 namespace avx {
-static Active<__m256>::forward forward[]  = HPPL_ACTIVE_FUNCTION;
+static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
 }
 #endif
diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/cuda/include/hl_aggregate.h
index db75809f5de195d41577ed6569e8508f48241b69..a6d9ff8483eee28b2c8a380f0aca097c7662a02e 100644
--- a/paddle/cuda/include/hl_aggregate.h
+++ b/paddle/cuda/include/hl_aggregate.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #ifndef HL_AGGREGATE_H_
 #define HL_AGGREGATE_H_
diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/cuda/include/hl_avx_functions.h
index cf062dd969bf79554e00369367e3b85c2ae7fc0d..ed339e312a7639cf9b78f130a43d67a7446576bb 100644
--- a/paddle/cuda/include/hl_avx_functions.h
+++ b/paddle/cuda/include/hl_avx_functions.h
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #ifndef HL_AVX_FUNCTIONS_H_
 #define HL_AVX_FUNCTIONS_H_

 #include <immintrin.h>

 namespace hppl {
-  __m256 relu(const __m256 a);
-  __m256 sigmoid(const __m256 a);
-  __m256 tanh(const __m256 a);
-  __m256 linear(const __m256 a);
-
-  __m256 relu(const __m256 a, const __m256 b);
-  __m256 sigmoid(const __m256 a, const __m256 b);
-  __m256 tanh(const __m256 a, const __m256 b);
-  __m256 linear(const __m256 a, const __m256 b);
+__m256 relu(const __m256 a);
+__m256 sigmoid(const __m256 a);
+__m256 tanh(const __m256 a);
+__m256 linear(const __m256 a);
+
+__m256 relu(const __m256 a, const __m256 b);
+__m256 sigmoid(const __m256 a, const __m256 b);
+__m256 tanh(const __m256 a, const __m256 b);
+__m256 linear(const __m256 a, const __m256 b);
 }  // namespace hppl

 #endif  // HL_AVX_FUNCTIONS_H_
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 1fe2774cc5a291dbafb61b50d63553b086512e4d..a076952467a5ce10dc1f58007dda2170aa694fbb 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-
 #ifndef HL_BASE_H_
 #define HL_BASE_H_

@@ -33,36 +31,36 @@ limitations under the License. */
 * HPPL_STREAM_DEFAULT is HPPL default stream.
 */
 typedef enum {
-    HPPL_STREAM_DEFAULT = 0,    /* Thread Default Stream*/
-    HPPL_STREAM_1 = 1,
-    HPPL_STREAM_2 = 2,
-    HPPL_STREAM_3 = 3,
-    HPPL_STREAM_4 = 4,
-    HPPL_THREAD_STREAM_1 = 5,
-    HPPL_THREAD_STREAM_2 = 6,
-    HPPL_THREAD_STREAM_3 = 7,
-    HPPL_THREAD_STREAM_4 = 8,
-    HPPL_STREAM_END
+  HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
+  HPPL_STREAM_1 = 1,
+  HPPL_STREAM_2 = 2,
+  HPPL_STREAM_3 = 3,
+  HPPL_STREAM_4 = 4,
+  HPPL_THREAD_STREAM_1 = 5,
+  HPPL_THREAD_STREAM_2 = 6,
+  HPPL_THREAD_STREAM_3 = 7,
+  HPPL_THREAD_STREAM_4 = 8,
+  HPPL_STREAM_END
 } hl_stream_t;

 /**
  * @brief HPPL activation mode.
  */
 typedef enum {
-    HL_ACTIVATION_SIGMOID = 0,
-    HL_ACTIVATION_RELU = 1,
-    HL_ACTIVATION_TANH = 2,
-    HL_ACTIVATION_LINEAR = 3,
-    HL_ACTIVATION_END
+  HL_ACTIVATION_SIGMOID = 0,
+  HL_ACTIVATION_RELU = 1,
+  HL_ACTIVATION_TANH = 2,
+  HL_ACTIVATION_LINEAR = 3,
+  HL_ACTIVATION_END
 } hl_activation_mode_t;

 /**
  * @brief Transpose type.
  */
 typedef enum {
-    HPPL_OP_N = 0,    /* transpose */
-    HPPL_OP_T = 1,    /* non transpose */
-    HPPL_OP_END
+  HPPL_OP_N = 0, /* transpose */
+  HPPL_OP_T = 1, /* non transpose */
+  HPPL_OP_END
 } hl_trans_op_t;

 /**
@@ -148,23 +146,21 @@ typedef struct {
  * @brief Sparse matrix value type.
  */
 typedef enum {
-    HL_NO_VALUE = 0,    /* matrix values only 0 or 1 */
-    HL_FLOAT_VALUE = 1,
-    HL_VALUE_END
+  HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
+  HL_FLOAT_VALUE = 1,
+  HL_VALUE_END
 } hl_matrix_value_t;

-
 /**
  * @brief HPPL matrix format.
  */
 typedef enum {
-    HL_SPARSE_CSR = 0,
-    HL_SPARSE_CSC = 1,
-    HL_SPARSE_END
+  HL_SPARSE_CSR = 0,
+  HL_SPARSE_CSC = 1,
+  HL_SPARSE_END
 } hl_matrix_format_t;

-
-typedef struct _hl_matrix_s * hl_matrix_s;
+typedef struct _hl_matrix_s *hl_matrix_s;

 /**
  * @brief HPPL sparse matrix.
@@ -177,12 +173,12 @@ typedef struct _hl_matrix_s * hl_matrix_s;
  * @param nnz     nonzero values of sparse matrix.
  */
 typedef struct {
-    hl_matrix_s matrix;
-    hl_matrix_format_t format;
-    hl_matrix_value_t type;
-    int rows;
-    int cols;
-    size_t nnz;
+  hl_matrix_s matrix;
+  hl_matrix_format_t format;
+  hl_matrix_value_t type;
+  int rows;
+  int cols;
+  size_t nnz;
 } _hl_sparse_matrix_s, *hl_sparse_matrix_s;

 #ifndef PADDLE_TYPE_DOUBLE
@@ -195,7 +191,7 @@ typedef struct {
 *
 * HL_FLOAT_MIN: 1.17549435e-38F
 */
-#define HL_FLOAT_MAX    3.40282347e+38F
+#define HL_FLOAT_MAX 3.40282347e+38F
 /**
 * if real == double
 *
@@ -203,19 +199,26 @@ typedef struct {
 *
 * HL_FLOAT_MIN: 2.2250738585072014e-308
 */
-#define HL_FLOAT_MIN    1.17549435e-38F
+#define HL_FLOAT_MIN 1.17549435e-38F
 #else
-#define HL_FLOAT_MAX    1.7976931348623157e+308
-#define HL_FLOAT_MIN    2.2250738585072014e-308
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
 #endif

+/**
+ * The maximum input value for exp, used to avoid overflow problem.
+ *
+ * Currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT 40.0
+
 /**
 * @brief DIVUP(x, y) is similar to ceil(x / y).
 * @note  For CUDA, DIVUP will be used to specify
 *        the size of blockDim.
 */
 #ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
 #endif
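
For reference, DIVUP is ordinary ceiling division on positive integers; a
Python equivalent of the launch-sizing arithmetic the note above describes
(the numbers are only illustrative):

    def divup(x, y):
        return (x + y - 1) // y      # same as ceil(x / y) for positive ints

    blocks = divup(1025, 256)        # -> 5 blocks to cover 1025 work items
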
 #ifdef __NVCC__
@@ -224,7 +227,7 @@ typedef struct {
 #include "hl_cuda.h"
 #include "cuda_runtime.h"

-extern __thread bool       g_sync_flag;
+extern __thread bool g_sync_flag;
 extern __thread cudaStream_t default_stream;
 #define STREAM_DEFAULT default_stream

@@ -232,17 +235,15 @@ extern __thread cudaStream_t default_stream;
 * @brief Check cuda kernel execution.
 * @param msg error string
 */
-#define CHECK_SYNC(msg)                                             \
-  if (true == g_sync_flag) {                                        \
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);                     \
-    cudaError_t err                                                 \
-      = (cudaError_t)hl_get_device_last_error();                    \
-    CHECK_EQ(cudaSuccess, err) << "[" << msg << "] "                \
-                               << "CUDA error: "                    \
-                               << hl_get_device_error_string((size_t)err); \
+#define CHECK_SYNC(msg)                                                  \
+  if (true == g_sync_flag) {                                             \
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);                          \
+    cudaError_t err = (cudaError_t)hl_get_device_last_error();           \
+    CHECK_EQ(cudaSuccess, err)                                           \
+        << "[" << msg << "] "                                            \
+        << "CUDA error: " << hl_get_device_error_string((size_t)err);    \
   }

-#endif /* __NVCC__ */
-
-#endif /* HL_BASE_H_ */
+#endif /* __NVCC__ */
+#endif /* HL_BASE_H_ */
diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/cuda/include/hl_batch_transpose.h
index 414c7996acee4ccbe2d7dbd093e25a23119fea3c..f3630e9762508fd39935e62e0007de04f9140fff 100644
--- a/paddle/cuda/include/hl_batch_transpose.h
+++ b/paddle/cuda/include/hl_batch_transpose.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #ifndef HL_BATCH_TRANSPOSE_H_
 #define HL_BATCH_TRANSPOSE_H_

@@ -31,10 +30,7 @@ limitations under the License. */
 *        order. Each batch has height * width data, which are
 *        arranged in height-first (or row-first) manner.
 */
-extern void batchTranspose(const real* input,
-                           real* output,
-                           int width,
-                           int height,
-                           int batchSize);
+extern void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize);

 #endif  // HL_BATCH_TRANSPOSE_H_
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 5d750333e1e35d6097d33d905a02d647c3919eb1..cffaac634f0f64be5ddab961d549ae43775bb7b0 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #ifndef HL_CNN_H_
 #define HL_CNN_H_

@@ -37,15 +36,21 @@ limitations under the License. */
 * @param[in]   alpha
 * @param[in]   beta
 */
-extern void hl_shrink_col2feature(
-    const real * dataCol, size_t channels,
-    size_t height, size_t width,
-    size_t blockH, size_t blockW,
-    size_t strideH, size_t strideW,
-    size_t paddingH, size_t paddingW,
-    size_t outputH, size_t outputW,
-    real* dataIm,
-    real alpha = 1.0f, real beta = 0.0f);
+extern void hl_shrink_col2feature(const real* dataCol,
+                                  size_t channels,
+                                  size_t height,
+                                  size_t width,
+                                  size_t blockH,
+                                  size_t blockW,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real* dataIm,
+                                  real alpha = 1.0f,
+                                  real beta = 0.0f);

 /**
 * @brief   Expand feature to column.
@@ -65,14 +70,19 @@ extern void hl_shrink_col2feature(
 * @param[out]  dataCol     expand data.
 *
 */
-extern void hl_expand_feature2col(
-    const real* dataIm, size_t channels,
-    size_t height, size_t width,
-    size_t blockH, size_t blockW,
-    size_t strideH, size_t strideW,
-    size_t paddingH, size_t paddingW,
-    size_t outputH, size_t outputW,
-    real* dataCol);
+extern void hl_expand_feature2col(const real* dataIm,
+                                  size_t channels,
+                                  size_t height,
+                                  size_t width,
+                                  size_t blockH,
+                                  size_t blockW,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  real* dataCol);

 /**
 * @brief   Maximum pool forward.
@@ -91,16 +101,24 @@
 * @param[in]   paddingH    padding height.
 * @param[in]   paddingW    padding width.
 * @param[out]  tgtData     output data.
+* @param[in]   tgtStride   stride between output data samples.
 *
 */
-extern void hl_maxpool_forward(
-    const int frameCnt, const real* inputData,
-    const int channels,
-    const int height, const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    const int paddingH, const int paddingW, real* tgtData);
+extern void hl_maxpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride);

 /**
 * @brief   Maximum pool backward.
@@ -123,19 +141,28 @@ extern void hl_maxpool_forward(
 * @param[in]   paddingH    padding height.
 * @param[in]   paddingW    padding width.
 * @param[out]  targetGrad  output grad.
+* @param[in]   outStride   stride between output data samples.
 *
 */
-extern void hl_maxpool_backward(
-    const int frameCnt, const real* inputData,
-    const real* outData, const real* outGrad,
-    const int channels, const int height,
-    const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    const int paddingH, const int paddingW,
-    real scaleA, real scaleB,
-    real* targetGrad);
+extern void hl_maxpool_backward(const int frameCnt,
+                                const real* inputData,
+                                const real* outData,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                const int paddingH,
+                                const int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* targetGrad,
+                                const int outStride);

 /**
 * @brief   Averge pool forward.
@@ -154,22 +181,30 @@ extern void hl_maxpool_backward(
 * @param[in]   paddingH    padding height.
 * @param[in]   paddingW    padding width.
 * @param[out]  tgtData     output data.
+* @param[in]   tgtStride   stride between output data samples.
 *
 */
-extern void hl_avgpool_forward(
-    const int frameCnt, const real* inputData,
-    const int channels,
-    const int height, const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    const int paddingH, const int paddingW, real* tgtData);
+extern void hl_avgpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride);

 /**
 * @brief   Maximum pool backward.
 *
 * @param[in]   frameCnt    batch size of input image.
-* @param[in]   outGrad     input data.
+* @param[in]   outGrad     output grad data.
 * @param[in]   channels    number of channel.
 * @param[in]   height      image height.
 * @param[in]   width       image width.
@@ -184,18 +219,26 @@ extern void hl_avgpool_forward(
 * @param[in]   scaleA      scale.
 * @param[in]   scaleB      scale.
 * @param[out]  backGrad    output grad.
+* @param[in]   outStride   stride between output data samples.
 *
 */
-extern void hl_avgpool_backward(
-    const int frameCnt, const real* outGrad,
-    const int channels, const int height,
-    const int width,
-    const int pooledH, const int pooledW,
-    const int sizeX, const int sizeY,
-    const int strideH, const int strideW,
-    int paddingH, int paddingW,
-    real scaleA, real scaleB,
-    real* backGrad);
+extern void hl_avgpool_backward(const int frameCnt,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                int paddingH,
+                                int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* backGrad,
+                                const int outStride);

 /**
 * @brief   Cross-map-respose normalize forward.
@@ -212,10 +255,16 @@ extern void hl_avgpool_backward(
 * @param[in]   beta     scale.
 *
 */
-extern void hl_CMRNorm_forward(
-    size_t frameCnt, const real* in, real* scale, real* out,
-    size_t channels, size_t height, size_t width, size_t sizeX,
-    real alpha, real beta);
+extern void hl_CMRNorm_forward(size_t frameCnt,
+                               const real* in,
+                               real* scale,
+                               real* out,
+                               size_t channels,
+                               size_t height,
+                               size_t width,
+                               size_t sizeX,
+                               real alpha,
+                               real beta);

 /**
 * @brief   Cross-map-respose normalize backward.
@@ -234,10 +283,119 @@ extern void hl_CMRNorm_forward(
 * @param[in]   beta     scale.
 *
 */
-extern void hl_CMRNorm_backward(
-    size_t frameCnt, const real* inV, const real* scale,
-    const real* outV, const real* outDiff, real *inDiff,
-    size_t channels, size_t height, size_t width, size_t sizeX,
-    real alpha, real beta);
+extern void hl_CMRNorm_backward(size_t frameCnt,
+                                const real* inV,
+                                const real* scale,
+                                const real* outV,
+                                const real* outDiff,
+                                real* inDiff,
+                                size_t channels,
+                                size_t height,
+                                size_t width,
+                                size_t sizeX,
+                                real alpha,
+                                real beta);
+
+/**
+ * @brief   Bilinear interpolation forward.
+ *
+ * @param[in]   inData      input value.
+ * @param[in]   inImgH      input image height.
+ * @param[in]   inImgW      input image width.
+ * @param[in]   inputH      input batchSize.
+ * @param[in]   inputW      input image data dim.
+ * @param[out]  outData     output value.
+ * @param[in]   outImgH     output image height.
+ * @param[in]   outImgW     output image width.
+ * @param[in]   outputH     output batchSize.
+ * @param[in]   outputW     output image data dim.
+ * @param[in]   numChannels number of channels.
+ * @param[in]   ratioH      inImgH / outImgH.
+ * @param[in]   ratioW      inImgW / outImgW.
+ *
+ */
+extern void hl_bilinear_forward(const real* inData,
+                                const size_t inImgH,
+                                const size_t inImgW,
+                                const size_t inputH,
+                                const size_t inputW,
+                                real* outData,
+                                const size_t outImgH,
+                                const size_t outImgW,
+                                const size_t outputH,
+                                const size_t outputW,
+                                const size_t numChannels,
+                                const real ratioH,
+                                const real ratioW);
+
+/**
+* @brief   Bilinear interpolation backward.
+*
+* @param[out]  inGrad      input gradient.
+* @param[in]   inImgH      input image height.
+* @param[in]   inImgW      input image width.
+* @param[in]   inputH      input batchSize.
+* @param[in]   inputW      input image data dim.
+* @param[in]   outGrad     output gradient.
+* @param[in]   outImgH     output image height.
+* @param[in]   outImgW     output image width.
+* @param[in]   outputH     output batchSize.
+* @param[in]   outputW     output image data dim.
+* @param[in]   numChannels number of channels.
+* @param[in]   ratioH      inImgH / outImgH.
+* @param[in]   ratioW      inImgW / outImgW.
+*
+*/
+extern void hl_bilinear_backward(real* inGrad,
+                                 const size_t inImgH,
+                                 const size_t inImgW,
+                                 const size_t inputH,
+                                 const size_t inputW,
+                                 const real* outGrad,
+                                 const size_t outImgH,
+                                 const size_t outImgW,
+                                 const size_t outputH,
+                                 const size_t outputW,
+                                 const size_t numChannels,
+                                 const real ratioH,
+                                 const real ratioW);
+
+/**
+ * @brief  MaxOut forward.
+ *
+ * @param[in]   inData    input data.
+ * @param[out]  outData   output data.
+ * @param[out]  idData    output maxId.
+ * @param[in]   batchSize batchSize.
+ * @param[in]   size      number of channels * image height * image width.
+ * @param[in]   featLen   feature length = image height * image width.
+ * @param[in]   groups    number of groups.
+ */
+extern void hl_maxout_forward(const real* inData,
+                              real* outData,
+                              int* idData,
+                              size_t batchSize,
+                              size_t size,
+                              size_t featLen,
+                              size_t groups);
+
+/**
+ * @brief  MaxOut backward.
+ *
+ * @param[out]  inGrad    input grad data.
+ * @param[in]   outGrad   output grad data.
+ * @param[in]   idData    output maxId.
+ * @param[in]   batchSize batchSize.
+ * @param[in]   size      number of channels * image height * image width.
+ * @param[in]   featLen   feature length = image height * image width.
+ * @param[in]   groups    number of groups.
+ */
+extern void hl_maxout_backward(real* inGrad,
+                               const real* outGrad,
+                               const int* idData,
+                               size_t batchSize,
+                               size_t size,
+                               size_t featLen,
+                               size_t groups);

 #endif /* HL_CNN_H_ */
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h
index 3196db67f61fd2e6b75df4abb3652df4456a0366..357286e3188a6f3184bc56e75232bf2e1ec54e44 100644
--- a/paddle/cuda/include/hl_cuda.h
+++ b/paddle/cuda/include/hl_cuda.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #ifndef HL_CUDA_H_
 #define HL_CUDA_H_

@@ -22,8 +21,7 @@ limitations under the License. */
 /**
  * @brief HPPL event.
  */
-typedef struct _hl_event_st * hl_event_t;
-
+typedef struct _hl_event_st *hl_event_t;

 /**
  * @brief return cuda runtime api version.
@@ -42,7 +40,7 @@ extern void hl_start();
 *                    if device is NULL, will start all GPU.
 * @param[in] number  number of devices.
 */
-extern void hl_specify_devices_start(int* device, int number);
+extern void hl_specify_devices_start(int *device, int number);

 /**
  * @brief Queries if a device may directly access a peer device's memory.
@@ -126,7 +124,7 @@ extern int hl_get_device();
 *
 * @return dest_d  pointer to device memory.
 */
-extern void* hl_malloc_device(size_t size);
+extern void *hl_malloc_device(size_t size);

 /**
  * @brief Free device memory.
@@ -143,7 +141,7 @@ extern void hl_free_mem_device(void *dest_d);
 *
 * @return dest_h  pointer to host memory.
 */
-extern void* hl_malloc_host(size_t size);
+extern void *hl_malloc_host(size_t size);

 /**
  * @brief Free host page-lock memory.
@@ -228,9 +226,9 @@ extern void hl_srand(unsigned int seed);
 * @param[in] stream  stream id.
 */
 extern void hl_memcpy_async(void *dst,
-                           void *src,
-                           size_t size,
-                           hl_stream_t stream);
+                            void *src,
+                            size_t size,
+                            hl_stream_t stream);

 /**
  * @brief   Waits for stream tasks to complete.
@@ -261,8 +259,7 @@ extern void hl_destroy_event(hl_event_t event);
 *
 * @return  time    Time between start and end in ms.
 */
-extern float hl_event_elapsed_time(hl_event_t start,
-                                   hl_event_t end);
+extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end);

 /**
  * @brief Records an event.
@@ -300,7 +297,7 @@ extern void hl_set_device_flags_block(); /** * @brief Returns the last error string from a cuda runtime call. */ -extern const char* hl_get_device_error_string(); +extern const char *hl_get_device_error_string(); /** * @brief Returns the last error string from a cuda runtime call. @@ -309,7 +306,7 @@ extern const char* hl_get_device_error_string(); * * @see hl_get_device_last_error() */ -extern const char* hl_get_device_error_string(size_t err); +extern const char *hl_get_device_error_string(size_t err); /** * @brief Returns the last error number. diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/cuda/include/hl_cuda_cublas.h index 0ffbed18b5f9e57f22d1bbe1a98a0d899f2fa88d..db8c03c2c01c67788622d37b5330e22c31e03f34 100644 --- a/paddle/cuda/include/hl_cuda_cublas.h +++ b/paddle/cuda/include/hl_cuda_cublas.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CUDA_CUBLAS_H_ #define HL_CUDA_CUBLAS_H_ @@ -21,34 +20,39 @@ limitations under the License. */ /** * @brief Matrix transpose: C_d = T(A_d) * - * @param[in] A_d input matrix (M x N). - * @param[out] C_d output matrix (N x M). + * @param[in] A_d input matrix (dimM x dimN). + * @param[out] C_d output matrix (dimN x dimM). * @param[in] dimM matrix height. * @param[in] dimN matrix width. * @param[in] lda the first dimension of A_d. * @param[in] ldc the first dimension of C_d. * */ -extern void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN, - int lda, - int ldc); +extern void hl_matrix_transpose( + real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc); /* * @brief Matrix transpose, while lda = dimN, ldc = dimM. * - * @param[in] A_d input matrix (M x N). - * @param[out] C_d output matrix (N x M). + * @param[in] A_d input matrix (dimM x dimN). + * @param[out] C_d output matrix (dimN x dimM). * @param[in] dimM matrix height. * @param[in] dimN matrix width. * */ -extern void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN); +extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN); + +/* + * @brief Matrix inverse + * + * @param[in] A_d input matrix (dimN x dimN). + * @param[out] C_d output matrix (dimN x dimN). + * @param[in] dimN matrix height = matrix width + * @param[in] lda the first dimension of A_d + * @param[in] ldc the first dimension of C_d + * + */ +extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc); /** * @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d @@ -68,12 +72,19 @@ extern void hl_matrix_transpose(real *A_d, * @param[in] ldc the first dimension of C_d. * */ -extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +extern void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta, - int lda, int ldb, int ldc); + int dimM, + int dimN, + int dimK, + real alpha, + real beta, + int lda, + int ldb, + int ldc); /** * @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d @@ -90,11 +101,16 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa, * @param[in] beta scalar used for multiplication. 
* */ -extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +extern void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta); + int dimM, + int dimN, + int dimK, + real alpha, + real beta); /** * @brief This function performs the matrix-vector multiplication. @@ -116,11 +132,17 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa, * */ -extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, - real *B_d, real *C_d, - int dimM, int dimN, - real alpha, real beta, - int lda, int incb, int incc); +extern void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta, + int lda, + int incb, + int incc); /** * @brief This function performs the matrix-vector multiplication. @@ -138,9 +160,13 @@ extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, * @param[in] beta scalar used for multiplication. * */ -extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, - real *B_d, real *C_d, - int dimM, int dimN, - real alpha, real beta); +extern void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta); #endif /* HL_CUDA_CUBLAS_H_ */ diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h index f256cb54dfe69e8df7cc7fcc0ed0a58f3574acd3..3a2f916210277145efa8f6d7663a2698ea546b0b 100644 --- a/paddle/cuda/include/hl_cuda_cudnn.h +++ b/paddle/cuda/include/hl_cuda_cudnn.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CUDA_CUDNN_H_ #define HL_CUDA_CUDNN_H_ @@ -22,7 +21,7 @@ limitations under the License. */ * hppl pooling mode */ typedef enum { - HL_POOLING_MAX = 0, + HL_POOLING_MAX = 0, // average includes padded values HL_POOLING_AVERAGE = 1, // average does not include padded values @@ -324,17 +323,16 @@ extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, * @param[in] sizeInBytes gpu workspace size (bytes). * @param[in] convBwdFilterAlgo backward filter algorithm. */ -extern void hl_convolution_backward_filter( - hl_tensor_descriptor input, - real* input_data, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_grad_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdFilterAlgo); +extern void hl_convolution_backward_filter(hl_tensor_descriptor input, + real* input_data, + hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_grad_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdFilterAlgo); /** * @brief convolution backward data(calculate input image grad data). @@ -350,17 +348,16 @@ extern void hl_convolution_backward_filter( * @param[in] sizeInBytes gpu workspace size (bytes). * @param[in] convBwdDataAlgo backward data algorithm. 
*/ -extern void hl_convolution_backward_data( - hl_tensor_descriptor input, - real* input_data_grad, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdDataAlgo); +extern void hl_convolution_backward_data(hl_tensor_descriptor input, + real* input_data_grad, + hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdDataAlgo); /** * @brief convolution backward bias (calculate bias grad data). @@ -383,8 +380,8 @@ extern void hl_convolution_backward_bias(hl_tensor_descriptor bias, * @param[in] height matrix height. * @param[in] width matrix width. */ -extern void hl_softmax_forward(real *input, - real *output, +extern void hl_softmax_forward(real* input, + real* output, int height, int width); @@ -396,8 +393,8 @@ extern void hl_softmax_forward(real *input, * @param[in] height matrix height. * @param[in] width matrix width. */ -extern void hl_softmax_backward(real *output_value, - real *output_grad, +extern void hl_softmax_backward(real* output_value, + real* output_grad, int height, int width); @@ -426,18 +423,18 @@ extern void hl_softmax_backward(real *output_value, * */ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, + real* scale, + real* bias, double factor, - real *runningMean, - real *runningInvVar, + real* runningMean, + real* runningInvVar, double epsilon, - real *savedMean, - real *savedVar); + real* savedMean, + real* savedVar); /** * @brief cudnn batch norm forward. @@ -463,14 +460,14 @@ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, * */ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, - real *estimatedMean, - real *estimatedVar, + real* scale, + real* bias, + real* estimatedMean, + real* estimatedVar, double epsilon);
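hl_batch_norm_forward_inference above wraps cuDNN's spatial batch normalization; at inference time each channel is transformed with the standard formula y = scale * (x - estimatedMean) / sqrt(estimatedVar + epsilon) + bias. A rough CPU sketch of that formula follows (mine, assuming NCHW layout; the real entry point simply forwards to cuDNN, and the function name here is hypothetical):

// Per-channel inference transform; a reference sketch, not the cuDNN call.
#include <cmath>
#include <cstddef>
void batch_norm_infer_cpu(const float* input, float* output,
                          const float* scale, const float* bias,
                          const float* estimatedMean,
                          const float* estimatedVar, double epsilon,
                          size_t batch, size_t channels, size_t spatial) {
  for (size_t n = 0; n < batch; ++n) {
    for (size_t c = 0; c < channels; ++c) {
      // One normalization factor per channel, shared across all pixels.
      const float inv =
          1.0f / std::sqrt(estimatedVar[c] + static_cast<float>(epsilon));
      for (size_t s = 0; s < spatial; ++s) {
        const size_t i = (n * channels + c) * spatial + s;
        output[i] = scale[c] * (input[i] - estimatedMean[c]) * inv + bias[c];
      }
    }
  }
}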
/** @@ -483,7 +480,8 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, * @param[in] inGradDesc input tensor descriptor desc. * @param[out] inGrad input gradient data. * @param[in] dBnParamDesc tensor descriptor desc. - * bnScale, bnBias, running mean/var, save_mean/var. + * bnScale, bnBias, running mean/var, + * save_mean/var. * @param[in] scale batch normalization scale parameter (in original * paper scale is referred to as gamma). * @param[in] scaleGrad batch normalization scale parameter (in original @@ -497,17 +495,17 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, * */ extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outGradDesc, - real *outGrad, + real* outGrad, hl_tensor_descriptor inGradDesc, - real *inGrad, + real* inGrad, hl_tensor_descriptor dBnParamDesc, - real *scale, - real *scaleGrad, - real *biasGrad, + real* scale, + real* scaleGrad, + real* biasGrad, double epsilon, - real *savedMean, - real *savedInvVar); + real* savedMean, + real* savedInvVar); #endif // HL_CUDA_CUDNN_H_ diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/cuda/include/hl_device_functions.cuh index 88d950d6c17132d1d9969d0f3766395377e2de96..159c26f443cb17116da2d2d5282f883d875a85be 100755 --- a/paddle/cuda/include/hl_device_functions.cuh +++ b/paddle/cuda/include/hl_device_functions.cuh @@ -48,5 +48,24 @@ inline __device__ double paddleAtomicAdd(double* address, double val) { } } // namespace paddle +/** + * @brief sum reduction + * + * @param[in,out] smem input data; should reside in __shared__ memory. + * @param[in] tid thread index. + * @param[in] threads the total number of threads in the reduction, + * e.g. blockDim.x. + * + * @return smem[0]: the sum of all elements in smem. + */ +__device__ __forceinline__ +void simpleReduce(real* smem, int tid, int threads) { + for (unsigned int s = threads / 2; s > 0; s >>= 1) { + if (tid < s) { + smem[tid] += smem[tid + s]; + } + __syncthreads(); + } +} #endif /* HL_DEVICE_FUNCTIONS_CUH_ */ diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index f36c724e2da3dce11696fcda7daf98f5cda36dd6..1eb9f9ca888d3a93f04621e10346b5f9ff34cdca 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_DSO_LOADER_H_ #define HL_DSO_LOADER_H_ diff --git a/paddle/cuda/include/hl_functions.h b/paddle/cuda/include/hl_functions.h index 65f366461ced0f9ee31ff9075f6dfaeb6c9b72a2..91ce9a0678463597df88c548aeac322ee19d95de 100644 --- a/paddle/cuda/include/hl_functions.h +++ b/paddle/cuda/include/hl_functions.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_FUNCTIONS_H_ #define HL_FUNCTIONS_H_ @@ -21,30 +20,30 @@ limitations under the License. */
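A hypothetical use of the simpleReduce helper added to hl_device_functions.cuh above, to make its contract concrete: every thread in the block must reach the call, smem must point to shared memory, and the halving loop assumes a power-of-two thread count. The kernel below is my sketch, not code from this patch (real is the float/double typedef the headers already use):

// Hypothetical kernel, not part of this patch: sums 256 inputs per block.
__global__ void blockSumKernel(const real* in, real* out) {
  __shared__ real smem[256];   // blockDim.x must be 256 (a power of two)
  const int tid = threadIdx.x;
  smem[tid] = in[blockIdx.x * blockDim.x + tid];
  __syncthreads();             // make all loads visible before reducing
  simpleReduce(smem, tid, blockDim.x);
  if (tid == 0) {
    out[blockIdx.x] = smem[0];  // smem[0] now holds the block's sum
  }
}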
/** * sigmoid threshold minimum */ -#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MIN -40.0 /** * sigmoid threshold maximum */ -#define SIGMOID_THRESHOLD_MAX 13.0 +#define SIGMOID_THRESHOLD_MAX 13.0 #ifndef __NVCC__ namespace hppl { - /* - * forward activation - */ - real relu(const real a); - real sigmoid(const real a); - real tanh(const real a); - real linear(const real a); - - /* - * backward activation - */ - real relu(const real a, const real b); - real sigmoid(const real a, const real b); - real tanh(const real a, const real b); - real linear(const real a, const real b); +/* + * forward activation + */ +real relu(const real a); +real sigmoid(const real a); +real tanh(const real a); +real linear(const real a); + +/* + * backward activation + */ +real relu(const real a, const real b); +real sigmoid(const real a, const real b); +real tanh(const real a, const real b); +real linear(const real a, const real b); } // namespace hppl #ifdef __AVX__ diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h index 05039663b6e9f5e4a72f15ab822d723635f9b282..3be0df3b93b69811fb9c36dae223cbd927b02559 100644 --- a/paddle/cuda/include/hl_gpu.h +++ b/paddle/cuda/include/hl_gpu.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_GPU_H_ #define HL_GPU_H_ diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/cuda/include/hl_lstm.h index 1f95e318a1fe06050bbd31c2e276974f4a8bdc1e..7e527a79025969320f1aca75d313fd9d0194efd1 100644 --- a/paddle/cuda/include/hl_lstm.h +++ b/paddle/cuda/include/hl_lstm.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_LSTM_H_ #define HL_LSTM_H_ diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h index 17419790471a7d1e86f2cf0017290004ec0c4dfc..96648661e345d8fa5d50cb2aae3a56ee53921f90 100644 --- a/paddle/cuda/include/hl_matrix.h +++ b/paddle/cuda/include/hl_matrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_MATRIX_H_ #define HL_MATRIX_H_ @@ -30,13 +29,8 @@ limitations under the License. */ * @param[in] beta scalar used for addition. * */ -extern void hl_matrix_add(real* A_d, - real* B_d, - real* C_d, - int dimM, - int dimN, - real alpha, - real beta); +extern void hl_matrix_add( + real* A_d, real* B_d, real* C_d, int dimM, int dimN, real alpha, real beta); /** * @brief Matrix Softmax. * @@ -46,7 +40,7 @@ extern void hl_matrix_add(real* A_d, * @param[in] dimN matrix width. * */ -extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN); +extern void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN); /** * @brief Matrix softmax derivative. * @@ -58,11 +52,8 @@ extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN); * @param[in] dimN matrix width. * */ -extern void hl_matrix_softmax_derivative(real* grad_d, - real* output_d, - real* sftmaxSum_d, - int dimM, - int dimN); +extern void hl_matrix_softmax_derivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN);
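hl_matrix_softmax above computes a row-wise softmax over the M x N input. A CPU sketch of that computation (mine, with a hypothetical name; I assume the GPU kernel applies the usual max-subtraction for numerical stability as well):

// Row-wise softmax reference; mine, not the GPU kernel.
#include <cmath>
void matrix_softmax_cpu(const float* A, float* C, int dimM, int dimN) {
  for (int i = 0; i < dimM; ++i) {
    const float* a = A + i * dimN;
    float* c = C + i * dimN;
    float maxValue = a[0];
    for (int j = 1; j < dimN; ++j) {
      maxValue = (a[j] > maxValue) ? a[j] : maxValue;
    }
    float sum = 0.0f;
    for (int j = 0; j < dimN; ++j) {
      c[j] = std::exp(a[j] - maxValue);  // shift by the row max for stability
      sum += c[j];
    }
    for (int j = 0; j < dimN; ++j) {
      c[j] /= sum;
    }
  }
}

For the derivative entry point above, my reading of the parameter names is that sftmaxSum_d holds each row's sum of grad times output, so the kernel can form grad = output * (grad - sftmaxSum) element-wise.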
/** * @brief Sequence softmax. @@ -73,8 +64,8 @@ extern void hl_matrix_softmax_derivative(real* grad_d, * @param[in] numSequence sequence number. * */ -extern void hl_sequence_softmax_forward(real *A_d, - real *C_d, +extern void hl_sequence_softmax_forward(real* A_d, + real* C_d, const int* index, int numSequence); @@ -88,11 +79,8 @@ extern void hl_sequence_softmax_forward(real *A_d, * @param[in] dimN matrix width. * */ -extern void hl_matrix_classification_error(real* A_d, - int* B_d, - real* C_d, - int dimM, - int dimN); +extern void hl_matrix_classification_error( + real* A_d, int* B_d, real* C_d, int dimM, int dimN); /** * @brief Matrix cross entropy. @@ -104,11 +92,8 @@ extern void hl_matrix_classification_error(real* A_d, * @param[in] dimN matrix width. * */ -extern void hl_matrix_cross_entropy(real* A_d, - real* C_d, - int* label_d, - int dimM, - int dimN); +extern void hl_matrix_cross_entropy( + real* A_d, real* C_d, int* label_d, int dimM, int dimN); /** * @brief Matrix cross entropy back propagation. @@ -120,11 +105,32 @@ extern void hl_matrix_cross_entropy(real* A_d, * @param[in] dimN matrix width. * */ -extern void hl_matrix_cross_entropy_bp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN); +extern void hl_matrix_cross_entropy_bp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN); + +/** + * @brief Matrix multi-binary label cross entropy + * + * @param[in] output input matrix (M x N). + * @param[out] entropy output matrix (M x 1). + * @param[in] mat input sparse matrix. + * @param[in] dimM matrix height. + * @param[in] dimN matrix width. + */ +extern void hl_matrix_multi_binary_cross_entropy( + real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN); + +/** + * @brief Matrix multi-binary label cross entropy backprop + * + * @param[in] output input matrix (M x N). + * @param[out] grad output matrix (M x N). + * @param[in] mat input sparse matrix. + * @param[in] dimM matrix height. + * @param[in] dimN matrix width. + */ +extern void hl_matrix_multi_binary_cross_entropy_bp( + real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN); /** * @brief Matrix zero memory. @@ -146,12 +152,8 @@ extern void hl_matrix_zero_mem(real* data, int num); * @param[in] partial_sum */ -extern void hl_param_relu_forward(real* output, - real* input, - real* w, - int width, - int height, - int partial_sum); +extern void hl_param_relu_forward( + real* output, real* input, real* w, int width, int height, int partial_sum); /** * @brief parameter relu backward w * @@ -229,4 +231,40 @@ extern void hl_cossim_derivative(real* grad, int input2_height, real scale); +/** + * @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel]. + * + * @param[in] A_d input matrix (M x N). + * @param[in] B_d input matrix (1 x channel). + * @param[in] channel width of B. + * @param[in] dimM height of A. + * @param[in] dimN width of A. + * @param[in] scale scalar used for addition. + * + */ +extern void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale); + +/** + * @brief Bias accumulation: B_d[j/channel] += scale * A_d[i][j], + * summed over all i, j that map to the same bias element. + * + * @param[out] B_d bias matrix (1 x channel). + * @param[in] A_d input matrix (M x N). + * @param[in] channel width of B. + * @param[in] dimM height of A. + * @param[in] dimN width of A. + * @param[in] scale scalar used for addition.
+ * + */ +extern void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale); + #endif /* HL_MATRIX_H_ */ diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h index 46d86b2982f065802eec83ca7554f787d1d02f3a..bb5124df44b492bd8fdeb2a0c75ebcf74d2c8157 100644 --- a/paddle/cuda/include/hl_sequence.h +++ b/paddle/cuda/include/hl_sequence.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_SEQUENCE_H_ #define HL_SEQUENCE_H_ @@ -32,7 +31,7 @@ limitations under the License. */ extern void hl_max_sequence_forward(real* input, const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim); @@ -46,11 +45,8 @@ extern void hl_max_sequence_forward(real* input, * @param[in] dim input dimension. * */ -extern void hl_max_sequence_backward(real* outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim); +extern void hl_max_sequence_backward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim); /** * @brief Context projection forward. @@ -63,7 +59,8 @@ extern void hl_max_sequence_backward(real* outputGrad, * @param[in] inputDim input sequence dimension. * @param[in] contextLength context length. * @param[in] contextStart context start. - * @param[in] beginPad number of extra timesteps added at the beginning. + * @param[in] beginPad number of extra timesteps added at the + * beginning. * @param[in] isPadding trainable padding. * */ @@ -109,7 +106,8 @@ extern void hl_context_projection_backward_data(real* outputGrad, * @param[in] totalPad number of extra timesteps. * @param[in] contextLength context length. * @param[in] contextStart context start. - * @param[in] beginPad number of extra timesteps added at the beginning. + * @param[in] beginPad number of extra timesteps added at the + * beginning. * */ extern void hl_context_projection_backward_weight(real* outputGrad, @@ -141,9 +139,9 @@ extern void hl_context_projection_backward_weight(real* outputGrad, * @param[in] seq2batch copy direction. * */ -extern void hl_sequence2batch_copy(real *batch, - real *sequence, - const int *batchIndex, +extern void hl_sequence2batch_copy(real* batch, + real* sequence, + const int* batchIndex, int seqWidth, int batchCount, bool seq2batch); @@ -167,9 +165,9 @@ extern void hl_sequence2batch_copy(real *batch, * @param[in] seq2batch copy direction. * */ -extern void hl_sequence2batch_add(real *batch, - real *sequence, - int *batchIndex, +extern void hl_sequence2batch_add(real* batch, + real* sequence, + int* batchIndex, int seqWidth, int batchCount, bool seq2batch); diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/cuda/include/hl_sparse.h index 9acdebdebf37761e1485e3441963586ead9f3c85..c4e0be23e2031cbcb124b532216a23d8a344668d 100644 --- a/paddle/cuda/include/hl_sparse.h +++ b/paddle/cuda/include/hl_sparse.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_SPARSE_H_ #define HL_SPARSE_H_ @@ -31,7 +30,7 @@ limitations under the License. 
*/ */ extern void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz); @@ -60,10 +59,10 @@ extern void hl_free_sparse_matrix(hl_sparse_matrix_s A_d); * */ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - void * dest_d, + void *dest_d, size_t size, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz); @@ -94,11 +93,11 @@ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, * */ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - real* value_d, - int* rows_d, - int* cols_d, + real *value_d, + int *rows_d, + int *cols_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz); @@ -259,10 +258,14 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, */ extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta); + int dimM, + int dimN, + int dimK, + real alpha, + real beta); /** * @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d. @@ -311,11 +314,16 @@ extern void hl_matrix_dense_mul_csc(real *A_d, * @note transb does not support HPPL_OP_T. * */ -extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +extern void hl_sparse_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, hl_sparse_matrix_s C_d, - int dimM, int dimN, int dimK, - real alpha, real beta); + int dimM, + int dimN, + int dimK, + real alpha, + real beta); /** * @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d @@ -336,12 +344,16 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa, * @note transa does not support HPPL_OP_T. * */ -extern void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, +extern void hl_matrix_dense_mul_csr(real *A_d, + hl_trans_op_t transa, hl_sparse_matrix_s B_d, hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta); + int dimM, + int dimN, + int dimK, + real alpha, + real beta);
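The @brief lines above all describe the same GEMM contract, C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d, with one operand sparse. For the hl_matrix_dense_mul_csr case (dense A, CSR B, both untransposed), here is a dense reference sketch of the traversal, assuming value/rowOff/cols are the usual CSR value, row-offset, and column-index arrays that hl_construct_sparse_matrix takes as value_d/rows_d/cols_d (that mapping is my assumption):

// Dense reference of C = alpha * A * B + beta * C with B in CSR storage.
void dense_mul_csr_cpu(const float* A, const float* value, const int* rowOff,
                       const int* cols, float* C, int dimM, int dimN,
                       int dimK, float alpha, float beta) {
  for (int i = 0; i < dimM * dimN; ++i) {
    C[i] *= beta;
  }
  for (int k = 0; k < dimK; ++k) {                    // row k of B
    for (int r = rowOff[k]; r < rowOff[k + 1]; ++r) {
      const int j = cols[r];                          // nonzero B(k, j)
      for (int i = 0; i < dimM; ++i) {
        C[i * dimN + j] += alpha * A[i * dimK + k] * value[r];
      }
    }
  }
}

Iterating over B's nonzeros once, rather than over all K columns per output, is what makes the sparse kernels worthwhile.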
/** * @brief Memcpy csc_matrix to host. @@ -412,7 +424,6 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val, hl_sparse_matrix_s csr_matrix, hl_stream_t stream); - /** * @brief A_d[j] += B_d[i,j] for i in range(height) * @@ -423,19 +434,13 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val, * @param[in] scale scale of B_d * */ -extern void hl_sparse_matrix_column_sum(real* A_d, - hl_sparse_matrix_s B_d, - int dimM, - int dimN, - real scale); +extern void hl_sparse_matrix_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale); /** * @brief implementation of csr sparse matrix in hl_sparse_matrix_column_sum */ -extern void hl_matrix_csr_column_sum(real* A_d, - hl_sparse_matrix_s B_d, - int dimM, - int dimN, - real scale); +extern void hl_matrix_csr_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale); /** * @brief A_d[i,j] += B_d[j] * @@ -446,13 +451,13 @@ extern void hl_matrix_csr_column_sum(real* A_d, * */ extern void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, real scale); /** * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_bias */ extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, real scale); /** @@ -470,7 +475,7 @@ extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, * */ extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, int dimM, int dimN, real alpha, @@ -479,7 +484,7 @@ extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_dense */ extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, int dimM, int dimN, real alpha, @@ -493,7 +498,7 @@ extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, * * @return rows pointer, which is gpu address * */ -extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat); +extern int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat); /** * @brief get cols pointer of GpuSparseMatrix @@ -503,7 +508,7 @@ extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat); * @return cols pointer, which is gpu address * */ -extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat); +extern int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat); /** * @brief get value pointer of GpuSparseMatrix @@ -513,7 +518,6 @@ extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat); * @return value pointer, which is gpu address * */ -extern real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat); - +extern real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat); #endif /* HL_SPARSE_H_ */
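Closing out hl_sparse.h: the column-sum contract documented above, A_d[j] += scale * B_d[i,j] summed over rows i, is a single pass over the nonzeros in CSR storage. A CPU sketch under the same CSR-array assumption as before (hypothetical name; A has one slot per column):

// One pass over the nonzeros; A must have length dimN.
void csr_column_sum_cpu(float* A, const float* value, const int* rowOff,
                        const int* cols, int dimM, float scale) {
  for (int i = 0; i < dimM; ++i) {
    for (int r = rowOff[i]; r < rowOff[i + 1]; ++r) {
      A[cols[r]] += scale * value[r];
    }
  }
}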
diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/cuda/include/hl_table_apply.h index 3c9428e9253d5ed563e4e9f62d8842667496b83c..b4ac83a66af13c2a843872faba2ebd972008a738 100644 --- a/paddle/cuda/include/hl_table_apply.h +++ b/paddle/cuda/include/hl_table_apply.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_TABLE_APPLY_H_ #define HL_TABLE_APPLY_H_ @@ -31,8 +30,10 @@ limitations under the License. */ * @param[in] dim width of table. * */ -extern void hl_matrix_select_rows(real* output, int ldo, - real* table, int ldt, +extern void hl_matrix_select_rows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -53,8 +54,10 @@ extern void hl_matrix_select_rows(real* output, int ldo, * @param[in] dim width of table. * */ -extern void hl_matrix_add_to_rows(real* table, int ldt, - real* input, int ldi, +extern void hl_matrix_add_to_rows(real* table, + int ldt, + real* input, + int ldi, int* ids, int numSamples, int tableSize, @@ -72,8 +75,7 @@ extern void hl_matrix_add_to_rows(real* table, int ldt, * */ template <class T> -extern void hl_vector_select_from(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei); +extern void hl_vector_select_from( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei); -#endif /* HL_TABLE_APPLY_H_ */ +#endif /* HL_TABLE_APPLY_H_ */ diff --git a/paddle/cuda/include/hl_time.h b/paddle/cuda/include/hl_time.h index 4414b0b2d2ed4ab6a48294ffaed3a43a639e5950..b0a88c66a12fcfec6ea96b877423f907dac8dfa1 100644 --- a/paddle/cuda/include/hl_time.h +++ b/paddle/cuda/include/hl_time.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_TIME_H_ #define HL_TIME_H_ diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h index a38d4cf862278a060f72b970d723895dc3735d0a..e8cfebbf6a3bd27c10a71d7817238bc304681fa4 100644 --- a/paddle/cuda/include/hl_top_k.h +++ b/paddle/cuda/include/hl_top_k.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_TOP_K_H_ #define HL_TOP_K_H_ @@ -31,9 +30,11 @@ limitations under the License. */ * @param[in] numSamples height of input value. * */ -extern void hl_matrix_top_k(real* topVal, int ldv, - int * topIds, - real* src, int lds, +extern void hl_matrix_top_k(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int numSamples); @@ -50,8 +51,9 @@ extern void hl_matrix_top_k(real* topVal, int ldv, * * @note Only supports the HL_SPARSE_CSR format. */ -extern void hl_sparse_matrix_top_k(real* topVal, int ldv, - int * topIds, +extern void hl_sparse_matrix_top_k(real* topVal, + int ldv, + int* topIds, hl_sparse_matrix_s src, int beamSize, int numSamples); diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/cuda/include/stub/hl_aggregate_stub.h index 4c0c68f3c98fe95f01060b82c3a1b9822d2a3715..bb53fc581e09905aa7a9b2d8dfe44b04c677c40a 100644 --- a/paddle/cuda/include/stub/hl_aggregate_stub.h +++ b/paddle/cuda/include/stub/hl_aggregate_stub.h @@ -12,29 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #ifndef HL_AGGREGATE_STUB_H_ #define HL_AGGREGATE_STUB_H_ #include "hl_aggregate.h" -inline void hl_matrix_row_sum(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_row_max(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_row_min(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_column_sum(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_column_max(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_column_min(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {} inline void hl_vector_sum(real *A_d, real *C_h, int dimM) {} diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 38e359c3eb2f34e5874187f4b06280a3df901c8e..2f73b9671edd3609996aebff2913f5262805f869 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -12,81 +12,177 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CNN_STUB_H_ #define HL_CNN_STUB_H_ #include "hl_cnn.h" -inline void hl_shrink_col2feature( - const real * dataCol, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataIm, - real alpha, real beta) {} - -inline void hl_expand_feature2col( - const real* dataIm, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataCol) {} - -inline void hl_maxpool_forward( - const int frameCnt, const real* inputData, - const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, real* tgtData) {} - -inline void hl_maxpool_backward( - const int frameCnt, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* targetGrad) {} - -inline void hl_avgpool_forward( - const int frameCnt, const real* inputData, - const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, real* tgtData) {} - -inline void hl_avgpool_backward( - const int frameCnt, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - int paddingH, int paddingW, - real scaleA, real scaleB, - real* backGrad) {} - -inline 
void hl_CMRNorm_forward( - size_t frameCnt, const real* in, real* scale, real* out, - size_t channels, size_t height, size_t width, size_t sizeX, - real alpha, real beta) {} - -inline void hl_CMRNorm_backward( - size_t frameCnt, const real* inV, const real* scale, - const real* outV, const real* outDiff, real *inDiff, - size_t channels, size_t height, size_t width, size_t sizeX, - real alpha, real beta) {} +inline void hl_shrink_col2feature(const real* dataCol, + size_t channels, + size_t height, + size_t width, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t outputH, + size_t outputW, + real* dataIm, + real alpha, + real beta) {} + +inline void hl_expand_feature2col(const real* dataIm, + size_t channels, + size_t height, + size_t width, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t outputH, + size_t outputW, + real* dataCol) {} + +inline void hl_maxpool_forward(const int frameCnt, + const real* inputData, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) {} + +inline void hl_maxpool_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) {} + +inline void hl_avgpool_forward(const int frameCnt, + const real* inputData, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) {} + +inline void hl_avgpool_backward(const int frameCnt, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + int paddingH, + int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride) {} + +inline void hl_CMRNorm_forward(size_t frameCnt, + const real* in, + real* scale, + real* out, + size_t channels, + size_t height, + size_t width, + size_t sizeX, + real alpha, + real beta) {} + +inline void hl_CMRNorm_backward(size_t frameCnt, + const real* inV, + const real* scale, + const real* outV, + const real* outDiff, + real* inDiff, + size_t channels, + size_t height, + size_t width, + size_t sizeX, + real alpha, + real beta) {} + +inline void hl_bilinear_forward(const real* inData, + const size_t inImgH, + const size_t inImgW, + const size_t inputH, + const size_t inputW, + real* outData, + const size_t outImgH, + const size_t outImgW, + const size_t outputH, + const size_t outputW, + const size_t numChannels, + const real ratioH, + const real ratioW) {} + +inline void hl_bilinear_backward(real* inGrad, + const size_t inImgH, + const size_t inImgW, + const size_t inputH, + const size_t inputW, + const real* outGrad, + const size_t outImgH, + const size_t outImgW, + const size_t outputH, + const size_t outputW, + const size_t numChannels, + const real 
ratioH, + const real ratioW) {} + +inline void hl_maxout_forward(const real* inData, + real* outData, + int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t group) {} + +inline void hl_maxout_backward(real* inGrad, + const real* outGrad, + const int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t group) {} #endif // HL_CNN_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h index 4a5e2a25a71b38b2c38688820cbffdb10251bcac..85f7c390c47397127487b16fdc933f0afe2fb880 100644 --- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h @@ -12,35 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CUDA_CUBLAS_STUB_H_ #define HL_CUDA_CUBLAS_STUB_H_ #include "hl_cuda_cublas.h" -inline void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN, - int lda, - int ldc) {} +inline void hl_matrix_transpose( + real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {} -inline void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN) {} +inline void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, - real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta, - int lda, int ldb, int ldc) {} +inline void hl_matrix_inverse( + real *A_d, real *C_d, int dimN, int lda, int ldc) {} -inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +inline void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + real *C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta, + int lda, + int ldb, + int ldc) {} + +inline void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) {} + int dimM, + int dimN, + int dimK, + real alpha, + real beta) {} #endif // HL_CUDA_CUBLAS_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h index 34c173908246e4a48c327c8aa58730756bbc72b7..3beb0e5b5170261a6c453936b8b0347f3e97dbff 100644 --- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h @@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #ifndef HL_CUDA_CUDNN_STUB_H_ #define HL_CUDA_CUDNN_STUB_H_ #include "hl_cuda_cudnn.h" -inline int hl_get_cudnn_lib_version() { - return 0; -} +inline int hl_get_cudnn_lib_version() { return 0; } inline void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {} @@ -68,41 +65,41 @@ inline void hl_pooling_backward(hl_tensor_descriptor input, hl_pooling_descriptor pooling) {} inline void hl_create_filter_descriptor(hl_filter_descriptor* filter, - int input_feature_maps, - int output_feature_maps, - int height, - int width) {} + int input_feature_maps, + int output_feature_maps, + int height, + int width) {} inline void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {} inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, - hl_tensor_descriptor image, - hl_filter_descriptor filter, - int padding_height, - int padding_width, - int stride_height, - int stride_width) {} + hl_tensor_descriptor image, + hl_filter_descriptor filter, + int padding_height, + int padding_width, + int stride_height, + int stride_width) {} inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, - hl_tensor_descriptor image, - hl_filter_descriptor filter, - int padding_height, - int padding_width, - int stride_height, - int stride_width) {} + hl_tensor_descriptor image, + hl_filter_descriptor filter, + int padding_height, + int padding_width, + int stride_height, + int stride_width) {} inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {} inline void hl_conv_workspace(hl_tensor_descriptor input, - hl_tensor_descriptor output, - hl_filter_descriptor filter, - hl_convolution_descriptor conv, - int* convFwdAlgo, - size_t* fwdLimitBytes, - int* convBwdDataAlgo, - size_t* bwdDataLimitBytes, - int* convBwdFilterAlgo, - size_t* bwdFilterLimitBytes) {} + hl_tensor_descriptor output, + hl_filter_descriptor filter, + hl_convolution_descriptor conv, + int* convFwdAlgo, + size_t* fwdLimitBytes, + int* convBwdDataAlgo, + size_t* bwdDataLimitBytes, + int* convBwdFilterAlgo, + size_t* bwdFilterLimitBytes) {} inline void hl_convolution_forward(hl_tensor_descriptor input, real* input_data, @@ -116,87 +113,84 @@ inline void hl_convolution_forward(hl_tensor_descriptor input, int convFwdAlgo) {} inline void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, - real* bias_data, - hl_tensor_descriptor output, - real* output_data) {} - -inline void hl_convolution_backward_filter( - hl_tensor_descriptor input, - real* input_data, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_grad_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdFilterAlgo) {} - -inline void hl_convolution_backward_data( - hl_tensor_descriptor input, - real* input_data_grad, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdDataAlgo) {} + real* bias_data, + hl_tensor_descriptor output, + real* output_data) {} + +inline void hl_convolution_backward_filter(hl_tensor_descriptor input, + real* input_data, + hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_grad_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdFilterAlgo) {} + +inline void hl_convolution_backward_data(hl_tensor_descriptor input, + real* input_data_grad, + 
hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdDataAlgo) {} inline void hl_convolution_backward_bias(hl_tensor_descriptor bias, - real* bias_grad_data, - hl_tensor_descriptor output, - real* output_grad_data) {} - -inline void hl_softmax_forward(real *input, - real *output, - int height, - int width) {} + real* bias_grad_data, + hl_tensor_descriptor output, + real* output_grad_data) {} -inline void hl_softmax_backward(real *output_value, - real *output_grad, +inline void hl_softmax_forward(real* input, + real* output, int height, int width) {} +inline void hl_softmax_backward(real* output_value, + real* output_grad, + int height, + int width) {} + inline void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, + real* scale, + real* bias, double factor, - real *runningMean, - real *runningInvVar, + real* runningMean, + real* runningInvVar, double epsilon, - real *savedMean, - real *savedVar) {} + real* savedMean, + real* savedVar) {} inline void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, - real *estimatedMean, - real *estimatedVar, + real* scale, + real* bias, + real* estimatedMean, + real* estimatedVar, double epsilon) {} inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outGradDesc, - real *outGrad, + real* outGrad, hl_tensor_descriptor inGradDesc, - real *inGrad, + real* inGrad, hl_tensor_descriptor dBnParamDesc, - real *scale, - real *scaleGrad, - real *biasGrad, + real* scale, + real* scaleGrad, + real* biasGrad, double epsilon, - real *savedMean, - real *savedInvVar) {} + real* savedMean, + real* savedInvVar) {} #endif // HL_CUDA_CUDNN_STUB_H_ - diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h index 675ac03b0e188e9b26038dd4e40264099618e17a..1f91068cdf8b3d472c4b403d1ec7d5293c28c07e 100644 --- a/paddle/cuda/include/stub/hl_cuda_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #ifndef HL_CUDA_STUB_H_ #define HL_CUDA_STUB_H_ @@ -24,29 +23,25 @@ inline void hl_specify_devices_start(int *device, int number) {} inline void hl_init(int device) {} -inline int hl_get_cuda_lib_version(int device) { - return 0; -} +inline int hl_get_cuda_lib_version(int device) { return 0; } inline void hl_fini() {} inline void hl_set_sync_flag(bool flag) {} -inline bool hl_get_sync_flag() { - return false; -} +inline bool hl_get_sync_flag() { return false; } -inline int hl_get_device_count() { return 0; } +inline int hl_get_device_count() { return 0; } inline void hl_set_device(int device) {} -inline int hl_get_device() { return 0; } +inline int hl_get_device() { return 0; } -inline void* hl_malloc_device(size_t size) { return NULL; } +inline void *hl_malloc_device(size_t size) { return NULL; } inline void hl_free_mem_device(void *dest_d) {} -inline void* hl_malloc_host(size_t size) { return NULL; } +inline void *hl_malloc_host(size_t size) { return NULL; } inline void hl_free_mem_host(void *dest_h) {} @@ -64,7 +59,9 @@ inline void hl_rand(real *dest_d, size_t num) {} inline void hl_srand(unsigned int seed) {} -inline void hl_memcpy_async(void *dst, void *src, size_t size, +inline void hl_memcpy_async(void *dst, + void *src, + size_t size, hl_stream_t stream) {} inline void hl_stream_synchronize(hl_stream_t stream) {} @@ -83,11 +80,11 @@ inline void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {} inline void hl_event_synchronize(hl_event_t event) {} -inline int hl_get_device_last_error() { return 0; } +inline int hl_get_device_last_error() { return 0; } -inline const char* hl_get_device_error_string() { return NULL; } +inline const char *hl_get_device_error_string() { return NULL; } -inline const char* hl_get_device_error_string(size_t err) { return NULL; } +inline const char *hl_get_device_error_string(size_t err) { return NULL; } inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; } diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/cuda/include/stub/hl_lstm_stub.h index 2700bef02a5e1e40ee7603ccab7fec754196f8cd..7ccda032d26f2fbbe99136e8481416daea557a78 100644 --- a/paddle/cuda/include/stub/hl_lstm_stub.h +++ b/paddle/cuda/include/stub/hl_lstm_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_LSTM_STUB_H_ #define HL_LSTM_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h index f1f1020c84d46cb14a85fa7569fa6cf36a1c8dab..1bd78d23fbaf46e6265ba0db25ea399a204bd96f 100644 --- a/paddle/cuda/include/stub/hl_matrix_stub.h +++ b/paddle/cuda/include/stub/hl_matrix_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #ifndef HL_MATRIX_STUB_H_ #define HL_MATRIX_STUB_H_ @@ -26,36 +25,30 @@ inline void hl_matrix_add(real* A_d, real alpha, real beta) {} -inline void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {} +inline void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {} -inline void hl_sequence_softmax_forward(real *A_d, - real *C_d, +inline void hl_sequence_softmax_forward(real* A_d, + real* C_d, const int* index, int numSequence) {} -inline void hl_matrix_softmax_derivative(real* grad_d, - real* output_d, - real* sftmaxSum_d, - int dimM, - int dimN) {} - -inline void hl_matrix_classification_error(real* A_d, - int* B_d, - real* C_d, - int dimM, - int dimN) {} - -inline void hl_matrix_cross_entropy(real* A_d, - real* C_d, - int* label_d, - int dimM, - int dimN) {} - -inline void hl_matrix_cross_entropy_bp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) {} +inline void hl_matrix_softmax_derivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {} + +inline void hl_matrix_classification_error( + real* A_d, int* B_d, real* C_d, int dimM, int dimN) {} + +inline void hl_matrix_cross_entropy( + real* A_d, real* C_d, int* label_d, int dimM, int dimN) {} + +inline void hl_matrix_cross_entropy_bp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {} + +inline void hl_matrix_multi_binary_cross_entropy( + real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN) {} + +inline void hl_matrix_multi_binary_cross_entropy_bp( + real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN) {} inline void hl_matrix_zero_mem(real* data, int num) {} @@ -89,7 +82,6 @@ inline void hl_cossim(real* output, int input2_height, real scale) {} - inline void hl_cossim_derivative(real* grad, real* output, real* prevOutX, @@ -101,4 +93,17 @@ inline void hl_cossim_derivative(real* grad, int input2_height, real scale) {} +inline void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale) {} + +inline void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale) {} #endif // HL_MATRIX_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h index aabd956c37f7dce48a379b995ab88a53aa65c760..381f0a6f26c5669465f029e972c6ca8b0e6e1776 100644 --- a/paddle/cuda/include/stub/hl_sequence_stub.h +++ b/paddle/cuda/include/stub/hl_sequence_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_SEQUENCE_STUB_H_ #define HL_SEQUENCE_STUB_H_ @@ -21,15 +20,12 @@ limitations under the License. 
*/ inline void hl_max_sequence_forward(real* input, const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) {} -inline void hl_max_sequence_backward(real* outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) {} +inline void hl_max_sequence_backward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {} inline void hl_context_projection_forward(real* input, const int* sequence, @@ -60,16 +56,16 @@ inline void hl_context_projection_backward_weight(real* outputGrad, int contextStart, int beginPad) {} -inline void hl_sequence2batch_copy(real *batch, - real *sequence, - const int *batchIndex, +inline void hl_sequence2batch_copy(real* batch, + real* sequence, + const int* batchIndex, int seqWidth, int batchCount, bool seq2batch) {} -inline void hl_sequence2batch_add(real *batch, - real *sequence, - int *batchIndex, +inline void hl_sequence2batch_add(real* batch, + real* sequence, + int* batchIndex, int seqWidth, int batchCount, bool seq2batch) {} diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/cuda/include/stub/hl_sparse_stub.h index 346a1900dda5825e9a4311a2c51e8a50e6e7df0b..d47bdd2c47d097c4c68b7b7e88ef888bc18270c2 100644 --- a/paddle/cuda/include/stub/hl_sparse_stub.h +++ b/paddle/cuda/include/stub/hl_sparse_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_SPARSE_STUB_H_ #define HL_SPARSE_STUB_H_ @@ -20,7 +19,7 @@ limitations under the License. */ inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) {} @@ -28,20 +27,20 @@ inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, inline void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {} inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - void * dest_d, + void *dest_d, size_t size, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) {} inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - real* value_d, - int* rows_d, - int* cols_d, + real *value_d, + int *rows_d, + int *cols_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) {} @@ -87,10 +86,14 @@ inline void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, inline void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) {} + int dimM, + int dimN, + int dimK, + real alpha, + real beta) {} inline void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, @@ -103,18 +106,27 @@ inline void hl_matrix_dense_mul_csc(real *A_d, real alpha, real beta) {} -inline void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +inline void hl_sparse_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, hl_sparse_matrix_s C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) {} + int dimM, + int dimN, + int dimK, + real alpha, + real beta) {} -inline void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, +inline void hl_matrix_dense_mul_csr(real *A_d, + hl_trans_op_t transa, hl_sparse_matrix_s B_d, hl_trans_op_t transb, real 
*C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) {} + int dimM, + int dimN, + int dimK, + real alpha, + real beta) {} inline void hl_memcpy_from_csc_matrix(real *csc_val, size_t val_size, @@ -134,49 +146,39 @@ inline void hl_memcpy_from_csr_matrix(real *csr_val, hl_sparse_matrix_s csr_matrix, hl_stream_t stream) {} -inline void hl_sparse_matrix_column_sum(real* A_d, - hl_sparse_matrix_s B_d, - int dimM, - int dimN, - real scale) {} +inline void hl_sparse_matrix_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {} -inline void hl_matrix_csr_column_sum(real* A_d, - hl_sparse_matrix_s B_d, - int dimM, - int dimN, - real scale) {} +inline void hl_matrix_csr_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {} inline void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, real scale) {} inline void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, real scale) {} inline void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, int dimM, int dimN, real alpha, real beta) {} inline void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, int dimM, int dimN, real alpha, real beta) {} -inline int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { - return NULL; -} +inline int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { return NULL; } -inline int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { - return NULL; -} +inline int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { return NULL; } -inline real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { +inline real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { return NULL; } diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h index 808c2508d1a1a09fb25f052047d6b0539cad8df2..2412ed5abc13b2a83521a75524f581e106788b60 100644 --- a/paddle/cuda/src/avx_mathfun.h +++ b/paddle/cuda/src/avx_mathfun.h @@ -32,32 +32,35 @@ #include <immintrin.h> /* yes I know, the top of this file is quite ugly */ -# define ALIGN32_BEG -# define ALIGN32_END __attribute__((aligned(32))) +#define ALIGN32_BEG +#define ALIGN32_END __attribute__((aligned(32))) /* __m128 is ugly to write */ -typedef __m256 v8sf; // vector of 8 float (avx) -typedef __m256i v8si; // vector of 8 int (avx) -typedef __m128i v4si; // vector of 8 int (avx) +typedef __m256 v8sf;   // vector of 8 float (avx) +typedef __m256i v8si;  // vector of 8 int (avx) +typedef __m128i v4si;  // vector of 4 int (sse) -#define _PI32AVX_CONST(Name, Val) \ - static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val } +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \ + Val, Val, Val, Val} _PI32AVX_CONST(1, 1); _PI32AVX_CONST(inv1, ~1); _PI32AVX_CONST(2, 2); _PI32AVX_CONST(4, 4); - /* declare some AVX constants -- why can't I figure a better way to do that?
*/ -#define _PS256_CONST(Name, Val) \ - static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } -#define _PI32_CONST256(Name, Val) \ - static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } -#define _PS256_CONST_TYPE(Name, Type, Val) \ - static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } - -_PS256_CONST(1 , 1.0f); +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} + +_PS256_CONST(1, 1.0f); _PS256_CONST(0p5, 0.5f); /* the smallest non denormalized float number */ _PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); @@ -76,14 +79,14 @@ _PI32_CONST256(0x7f, 0x7f); _PS256_CONST(cephes_SQRTHF, 0.707106781186547524); _PS256_CONST(cephes_log_p0, 7.0376836292E-2); -_PS256_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS256_CONST(cephes_log_p1, -1.1514610310E-1); _PS256_CONST(cephes_log_p2, 1.1676998740E-1); -_PS256_CONST(cephes_log_p3, - 1.2420140846E-1); -_PS256_CONST(cephes_log_p4, + 1.4249322787E-1); -_PS256_CONST(cephes_log_p5, - 1.6668057665E-1); -_PS256_CONST(cephes_log_p6, + 2.0000714765E-1); -_PS256_CONST(cephes_log_p7, - 2.4999993993E-1); -_PS256_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS256_CONST(cephes_log_p3, -1.2420140846E-1); +_PS256_CONST(cephes_log_p4, +1.4249322787E-1); +_PS256_CONST(cephes_log_p5, -1.6668057665E-1); +_PS256_CONST(cephes_log_p6, +2.0000714765E-1); +_PS256_CONST(cephes_log_p7, -2.4999993993E-1); +_PS256_CONST(cephes_log_p8, +3.3333331174E-1); _PS256_CONST(cephes_log_q1, -2.12194440e-4); _PS256_CONST(cephes_log_q2, 0.693359375); @@ -94,50 +97,51 @@ typedef union imm_xmm_union { v4si xmm[2]; } imm_xmm_union; -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \ - imm_xmm_union u __attribute__((aligned(32))); \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ -} - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \ +#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ + { \ imm_xmm_union u __attribute__((aligned(32))); \ - u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \ + u.imm = imm_; \ + xmm0_ = u.xmm[0]; \ + xmm1_ = u.xmm[1]; \ } +#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ + { \ + imm_xmm_union u __attribute__((aligned(32))); \ + u.xmm[0] = xmm0_; \ + u.xmm[1] = xmm1_; \ + imm_ = u.imm; \ + } -#define AVX2_BITOP_USING_SSE2(fn) \ -static inline v8si avx2_mm256_##fn(v8si x, int a) \ -{ \ - /* use SSE2 instruction to perform the bitop AVX2 */ \ - v4si x1, x2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1,a); \ - x2 = _mm_##fn(x2,a); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return(ret); \ -} +#define AVX2_BITOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, int a) { \ + /* use SSE2 instruction to perform the bitop AVX2 */ \ + v4si x1, x2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + x1 = _mm_##fn(x1, a); \ + x2 = _mm_##fn(x2, a); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } //#warning "Using SSE2 to perform AVX2 bitshift ops" AVX2_BITOP_USING_SSE2(slli_epi32) AVX2_BITOP_USING_SSE2(srli_epi32) -#define AVX2_INTOP_USING_SSE2(fn) \ -static inline v8si avx2_mm256_##fn(v8si x, v8si y) \ 
-{ \ - /* use SSE2 instructions to perform the AVX2 integer operation */ \ - v4si x1, x2; \ - v4si y1, y2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1,y1); \ - x2 = _mm_##fn(x2,y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return(ret); \ -} +#define AVX2_INTOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \ + /* use SSE2 instructions to perform the AVX2 integer operation */ \ + v4si x1, x2; \ + v4si y1, y2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + COPY_IMM_TO_XMM(y, y1, y2); \ + x1 = _mm_##fn(x1, y1); \ + x2 = _mm_##fn(x2, y2); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } //#warning "Using SSE2 to perform AVX2 integer ops" AVX2_INTOP_USING_SSE2(and_si128) @@ -157,84 +161,83 @@ AVX2_INTOP_USING_SSE2(add_epi32) #define avx2_mm256_add_epi32 _mm256_add_epi32 #endif /* __AVX2__ */ - -/* natural logarithm computed for 8 simultaneous float +/* natural logarithm computed for 8 simultaneous float return NaN for x <= 0 */ v8sf log256_ps(v8sf x) { v8si imm0; - v8sf one = *(v8sf*)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; - //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); - x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */ + x = _mm256_max_ps( + x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ // can be done with AVX2 imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); /* keep only the fractional part */ - x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask); - x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); // this is again another AVX2 instruction - imm0 = avx2_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); v8sf e = _mm256_cvtepi32_ps(imm0); e = _mm256_add_ps(e, one); - /* part2: + /* part2: if( x < SQRTHF ) { e -= 1; x = x + x - 1.0; } else { x = x - 1.0; } */ - //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); - v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS); + // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); v8sf tmp = _mm256_and_ps(x, mask); x = _mm256_sub_ps(x, one); e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); x = _mm256_add_ps(x, tmp); - v8sf z = _mm256_mul_ps(x,x); + v8sf z = _mm256_mul_ps(x, x); - v8sf y = *(v8sf*)_ps256_cephes_log_p0; + v8sf y = *(v8sf *)_ps256_cephes_log_p0; y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); y = _mm256_mul_ps(y, x); - y = 
_mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); y = _mm256_mul_ps(y, x); y = _mm256_mul_ps(y, z); - - tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1); - y = _mm256_add_ps(y, tmp); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); - tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); + tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); y = _mm256_sub_ps(y, tmp); - tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); x = _mm256_add_ps(x, y); x = _mm256_add_ps(x, tmp); - x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN return x; } -_PS256_CONST(exp_hi, 88.3762626647949f); -_PS256_CONST(exp_lo, -88.3762626647949f); +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); _PS256_CONST(cephes_LOG2EF, 1.44269504088896341); _PS256_CONST(cephes_exp_C1, 0.693359375); @@ -250,45 +253,45 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1); v8sf exp256_ps(v8sf x) { v8sf tmp = _mm256_setzero_ps(), fx; v8si imm0; - v8sf one = *(v8sf*)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; - x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); - x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); + x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); /* express exp(x) as exp(g + n*log(2)) */ - fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); - fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); + fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); /* how to perform a floorf with SSE: just below */ - //imm0 = _mm256_cvttps_epi32(fx); - //tmp = _mm256_cvtepi32_ps(imm0); - + // imm0 = _mm256_cvttps_epi32(fx); + // tmp = _mm256_cvtepi32_ps(imm0); + tmp = _mm256_floor_ps(fx); /* if greater, subtract 1 */ - //v8sf mask = _mm256_cmpgt_ps(tmp, fx); - v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + // v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); mask = _mm256_and_ps(mask, one); fx = _mm256_sub_ps(tmp, mask); - tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); - v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); + tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); x = _mm256_sub_ps(x, tmp); x = _mm256_sub_ps(x, z); - z = _mm256_mul_ps(x,x); - - v8sf y = *(v8sf*)_ps256_cephes_exp_p0; + z = _mm256_mul_ps(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_exp_p0; y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, x); y = _mm256_add_ps(y, one); @@ -296,7 +299,7 @@ v8sf exp256_ps(v8sf x) { /* build 2^n */ imm0 = 
_mm256_cvttps_epi32(fx); // another two AVX2 instructions - imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); imm0 = avx2_mm256_slli_epi32(imm0, 23); v8sf pow2n = _mm256_castsi256_ps(imm0); y = _mm256_mul_ps(y, pow2n); @@ -307,13 +310,12 @@ _PS256_CONST(minus_cephes_DP1, -0.78515625); _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); _PS256_CONST(sincof_p0, -1.9515295891E-4); -_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p1, 8.3321608736E-3); _PS256_CONST(sincof_p2, -1.6666654611E-1); -_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); _PS256_CONST(coscof_p1, -1.388731625493765E-003); -_PS256_CONST(coscof_p2, 4.166664568298827E-002); -_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI - +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI /* evaluation of 8 sines at once using AVX intrinsics @@ -327,7 +329,7 @@ _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI surprising but correct result. */ -v8sf sin256_ps(v8sf x) { // any x +v8sf sin256_ps(v8sf x) { // any x v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; v8si imm0, imm2; @@ -338,78 +340,78 @@ v8sf sin256_ps(v8sf x) { // any x sign_bit = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); - + sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); + /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); - /* - Here we start a series of integer operations, which are in the - realm of AVX2. - If we don't have AVX, let's perform them using SSE2 directives - */ +/* + Here we start a series of integer operations, which are in the + realm of AVX2. 
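+ (AVX1 widened only floating-point operations to 256 bits; most 256-bit
+ integer instructions arrived with AVX2, which is why the fallback below
+ splits each v8si into two v4si halves and processes them with 128-bit
+ SSE2 instructions.)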
+ If we don't have AVX, let's perform them using SSE2 directives +*/ #ifdef __AVX2__ /* store the integer part of y in mm0 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ // another two AVX2 instruction - imm2 = avx2_mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); y = _mm256_cvtepi32_ps(imm2); /* get the swap sign flag */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_4); + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); imm0 = avx2_mm256_slli_epi32(imm0, 29); - /* get the polynom selection mask + /* get the polynom selection mask there is one polynom for 0 <= x <= Pi/4 and another one for Pi/4 #include "hl_functions.h" namespace hppl { - extern __m256 exp(__m256 a); +extern __m256 exp(__m256 a); - __m256 relu(const __m256 a) { - __m256 tmp = _mm256_set1_ps(0.0f); - return _mm256_max_ps(a, tmp); - } +__m256 relu(const __m256 a) { + __m256 tmp = _mm256_set1_ps(0.0f); + return _mm256_max_ps(a, tmp); +} - __m256 sigmoid(const __m256 a) { - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); - __m256 tmp = _mm256_max_ps(a, min); - tmp = _mm256_min_ps(tmp, max); - tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); - tmp = exp(tmp); - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); - tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); - return tmp; - } +__m256 sigmoid(const __m256 a) { + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 tmp = _mm256_max_ps(a, min); + tmp = _mm256_min_ps(tmp, max); + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); + tmp = exp(tmp); + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); + return tmp; +} - __m256 tanh(const __m256 a) { - __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); - tmp = exp(tmp); - return _mm256_sub_ps( - _mm256_div_ps(_mm256_set1_ps(2.0f), - _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), _mm256_set1_ps(1.0f)); - } +__m256 tanh(const __m256 a) { + __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); + __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); + tmp = _mm256_min_ps(tmp, max); + tmp = exp(tmp); + return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), + _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), + _mm256_set1_ps(1.0f)); +} - __m256 linear(const __m256 a) { - return a; - } +__m256 linear(const __m256 a) { return a; } - __m256 relu(const __m256 a, const __m256 b) { - return _mm256_mul_ps(a, +__m256 relu(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), - _mm256_set1_ps(1.0f))); - } + _mm256_set1_ps(1.0f))); +} - __m256 sigmoid(const __m256 a, const __m256 b) { - return _mm256_mul_ps(_mm256_mul_ps(a, b), - _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); - } +__m256 sigmoid(const __m256 a, const __m256 b) { + return _mm256_mul_ps(_mm256_mul_ps(a, b), + _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); +} - __m256 tanh(const __m256 a, const __m256 b) { - return _mm256_mul_ps(a, - _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); - } +__m256 tanh(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); +} - __m256 linear(const __m256 a, const __m256 b) { - return a; - } +__m256 linear(const __m256 a, const __m256 b) { return a; } } // 
namespace hppl diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc index 3fd6b278d053714a6b6f0fe33831a32e2c64e3ae..af00f352e536bf342e15315d1f6804225b87eb0b 100644 --- a/paddle/cuda/src/hl_cpu_functions.cc +++ b/paddle/cuda/src/hl_cpu_functions.cc @@ -12,44 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "hl_functions.h" namespace hppl { - real relu(const real a) { - return a > 0.0f ? a : 0.0f; - } +real relu(const real a) { return a > 0.0f ? a : 0.0f; } - real sigmoid(const real a) { - const real min = SIGMOID_THRESHOLD_MIN; - const real max = SIGMOID_THRESHOLD_MAX; - real tmp = (a < min) ? min : ((a > max) ? max : a); - return 1.0 / (1.0 + exp(-tmp)); - } +real sigmoid(const real a) { + const real min = SIGMOID_THRESHOLD_MIN; + const real max = SIGMOID_THRESHOLD_MAX; + real tmp = (a < min) ? min : ((a > max) ? max : a); + return 1.0 / (1.0 + exp(-tmp)); +} - real tanh(const real a) { - return (2.0 / (1.0 + exp(-2.0*a))) - 1.0; - } +real tanh(const real a) { + real tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} - real linear(const real a) { - return a; - } +real linear(const real a) { return a; } - real relu(const real a, const real b) { - return a * (b > 0.0f ? 1.0f : 0.0f); - } +real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); } - real sigmoid(const real a, const real b) { - return a * b * (1 - b); - } +real sigmoid(const real a, const real b) { return a * b * (1 - b); } - real tanh(const real a, const real b) { - return a * (1.0f - b * b); - } +real tanh(const real a, const real b) { return a * (1.0f - b * b); } - real linear(const real a, const real b) { - return a; - } +real linear(const real a, const real b) { return a; } } // namespace hppl diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index abac83a3e04472fe25bdbe662427aea56c096ad4..ae387a8bc0e0791995810df9e5f2556264d869b1 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -152,7 +152,7 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, const int ksizeW, const int ksizeH, const int strideH, const int strideW, const int offsetH, const int offsetW, - real* tgtData) { + real* tgtData, const int tgtStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -173,7 +173,9 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, maxval = inputData[h * width + w]; } } - tgtData[index] = maxval; + int tgtIndex = index % (pooledW * pooledH * channels) + + frameNum * tgtStride; + tgtData[tgtIndex] = maxval; } } @@ -184,7 +186,7 @@ void hl_maxpool_forward(const int frameCnt, const real* inputData, const int sizeX, const int sizeY, const int strideH, const int strideW, const int paddingH, const int paddingW, - real* tgtData) { + real* tgtData, const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -194,7 +196,7 @@ void hl_maxpool_forward(const int frameCnt, const real* inputData, KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>> (num_kernels, inputData, channels, height, width, pooledH, pooledW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData); + paddingH, paddingW, tgtData, tgtStride); 
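+ // The new tgtStride argument is the per-frame stride of tgtData in
+ // elements; it can exceed pooledH * pooledW * channels when the pooled
+ // result occupies a sub-block of a wider output matrix. Each kernel
+ // thread therefore rebuilds its destination index as
+ //   tgtIndex = index % (pooledW * pooledH * channels) + frameNum * tgtStride
+ // instead of storing to tgtData[index] directly.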
CHECK_SYNC("hl_maxpool_forward failed"); } @@ -207,7 +209,7 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, const int strideH, const int strideW, const int padH, const int padW, real scaleA, real scaleB, - real* targetGrad) { + real* targetGrad, const int outStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { // find out the local index @@ -223,8 +225,8 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData, int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0; real gradient = 0; real input = inputData[index]; - outData += (frameNum * channels + offsetC) * pooledH * pooledW; - outGrad += (frameNum * channels + offsetC) * pooledH * pooledW; + outData += (frameNum * outStride + offsetC * pooledH * pooledW); + outGrad += (frameNum * outStride + offsetC * pooledH * pooledW); for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { if (input == outData[ph * pooledW + pw]) { @@ -246,7 +248,7 @@ void hl_maxpool_backward(const int frameCnt, const real* inputData, const int strideH, const int strideW, const int paddingH, const int paddingW, real scaleA, real scaleB, - real* targetGrad) { + real* targetGrad, const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -257,7 +259,7 @@ void hl_maxpool_backward(const int frameCnt, const real* inputData, strideH, strideW, paddingH, paddingW, scaleA, scaleB, - targetGrad); + targetGrad, outStride); CHECK_SYNC("hl_maxpool_backward"); } @@ -268,7 +270,7 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData, const int sizeX, const int sizeY, const int strideH, const int strideW, const int padH, const int padW, - real* tgtData) { + real* tgtData, const int tgtStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -293,7 +295,9 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData, aveval += inputData[h * width + w]; } } - tgtData[index] = aveval / pool_size; + int tgtIndex = index % (pooledW * pooledH * channels) + + frameNum * tgtStride; + tgtData[tgtIndex] = aveval / pool_size; } } @@ -303,14 +307,15 @@ void hl_avgpool_forward(const int frameCnt, const real* inputData, const int pooledH, const int pooledW, const int sizeX, const int sizeY, const int strideH, const int strideW, - const int paddingH, const int paddingW, real* tgtData) { + const int paddingH, const int paddingW, + real* tgtData, const int tgtStride) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>> (num_kernels, inputData, channels, height, width, pooledH, pooledW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, tgtData); + paddingH, paddingW, tgtData, tgtStride); CHECK_SYNC("hl_avgpool_forward failed"); } @@ -322,7 +327,7 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, const int strideH, const int strideW, const int padH, const int padW, real scaleA, real scaleB, - real* tgtGrad) { + real* tgtGrad, const int outStride) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int offsetW = index % width + padW; @@ -335,7 +340,8 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad, int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0; int pwend = offsetW >= 0 ? 
min(offsetW / strideW + 1, pooledW) : 0; real gradient = 0; - outGrad += (frameNum * channels + offsetC) * pooledH * pooledW; + outGrad += (frameNum * outStride + offsetC * pooledH * pooledW); + for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { @@ -360,7 +366,7 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad, const int strideH, const int strideW, const int paddingH, const int paddingW, real scaleA, real scaleB, - real* backGrad) { + real* backGrad, const int outStride) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -370,7 +376,7 @@ void hl_avgpool_backward(const int frameCnt, const real* outGrad, strideH, strideW, paddingH, paddingW, scaleA, scaleB, - backGrad); + backGrad, outStride); CHECK_SYNC("hl_avgpool_backward failed"); } @@ -522,7 +528,7 @@ void hl_CMRNorm_backward(size_t frameCnt, const real* inV, size_t height, size_t width, size_t sizeX, real alpha, real beta) { size_t threadsNum = frameCnt * height * width; - size_t blocksX = (threadsNum + 1024 -1) / 1024; + size_t blocksX = (threadsNum + 1024 - 1) / 1024; size_t blocksY = 1; dim3 threads(1024, 1); dim3 grid(blocksX, blocksY); @@ -531,3 +537,194 @@ void hl_CMRNorm_backward(size_t frameCnt, const real* inV, height, width, sizeX, alpha, beta, inDiff); CHECK_SYNC("hl_CMRNorm_backward"); } + +__global__ void KeBilinearInterpFw(const real* in, + const size_t inImgH, + const size_t inImgW, + const size_t inputH, + const size_t inputW, + real* out, + const size_t outImgH, + const size_t outImgW, + const size_t outputH, + const size_t outputW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + int nthreads = outputH * outputW; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < nthreads) { + int outIdH = tid / outputW; + int outIdW = tid % outputW; + int inImgSize = inputW / numChannels; + int outImgSize = outputW / numChannels; + int channelId = outIdW / outImgSize; + + int outImgIdy = (outIdW % outImgSize) / outImgW; + int inImgIdy = ratioH * outImgIdy; + int hId = (inImgIdy < inImgH - 1) ? 1 : 0; + real h1lambda = ratioH * outImgIdy - inImgIdy; + real h2lambda = 1.f - h1lambda; + + int outImgIdx = tid % outImgW; + int inImgIdx = ratioW * outImgIdx; + int wId = (inImgIdx < inImgW - 1) ? 
1 : 0; + real w1lambda = ratioW * outImgIdx - inImgIdx; + real w2lambda = 1.f - w1lambda; + + const real* inPos = + &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + + // bilinear interpolation + out[outIdH * outputW + outIdW] = + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) + + h1lambda * (w2lambda * inPos[hId * inImgW] + w1lambda * inPos[hId * inImgW + wId]); + } +} + +void hl_bilinear_forward(const real* inData, + const size_t inImgH, + const size_t inImgW, + const size_t inputH, + const size_t inputW, + real* outData, + const size_t outImgH, + const size_t outImgW, + const size_t outputH, + const size_t outputW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + int threadNum = outputH * outputW; + int blocks = (threadNum + 1024 - 1) / 1024; + + KeBilinearInterpFw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( + inData, inImgH, inImgW, inputH, inputW, outData, outImgH, + outImgW, outputH, outputW, numChannels, ratioH, ratioW); + CHECK_SYNC("hl_bilinear_forward failed"); +} + +__global__ void KeBilinearInterpBw(real* in, + const size_t inImgH, + const size_t inImgW, + const size_t inputH, + const size_t inputW, + const real* out, + const size_t outImgH, + const size_t outImgW, + const size_t outputH, + const size_t outputW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + int nthreads = outputH * outputW; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < nthreads) { + int outIdH = tid / outputW; + int outIdW = tid % outputW; + int inImgSize = inputW / numChannels; + int outImgSize = outputW / numChannels; + int channelId = outIdW / outImgSize; + + int outImgIdy = (outIdW % outImgSize) / outImgW; + int inImgIdy = ratioH * outImgIdy; + int hId = (inImgIdy < inImgH - 1) ? 1 : 0; + real h1lambda = ratioH * outImgIdy - inImgIdy; + real h2lambda = 1.f - h1lambda; + + int outImgIdx = tid % outImgW; + int inImgIdx = ratioW * outImgIdx; + int wId = (inImgIdx < inImgW - 1) ? 
1 : 0; + real w1lambda = ratioW * outImgIdx - inImgIdx; + real w2lambda = 1.f - w1lambda; + + real* inPos = + &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx]; + const real* outPos = &out[outIdH * outputW + outIdW]; + atomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]); + atomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]); + atomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]); + atomicAdd(&inPos[hId * inImgW + wId], h1lambda * w1lambda * outPos[0]); + } +} + +void hl_bilinear_backward(real* inGrad, + const size_t inImgH, + const size_t inImgW, + const size_t inputH, + const size_t inputW, + const real* outGrad, + const size_t outImgH, + const size_t outImgW, + const size_t outputH, + const size_t outputW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + int threadNum = outputH * outputW; + int blocks = (threadNum + 1024 - 1) / 1024; + + KeBilinearInterpBw<<< blocks, 1024, 0, STREAM_DEFAULT>>>( + inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, + outImgW, outputH, outputW, numChannels, ratioH, ratioW); + CHECK_SYNC("hl_bilinear_backward failed"); +} + +__global__ void maxoutFpCompute(size_t nthreads, const real * inData, + real * outData, int* idData, + size_t size, size_t featLen, size_t groups) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index < nthreads) { + size_t batch_idx = index / size; + size_t i = index % size; + size_t channel_idx = i / featLen; + size_t feat_idx = i % featLen; + size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx; + real max = inData[data_idx]; + int maxId = 0; + for (size_t g = 1; g < groups; ++g) { + real tmp = inData[data_idx + g * featLen]; + if (tmp > max) { + max = tmp; + maxId = g; + } + } + outData[index] = max; + idData[index] = maxId; + } +} + +void hl_maxout_forward(const real* inData, real* outData, + int* idData, size_t batchSize, size_t size, + size_t featLen, size_t groups) { + int num_kernels = size * batchSize; + int blocks = (num_kernels + 1024 - 1) / 1024; + maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>( + num_kernels, inData, outData, idData, size, featLen, groups); + CHECK_SYNC("hl_maxout_forward failed"); +} + +__global__ void maxoutBpCompute(size_t nthreads, real* inGrad, + const real* outGrad, const int* idData, + size_t size, size_t featLen, size_t groups) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index < nthreads) { + size_t batch_idx = index / size; + size_t i = index % size; + size_t channel_idx = i / featLen; + size_t feat_idx = i % featLen; + size_t newIndex = batch_idx * size; + size_t gradIdx = (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; + (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i]; + } +} + +void hl_maxout_backward(real* inGrad, const real* outGrad, + const int* idData, size_t batchSize, size_t size, + size_t featLen, size_t groups) { + int num_kernels = size * batchSize; + int blocks = (num_kernels + 1024 - 1) / 1024; + maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>( + num_kernels, inGrad, outGrad, idData, size, featLen, groups); + CHECK_SYNC("hl_maxout_backward failed"); +} diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc index b3c9001ba397361376ee191081a71863b2e5a578..e8ba232d44b3f66254d4749d4abbcfbe46d1fd0e 100644 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ b/paddle/cuda/src/hl_cuda_cublas.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #include #include +#include "hl_cuda.h" #include "hl_cuda_cublas.h" #include "hl_thread.ph" #include "hl_dso_loader.h" @@ -23,7 +23,7 @@ limitations under the License. */ namespace dynload { std::once_flag cublas_dso_flag; -void* cublas_dso_handle = nullptr; +void *cublas_dso_handle = nullptr; /** * The following macro definition can generate structs @@ -33,31 +33,30 @@ void* cublas_dso_handle = nullptr; * note: default dynamic linked libs */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - typedef cublasStatus_t (*cublasFunc)(Args...); \ - std::call_once(cublas_dso_flag, GetCublasDsoHandle, \ - &cublas_dso_handle); \ - void* p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; // struct DynLoad__##__name #else -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cublasStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ } __name; // struct DynLoad__##__name #endif -#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ - DYNAMIC_LOAD_CUBLAS_WRAP(__name) +#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name) // include all needed cublas functions in HPPL +// clang-format off #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasSgemv) \ __macro(cublasDgemv) \ @@ -75,6 +74,10 @@ DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched) DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched) DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched) DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched) CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) #undef DYNAMIC_LOAD_CUBLAS_WRAP @@ -83,37 +86,41 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) } /* namespace dynload */ - +// clang-format on #ifndef PADDLE_TYPE_DOUBLE -#define CUBLAS_GEAM dynload::cublasSgeam -#define CUBLAS_GEMV dynload::cublasSgemv -#define CUBLAS_GEMM dynload::cublasSgemm +#define CUBLAS_GEAM dynload::cublasSgeam +#define CUBLAS_GEMV dynload::cublasSgemv +#define CUBLAS_GEMM dynload::cublasSgemm +#define CUBLAS_GETRF dynload::cublasSgetrfBatched +#define CUBLAS_GETRI dynload::cublasSgetriBatched #else -#define CUBLAS_GEAM dynload::cublasDgeam -#define CUBLAS_GEMV dynload::cublasDgemv -#define CUBLAS_GEMM dynload::cublasDgemm +#define CUBLAS_GEAM dynload::cublasDgeam +#define CUBLAS_GEMV dynload::cublasDgemv +#define CUBLAS_GEMM dynload::cublasDgemm +#define CUBLAS_GETRF dynload::cublasDgetrfBatched +#define CUBLAS_GETRI dynload::cublasDgetriBatched #endif -const char* hl_cublas_get_error_string(cublasStatus_t status) { - switch(status) { - case CUBLAS_STATUS_NOT_INITIALIZED: - return "[cublas 
status]: not initialized"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "[cublas status]: allocate failed"; - case CUBLAS_STATUS_INVALID_VALUE: - return "[cublas status]: invalid value"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "[cublas status]: arch mismatch"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "[cublas status]: mapping error"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "[cublas status]: execution failed"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "[cublas status]: internal error"; - case CUBLAS_STATUS_SUCCESS: - return "[cublas status]: success"; - default: - return "[cublas status]: unknown error"; +const char *hl_cublas_get_error_string(cublasStatus_t status) { + switch (status) { + case CUBLAS_STATUS_NOT_INITIALIZED: + return "[cublas status]: not initialized"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "[cublas status]: allocate failed"; + case CUBLAS_STATUS_INVALID_VALUE: + return "[cublas status]: invalid value"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "[cublas status]: arch mismatch"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "[cublas status]: mapping error"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "[cublas status]: execution failed"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "[cublas status]: internal error"; + case CUBLAS_STATUS_SUCCESS: + return "[cublas status]: success"; + default: + return "[cublas status]: unknown error"; } } @@ -122,27 +129,21 @@ const char* hl_cublas_get_error_string(cublasStatus_t status) { * support << operator for more details error info. */ cublasStatus_t g_cublasStat; -#define CHECK_CUBLAS(cublas_func) \ - g_cublasStat = cublas_func; \ - CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \ - << "Cublas Error: " \ - << hl_cublas_get_error_string(g_cublasStat) \ - << " " +#define CHECK_CUBLAS(cublas_func) \ + g_cublasStat = cublas_func; \ + CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \ + << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " " void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) { CHECK_CUBLAS(dynload::cublasCreate(cublas_handle)) - << "[cublas init] Cublas create handle faild!"; + << "[cublas init] Cublas create handle failed!"; CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream)) - << "[cublas init] Cublas set stream faild!"; + << "[cublas init] Cublas set stream failed!"; } -void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN, - int lda, - int ldc) { +void hl_matrix_transpose( + real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) { real alpha = 1.0; real beta = 0.0; @@ -150,11 +151,18 @@ void hl_matrix_transpose(real *A_d, CHECK_NOTNULL(C_d); CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle, - CUBLAS_OP_T, CUBLAS_OP_N, - dimM, dimN, - &alpha, A_d, lda, - &beta, nullptr, dimM, - C_d, ldc)); + CUBLAS_OP_T, + CUBLAS_OP_N, + dimM, + dimN, + &alpha, + A_d, + lda, + &beta, + nullptr, + dimM, + C_d, + ldc)); CHECK_SYNC("hl_matrix_transpose failed"); } @@ -162,12 +170,72 @@ void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) { hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM); } -void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { + /* Solve Ax = I */ + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + /* Step 1: Compute the LU decomposition of matrix A */ + real **inout_h = &A_d; + real **inout_d = (real **)hl_malloc_device(sizeof(real *)); + hl_memcpy(inout_d, inout_h, sizeof(real *)); + + int *pivot_d = (int 
*)hl_malloc_device(dimN * sizeof(int)); + int *info_d = (int *)t_resource.gpu_mem; + + /* Note: cublasSgetrfBatched is designed to factorize a batch of many + small matrices at once; here it is invoked with a batch size of one, + so restructuring this call could improve performance. + */ + CHECK_CUBLAS( + CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1)); + + int info_h; + hl_memcpy(&info_h, info_d, sizeof(int)); + if (info_h != 0) { + LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n"; + } + + /* Step 2: Compute the inverse of the matrix given its LU decomposition */ + real **out_h = &C_d; + real **out_d = (real **)hl_malloc_device(sizeof(real *)); + hl_memcpy(out_d, out_h, sizeof(real *)); + + CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle, + dimN, + (const real **)inout_d, + lda, + pivot_d, + out_d, + ldc, + info_d, + 1)); + + hl_memcpy(&info_h, info_d, sizeof(int)); + if (info_h != 0) { + LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n"; + } + + hl_free_mem_device(inout_d); + hl_free_mem_device(pivot_d); + hl_free_mem_device(out_d); + + CHECK_SYNC("hl_matrix_inverse failed"); +} + +void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta, - int lda, int ldb, int ldc) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta, + int lda, + int ldb, + int ldc) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); @@ -175,8 +243,8 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa, if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) { int m = (transa == HPPL_OP_N) ? dimM : dimK; int n = (transa == HPPL_OP_N) ? dimK : dimM; - hl_matrix_mul_vector(A_d, transa, B_d, C_d, m, n, - alpha, beta, lda, ldb, ldc); + hl_matrix_mul_vector( + A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc); return; } @@ -184,8 +252,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa, int m = (transb == HPPL_OP_N) ? dimK : dimN; int n = (transb == HPPL_OP_N) ? dimN : dimK; hl_trans_op_t trans = (transb == HPPL_OP_N) ? 
HPPL_OP_T : HPPL_OP_N; - hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, - alpha, beta, ldb, 1, 1); + hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1); return; } @@ -194,26 +261,47 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa, stat = CUBLAS_GEMM(t_resource.handle, CUBLAS_OP_N, CUBLAS_OP_N, - dimN, dimM, dimK, - &alpha, B_d, ldb, - A_d, lda, - &beta, C_d, ldc); + dimN, + dimM, + dimK, + &alpha, + B_d, + ldb, + A_d, + lda, + &beta, + C_d, + ldc); } else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) { stat = CUBLAS_GEMM(t_resource.handle, CUBLAS_OP_N, CUBLAS_OP_T, - dimN, dimM, dimK, - &alpha, B_d, ldb, - A_d, lda, - &beta, C_d, ldc); + dimN, + dimM, + dimK, + &alpha, + B_d, + ldb, + A_d, + lda, + &beta, + C_d, + ldc); } else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) { stat = CUBLAS_GEMM(t_resource.handle, CUBLAS_OP_T, CUBLAS_OP_N, - dimN, dimM, dimK, - &alpha, B_d, ldb, - A_d, lda, - &beta, C_d, ldc); + dimN, + dimM, + dimK, + &alpha, + B_d, + ldb, + A_d, + lda, + &beta, + C_d, + ldc); } else { LOG(FATAL) << "parameter transa error!"; } @@ -221,24 +309,46 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_mul failed"); } -void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { int lda = (HPPL_OP_N == transa) ? dimK : dimM; int ldb = (HPPL_OP_N == transb) ? dimN : dimK; int ldc = dimN; - hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN, - dimK, alpha, beta, lda, ldb, ldc); + hl_matrix_mul(A_d, + transa, + B_d, + transb, + C_d, + dimM, + dimN, + dimK, + alpha, + beta, + lda, + ldb, + ldc); } -void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, - real *B_d, real *C_d, - int dimM, int dimN, - real alpha, real beta, - int lda, int incb, int incc) { +void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta, + int lda, + int incb, + int incc) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); @@ -247,21 +357,29 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, if (HPPL_OP_N == trans) { stat = CUBLAS_GEMV(t_resource.handle, CUBLAS_OP_T, - dimN, dimM, + dimN, + dimM, &alpha, - A_d, lda, - B_d, incb, + A_d, + lda, + B_d, + incb, &beta, - C_d, incc); + C_d, + incc); } else if (HPPL_OP_T == trans) { stat = CUBLAS_GEMV(t_resource.handle, CUBLAS_OP_N, - dimN, dimM, + dimN, + dimM, &alpha, - A_d, lda, - B_d, incb, + A_d, + lda, + B_d, + incb, &beta, - C_d, incc); + C_d, + incc); } else { LOG(FATAL) << "parameter transa error!"; } @@ -270,10 +388,14 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, CHECK_SYNC("hl_matrix_mul_vector"); } -void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, - real *B_d, real *C_d, - int dimM, int dimN, - real alpha, real beta) { - hl_matrix_mul_vector(A_d, trans, B_d, C_d, dimM, dimN, - alpha, beta, dimN, 1, 1); +void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta) { + hl_matrix_mul_vector( + A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1); } diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index b215c0f6e33a18630f41668f97e5e06ad6b29800..9d4ff08a78d641896e946e9bf04590d4ba93350f 100644 --- 
a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "hl_cuda_cudnn.h" @@ -20,6 +19,12 @@ limitations under the License. */ #include "hl_thread.ph" #include "hl_dso_loader.h" #include "paddle/utils/Logging.h" +#include "paddle/utils/CommandLineParser.h" + +P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, + 4096, + "Specify cuDNN max workspace limit, in units MB, " + "4096MB=4GB by default."); namespace dynload { @@ -36,71 +41,34 @@ void* cudnn_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cudnnStatus_t operator()(Args... args) { \ - typedef cudnnStatus_t (*cudnnFunc)(Args...); \ - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \ - &cudnn_dso_handle); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ -struct DynLoad__cudnnGetVersion { - template - size_t operator()(Args... args) { - typedef size_t (*cudnnFunc)(Args...); - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, - &cudnn_dso_handle); - void* p_name = dlsym(cudnn_dso_handle, "cudnnGetVersion"); - return reinterpret_cast(p_name)(args...); - } -} cudnnGetVersion; /* struct DynLoad__##__name */ - -struct DynLoad__cudnnGetErrorString { - template - const char* operator()(Args... args) { - typedef const char* (*cudnnFunc)(Args...); - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, - &cudnn_dso_handle); - void* p_name = dlsym(cudnn_dso_handle, "cudnnGetErrorString"); - return reinterpret_cast(p_name)(args...); - } -} cudnnGetErrorString; /* struct DynLoad__##__name */ - - #else -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cudnnStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ } __name; /* struct DynLoad__##__name */ -struct DynLoad__cudnnGetVersion { - template - size_t operator()(Args... args) { - return cudnnGetVersion(args...); - } -} cudnnGetVersion; /* struct DynLoad__##__name */ - -struct DynLoad__cudnnGetErrorString { - template - const char* operator()(Args... 
args) { - return cudnnGetErrorString(args...); - } -} cudnnGetErrorString; /* struct DynLoad__##__name */ - #endif /** * include all needed cudnn functions in HPPL * different cudnn version has different interfaces **/ +// clang-format off #define CUDNN_DNN_ROUTINE_EACH(__macro) \ __macro(cudnnSetTensor4dDescriptor) \ __macro(cudnnSetTensor4dDescriptorEx) \ @@ -128,7 +96,9 @@ struct DynLoad__cudnnGetErrorString { __macro(cudnnPoolingForward) \ __macro(cudnnPoolingBackward) \ __macro(cudnnSoftmaxBackward) \ - __macro(cudnnSoftmaxForward) + __macro(cudnnSoftmaxForward) \ + __macro(cudnnGetVersion) \ + __macro(cudnnGetErrorString) CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ @@ -171,58 +141,53 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) #endif #undef CUDNN_DNN_ROUTINE_EACH - +// clang-format on } /* namespace dynload */ /** * Check build-in cudnn function using glog and it **does not** * support << operator for more details error info. */ -#define CHECK_CUDNN(cudnnFunc) \ - do { \ - cudnnStatus_t cudnnStat = cudnnFunc; \ - CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \ - << "Cudnn Error: " \ - << dynload::cudnnGetErrorString(cudnnStat); \ +#define CHECK_CUDNN(cudnnFunc) \ + do { \ + cudnnStatus_t cudnnStat = cudnnFunc; \ + CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \ + << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \ } while (0) bool g_is_libcudnn_init = false; int g_cudnn_lib_version = 0; -void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) -{ - CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc)); +void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) { + CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc)); } -void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) -{ - size_t cudnn_dso_ver = dynload::cudnnGetVersion(); - size_t cudnn_dso_major = cudnn_dso_ver / 1000; - size_t cudnn_cuh_major = CUDNN_VERSION / 1000; - - // Compare cudnn header version with that of cudnn.so. - CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) || - (cudnn_cuh_major == cudnn_dso_major)) - << "[cudnn init] libcudnn v" << cudnn_dso_major << - " with header v" << cudnn_cuh_major << " unmatched!\n" - << "PaddlePaddle Requirement: " - << "(header v[2-3] with libcudnn v[2-3]) Or " - << "(header v4 with libcudnn v4) Or " - << "(header v5 with libcudnn v5)."; - - CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050)) - << "cudnn v5 requires cuda version >= 7.5"; - - CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle)); - CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream)); - - g_is_libcudnn_init = true; - g_cudnn_lib_version = cudnn_dso_ver; +void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) { + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + size_t cudnn_dso_major = cudnn_dso_ver / 1000; + size_t cudnn_cuh_major = CUDNN_VERSION / 1000; + + // Compare cudnn header version with that of cudnn.so. 
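+ // cudnnGetVersion() follows the CUDNN_VERSION encoding of
+ // major * 1000 + minor * 100 + patch (e.g. 5005 for cuDNN v5.0.5), so
+ // integer division by 1000 isolates the major version on both sides.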
+ CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) || + (cudnn_cuh_major == cudnn_dso_major)) + << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v" + << cudnn_cuh_major << " unmatched!\n" + << "PaddlePaddle Requirement: " + << "(header v[2-3] with libcudnn v[2-3]) Or " + << "(header v4 with libcudnn v4) Or " + << "(header v5 with libcudnn v5)."; + + CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050)) + << "cudnn v5 requires cuda version >= 7.5"; + + CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle)); + CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream)); + + g_is_libcudnn_init = true; + g_cudnn_lib_version = cudnn_dso_ver; } -int hl_get_cudnn_lib_version() { - return g_cudnn_lib_version; -} +int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; } void hl_conv_workspace(hl_tensor_descriptor input, hl_tensor_descriptor output, @@ -236,94 +201,91 @@ void hl_conv_workspace(hl_tensor_descriptor input, size_t* bwdFilterLimitBytes) { #if CUDNN_VERSION >= 4000 - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(filter); - CHECK_NOTNULL(conv); - - // Specify workspace limit directly - size_t memoryLimitBytes = 8 * 1024 * 1024; - - // cudnn convolution forward configuration - cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter); - cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - - CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm( - t_resource.cudnn_handle, - fwd_src_desc, - fwd_filter_desc, - fwd_conv_desc, - fwd_dest_desc, - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convFwdAlgo))); - - CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize( - t_resource.cudnn_handle, - fwd_src_desc, - fwd_filter_desc, - fwd_conv_desc, - fwd_dest_desc, - static_cast(*convFwdAlgo), - fwdLimitBytes)); - - // cudnn convolution backward data configuration - cudnnFilterDescriptor_t bwd_data_filter_desc = - GET_FILTER_DESCRIPTOR(filter); - cudnnTensorDescriptor_t bwd_data_diff_desc = - GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t bwd_data_grad_desc = - GET_TENSOR_DESCRIPTOR(input); - cudnnConvolutionDescriptor_t bwd_data_conv_desc = - GET_CONVOLUTION_DESCRIPTOR(conv); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm( - t_resource.cudnn_handle, - bwd_data_filter_desc, - bwd_data_diff_desc, - bwd_data_conv_desc, - bwd_data_grad_desc, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convBwdDataAlgo))); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( - t_resource.cudnn_handle, - bwd_data_filter_desc, - bwd_data_diff_desc, - bwd_data_conv_desc, - bwd_data_grad_desc, - static_cast(*convBwdDataAlgo), - bwdDataLimitBytes)); - - // cudnn convolution backward filter configuration - cudnnTensorDescriptor_t bwd_filter_src_desc = - GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t bwd_filter_diff_desc = - GET_TENSOR_DESCRIPTOR(output); - cudnnConvolutionDescriptor_t bwd_filter_conv_desc = - GET_CONVOLUTION_DESCRIPTOR(conv); - cudnnFilterDescriptor_t bwd_filter_grad_desc = - GET_FILTER_DESCRIPTOR(filter); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - t_resource.cudnn_handle, - bwd_filter_src_desc, - bwd_filter_diff_desc, - bwd_filter_conv_desc, - bwd_filter_grad_desc, - 
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convBwdFilterAlgo))); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( - t_resource.cudnn_handle, bwd_filter_src_desc, - bwd_filter_diff_desc, bwd_filter_conv_desc, - bwd_filter_grad_desc, - static_cast(*convBwdFilterAlgo), - bwdFilterLimitBytes)); + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(filter); + CHECK_NOTNULL(conv); + + // Specify workspace limit directly + size_t memoryLimitBytes = + (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; + + // cudnn convolution forward configuration + cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + + CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm( + t_resource.cudnn_handle, + fwd_src_desc, + fwd_filter_desc, + fwd_conv_desc, + fwd_dest_desc, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convFwdAlgo))); + + CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize( + t_resource.cudnn_handle, + fwd_src_desc, + fwd_filter_desc, + fwd_conv_desc, + fwd_dest_desc, + static_cast(*convFwdAlgo), + fwdLimitBytes)); + + // cudnn convolution backward data configuration + cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnConvolutionDescriptor_t bwd_data_conv_desc = + GET_CONVOLUTION_DESCRIPTOR(conv); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm( + t_resource.cudnn_handle, + bwd_data_filter_desc, + bwd_data_diff_desc, + bwd_data_conv_desc, + bwd_data_grad_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convBwdDataAlgo))); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + t_resource.cudnn_handle, + bwd_data_filter_desc, + bwd_data_diff_desc, + bwd_data_conv_desc, + bwd_data_grad_desc, + static_cast(*convBwdDataAlgo), + bwdDataLimitBytes)); + + // cudnn convolution backward filter configuration + cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnConvolutionDescriptor_t bwd_filter_conv_desc = + GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + t_resource.cudnn_handle, + bwd_filter_src_desc, + bwd_filter_diff_desc, + bwd_filter_conv_desc, + bwd_filter_grad_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convBwdFilterAlgo))); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + t_resource.cudnn_handle, + bwd_filter_src_desc, + bwd_filter_diff_desc, + bwd_filter_conv_desc, + bwd_filter_grad_desc, + static_cast(*convBwdFilterAlgo), + bwdFilterLimitBytes)); #endif } @@ -332,78 +294,75 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc, int batch_size, int feature_maps, int height, - int width) -{ - CHECK_NOTNULL(image_desc); + int width) { + CHECK_NOTNULL(image_desc); - cudnn_tensor_descriptor hl_desc = - 
(cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); - CHECK_NOTNULL(hl_desc); + cudnn_tensor_descriptor hl_desc = + (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); + CHECK_NOTNULL(hl_desc); #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); - - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor( - hl_desc->desc, - CUDNN_TENSOR_NCHW, - data_type, - batch_size, - feature_maps, - height, - width)); - - hl_desc->format = CUDNN_TENSOR_NCHW; - hl_desc->data_type = data_type; - hl_desc->batch_size = batch_size; - hl_desc->feature_maps = feature_maps; - hl_desc->height = height; - hl_desc->width = width; - - *image_desc = (hl_tensor_descriptor)hl_desc; + CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); + + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc, + CUDNN_TENSOR_NCHW, + data_type, + batch_size, + feature_maps, + height, + width)); + + hl_desc->format = CUDNN_TENSOR_NCHW; + hl_desc->data_type = data_type; + hl_desc->batch_size = batch_size; + hl_desc->feature_maps = feature_maps; + hl_desc->height = height; + hl_desc->width = width; + + *image_desc = (hl_tensor_descriptor)hl_desc; } void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) { - CHECK_NOTNULL(image_desc); + CHECK_NOTNULL(image_desc); - cudnn_tensor_descriptor hl_desc = - (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); - CHECK_NOTNULL(hl_desc); + cudnn_tensor_descriptor hl_desc = + (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); + CHECK_NOTNULL(hl_desc); #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); + CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); - hl_desc->data_type = data_type; + hl_desc->data_type = data_type; - *image_desc = (hl_tensor_descriptor)hl_desc; + *image_desc = (hl_tensor_descriptor)hl_desc; } void hl_tensor_reshape(hl_tensor_descriptor image_desc, int batch_size, int feature_maps, int height, - int width) -{ - const int stride_w = 1; - const int stride_h = width * stride_w; - const int stride_c = height * stride_h; - const int stride_n = feature_maps * stride_c; - return hl_tensor_reshape(image_desc, - batch_size, - feature_maps, - height, - width, - stride_n, - stride_c, - stride_h, - stride_w); + int width) { + const int stride_w = 1; + const int stride_h = width * stride_w; + const int stride_c = height * stride_h; + const int stride_n = feature_maps * stride_c; + return hl_tensor_reshape(image_desc, + batch_size, + feature_maps, + height, + width, + stride_n, + stride_c, + stride_h, + stride_w); } void hl_tensor_reshape(hl_tensor_descriptor image_desc, @@ -414,45 +373,42 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc, int nStride, int cStride, int hStride, - int wStride) -{ - CHECK_NOTNULL(image_desc); - - cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; - CHECK_NOTNULL(hl_desc->desc); - - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc, - hl_desc->data_type, - batch_size, - feature_maps, - height, - width, - nStride, - cStride, - 
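// A worked example of the packed NCHW strides that hl_tensor_reshape derives
// above (sketch with a hypothetical helper): element (n, c, h, w) of an
// N x C x H x W tensor sits at offset n*nStride + c*cStride + h*hStride + w.
struct NchwStrides { int n, c, h, w; };

NchwStrides packed_strides(int feature_maps, int height, int width) {
  NchwStrides s;
  s.w = 1;                   // neighbours along W are contiguous
  s.h = width * s.w;         // rows are one W apart
  s.c = height * s.h;        // feature maps are H*W apart
  s.n = feature_maps * s.c;  // images are C*H*W apart
  return s;                  // e.g. C=3, H=4, W=5 gives {60, 20, 5, 1}
}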
hStride, - wStride)); - - hl_desc->batch_size = batch_size; - hl_desc->feature_maps = feature_maps; - hl_desc->height = height; - hl_desc->width = width; + int wStride) { + CHECK_NOTNULL(image_desc); + + cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; + CHECK_NOTNULL(hl_desc->desc); + + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc, + hl_desc->data_type, + batch_size, + feature_maps, + height, + width, + nStride, + cStride, + hStride, + wStride)); + + hl_desc->batch_size = batch_size; + hl_desc->feature_maps = feature_maps; + hl_desc->height = height; + hl_desc->width = width; } -void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) -{ - CHECK_NOTNULL(image_desc); +void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) { + CHECK_NOTNULL(image_desc); - cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; - CHECK_NOTNULL(hl_desc->desc); + cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; + CHECK_NOTNULL(hl_desc->desc); - CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc)); + CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc)); - hl_desc->desc = NULL; + hl_desc->desc = NULL; - free(image_desc); + free(image_desc); } - void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, hl_pooling_mode_t mode, int height, @@ -460,99 +416,93 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, int height_padding, int width_padding, int stride_height, - int stride_width) -{ - cudnnPoolingMode_t cudnn_mode; - switch (mode) - { - case HL_POOLING_MAX: - cudnn_mode = CUDNN_POOLING_MAX; - break; - case HL_POOLING_AVERAGE: - cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - case HL_POOLING_AVERAGE_EXCLUDE_PADDING: - cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - break; - default: - LOG(FATAL) << "parameter mode error"; - } - - CHECK_NOTNULL(pooling_desc); - - cudnn_pooling_descriptor hl_pooling_desc = - (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor)); - CHECK_NOTNULL(hl_pooling_desc); - - CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc)); - - CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor( - hl_pooling_desc->desc, - cudnn_mode, + int stride_width) { + cudnnPoolingMode_t cudnn_mode; + switch (mode) { + case HL_POOLING_MAX: + cudnn_mode = CUDNN_POOLING_MAX; + break; + case HL_POOLING_AVERAGE: + cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + case HL_POOLING_AVERAGE_EXCLUDE_PADDING: + cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + break; + default: + LOG(FATAL) << "parameter mode error"; + } + + CHECK_NOTNULL(pooling_desc); + + cudnn_pooling_descriptor hl_pooling_desc = + (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor)); + CHECK_NOTNULL(hl_pooling_desc); + + CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc)); + + CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc, + cudnn_mode, #if CUDNN_VERSION >= 5000 - CUDNN_PROPAGATE_NAN, + CUDNN_PROPAGATE_NAN, #endif - height, - width, - height_padding, - width_padding, - stride_height, - stride_width)); - - hl_pooling_desc->mode = cudnn_mode; - hl_pooling_desc->window_height = height; - hl_pooling_desc->window_width = width; - hl_pooling_desc->stride_height = stride_height; - hl_pooling_desc->stride_width = stride_width; - - *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc; + height, + width, + height_padding, + width_padding, + 
stride_height, + stride_width)); + + hl_pooling_desc->mode = cudnn_mode; + hl_pooling_desc->window_height = height; + hl_pooling_desc->window_width = width; + hl_pooling_desc->stride_height = stride_height; + hl_pooling_desc->stride_width = stride_width; + + *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc; } -void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) -{ - CHECK_NOTNULL(pooling_desc); +void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) { + CHECK_NOTNULL(pooling_desc); - cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc; - CHECK_NOTNULL(hl_pooling->desc); + cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc; - CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc)); + CHECK_NOTNULL(hl_pooling->desc); + CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc)); - hl_pooling->desc = NULL; + hl_pooling->desc = NULL; - free(pooling_desc); + free(pooling_desc); } void hl_pooling_forward(hl_tensor_descriptor input, real* input_image, hl_tensor_descriptor output, real* output_image, - hl_pooling_descriptor pooling) -{ - cudnnPoolingDescriptor_t pooling_desc; - cudnnTensorDescriptor_t input_desc; - cudnnTensorDescriptor_t output_desc; - - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(pooling); - CHECK_NOTNULL(input_image); - CHECK_NOTNULL(output_image); - - real alpha = 1.0f; - real beta = 1.0f; - input_desc = ((cudnn_tensor_descriptor)input)->desc; - output_desc = ((cudnn_tensor_descriptor)output)->desc; - pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; - CHECK_CUDNN(dynload::cudnnPoolingForward( - t_resource.cudnn_handle, - pooling_desc, - &alpha, - input_desc, - input_image, - &beta, - output_desc, - output_image)); - CHECK_SYNC("hl_pooling_forward failed"); + hl_pooling_descriptor pooling) { + cudnnPoolingDescriptor_t pooling_desc; + cudnnTensorDescriptor_t input_desc; + cudnnTensorDescriptor_t output_desc; + + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(pooling); + CHECK_NOTNULL(input_image); + CHECK_NOTNULL(output_image); + + real alpha = 1.0f; + real beta = 1.0f; + input_desc = ((cudnn_tensor_descriptor)input)->desc; + output_desc = ((cudnn_tensor_descriptor)output)->desc; + pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; + CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle, + pooling_desc, + &alpha, + input_desc, + input_image, + &beta, + output_desc, + output_image)); + CHECK_SYNC("hl_pooling_forward failed"); } void hl_pooling_backward(hl_tensor_descriptor input, @@ -561,94 +511,87 @@ void hl_pooling_backward(hl_tensor_descriptor input, hl_tensor_descriptor output, real* output_image, real* output_image_grad, - hl_pooling_descriptor pooling) -{ - cudnnPoolingDescriptor_t pooling_desc; - cudnnTensorDescriptor_t input_desc; - cudnnTensorDescriptor_t output_desc; - - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(pooling); - CHECK_NOTNULL(input_image); - CHECK_NOTNULL(input_image_grad); - CHECK_NOTNULL(output_image); - CHECK_NOTNULL(output_image_grad); - - real alpha = 1.0f; - real beta = 1.0f; - input_desc = ((cudnn_tensor_descriptor)input)->desc; - output_desc = ((cudnn_tensor_descriptor)output)->desc; - pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; - CHECK_CUDNN(dynload::cudnnPoolingBackward( - t_resource.cudnn_handle, - pooling_desc, - &alpha, - output_desc, - output_image, - output_desc, - output_image_grad, - input_desc, - input_image, - &beta, - input_desc, 
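// Note on the alpha/beta pair used by the cuDNN calls above: cuDNN ops follow
// the blending convention y = alpha * op(x) + beta * y, so beta = 1
// accumulates into the existing output while beta = 0 would overwrite it.
// A scalar sketch of that convention:
inline float cudnn_blend(float op_result, float prev_y, float alpha, float beta) {
  return alpha * op_result + beta * prev_y;
}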
- input_image_grad)); + hl_pooling_descriptor pooling) { + cudnnPoolingDescriptor_t pooling_desc; + cudnnTensorDescriptor_t input_desc; + cudnnTensorDescriptor_t output_desc; + + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(pooling); + CHECK_NOTNULL(input_image); + CHECK_NOTNULL(input_image_grad); + CHECK_NOTNULL(output_image); + CHECK_NOTNULL(output_image_grad); + + real alpha = 1.0f; + real beta = 1.0f; + input_desc = ((cudnn_tensor_descriptor)input)->desc; + output_desc = ((cudnn_tensor_descriptor)output)->desc; + pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; + CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle, + pooling_desc, + &alpha, + output_desc, + output_image, + output_desc, + output_image_grad, + input_desc, + input_image, + &beta, + input_desc, + input_image_grad)); CHECK_SYNC("hl_pooling_backward failed"); } - void hl_create_filter_descriptor(hl_filter_descriptor* filter, int input_feature_maps, int output_feature_maps, int height, - int width) -{ - CHECK_NOTNULL(filter); + int width) { + CHECK_NOTNULL(filter); - cudnn_filter_descriptor hl_filter = - (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor)); - CHECK_NOTNULL(hl_filter); + cudnn_filter_descriptor hl_filter = + (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor)); + CHECK_NOTNULL(hl_filter); - CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc)); + CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc)); #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor( - hl_filter->desc, - data_type, + CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc, + data_type, #if CUDNN_VERSION >= 5000 - CUDNN_TENSOR_NCHW, + CUDNN_TENSOR_NCHW, #endif - output_feature_maps, - input_feature_maps, - height, - width)); - - hl_filter->data_type = data_type; - hl_filter->output_feature_maps = output_feature_maps; - hl_filter->input_feature_maps = input_feature_maps; - hl_filter->filter_height = height; - hl_filter->filter_width = width; - - *filter = (hl_filter_descriptor)hl_filter; + output_feature_maps, + input_feature_maps, + height, + width)); + + hl_filter->data_type = data_type; + hl_filter->output_feature_maps = output_feature_maps; + hl_filter->input_feature_maps = input_feature_maps; + hl_filter->filter_height = height; + hl_filter->filter_width = width; + + *filter = (hl_filter_descriptor)hl_filter; } +void hl_destroy_filter_descriptor(hl_filter_descriptor filter) { + CHECK_NOTNULL(filter); -void hl_destroy_filter_descriptor(hl_filter_descriptor filter) -{ - CHECK_NOTNULL(filter); - - cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter; - CHECK_NOTNULL(hl_filter->desc); + cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter; + CHECK_NOTNULL(hl_filter->desc); - CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc)); + CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc)); - hl_filter->desc = NULL; + hl_filter->desc = NULL; - free(filter); + free(filter); } void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, @@ -657,38 +600,36 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, int padding_height, int padding_width, int stride_height, - int stride_width) -{ - CHECK_NOTNULL(conv); - - 
cudnn_convolution_descriptor hl_conv = - (cudnn_convolution_descriptor)malloc(sizeof(_cudnn_convolution_descriptor)); - CHECK_NOTNULL(hl_conv); - - CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc)); - - cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; - CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor( - hl_conv->desc, - padding_height, - padding_width, - stride_height, - stride_width, - 1, - 1, - mode)); - - hl_conv->input_image = image; - hl_conv->filter = filter; - hl_conv->padding_height = padding_height; - hl_conv->padding_width = padding_width; - hl_conv->stride_height = stride_height; - hl_conv->stride_width = stride_width; - hl_conv->upscalex = 1; - hl_conv->upscaley = 1; - hl_conv->mode = mode; - - *conv = (hl_convolution_descriptor)hl_conv; + int stride_width) { + CHECK_NOTNULL(conv); + + cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc( + sizeof(_cudnn_convolution_descriptor)); + + CHECK_NOTNULL(hl_conv); + CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc)); + + cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; + CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc, + padding_height, + padding_width, + stride_height, + stride_width, + 1, + 1, + mode)); + + hl_conv->input_image = image; + hl_conv->filter = filter; + hl_conv->padding_height = padding_height; + hl_conv->padding_width = padding_width; + hl_conv->stride_height = stride_height; + hl_conv->stride_width = stride_width; + hl_conv->upscalex = 1; + hl_conv->upscaley = 1; + hl_conv->mode = mode; + + *conv = (hl_convolution_descriptor)hl_conv; } void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, @@ -697,47 +638,44 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, int padding_height, int padding_width, int stride_height, - int stride_width) -{ - CHECK_NOTNULL(conv); - CHECK_NOTNULL(image); - CHECK_NOTNULL(filter); - - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; - CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor( - conv_desc, - padding_height, - padding_width, - stride_height, - stride_width, - 1, - 1, - mode)); - - cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; - hl_conv->input_image = image; - hl_conv->filter = filter; - hl_conv->padding_height = padding_height; - hl_conv->padding_width = padding_width; - hl_conv->stride_height = stride_height; - hl_conv->stride_width = stride_width; - hl_conv->upscalex = 1; - hl_conv->upscaley = 1; - hl_conv->mode = mode; + int stride_width) { + CHECK_NOTNULL(conv); + CHECK_NOTNULL(image); + CHECK_NOTNULL(filter); + + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; + CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc, + padding_height, + padding_width, + stride_height, + stride_width, + 1, + 1, + mode)); + + cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; + hl_conv->input_image = image; + hl_conv->filter = filter; + hl_conv->padding_height = padding_height; + hl_conv->padding_width = padding_width; + hl_conv->stride_height = stride_height; + hl_conv->stride_width = stride_width; + hl_conv->upscalex = 1; + hl_conv->upscaley = 1; + hl_conv->mode = mode; } -void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) -{ - CHECK_NOTNULL(conv); +void hl_destroy_convolution_descriptor(hl_convolution_descriptor 
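// The descriptors above hard-code mode = CUDNN_CROSS_CORRELATION: the filter
// is applied as-is rather than flipped, the usual deep-learning convention.
// A 1-D scalar reference of the difference (hypothetical helper; valid
// indices assumed, no padding handling):
float corr_or_conv_at(const float* x, const float* k, int K, int i, bool flip) {
  float acc = 0.0f;
  for (int j = 0; j < K; ++j) {
    // cross-correlation reads the kernel forward; true convolution flips it
    acc += x[i + j] * (flip ? k[K - 1 - j] : k[j]);
  }
  return acc;
}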
conv) { + CHECK_NOTNULL(conv); - cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; - CHECK_NOTNULL(hl_conv->desc); + cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; + CHECK_NOTNULL(hl_conv->desc); - CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc)); - hl_conv->desc = NULL; + CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc)); + hl_conv->desc = NULL; - free(conv); + free(conv); } void hl_convolution_forward(hl_tensor_descriptor input, @@ -750,87 +688,83 @@ void hl_convolution_forward(hl_tensor_descriptor input, void* gpuWorkSpace, size_t sizeInBytes, int convFwdAlgo) { - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(filter); - CHECK_NOTNULL(conv); - CHECK_NOTNULL(input_data); - CHECK_NOTNULL(output_data); - CHECK_NOTNULL(filter_data); - cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - real alpha = 1.0f; - real beta = 1.0f; - CHECK_CUDNN(dynload::cudnnConvolutionForward( - t_resource.cudnn_handle, - &alpha, - src_desc, - input_data, - filter_desc, - filter_data, - conv_desc, - static_cast(convFwdAlgo), - gpuWorkSpace, - sizeInBytes, - &beta, - dest_desc, - output_data)); + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(filter); + CHECK_NOTNULL(conv); + CHECK_NOTNULL(input_data); + CHECK_NOTNULL(output_data); + CHECK_NOTNULL(filter_data); + cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + real alpha = 1.0f; + real beta = 1.0f; + CHECK_CUDNN(dynload::cudnnConvolutionForward( + t_resource.cudnn_handle, + &alpha, + src_desc, + input_data, + filter_desc, + filter_data, + conv_desc, + static_cast(convFwdAlgo), + gpuWorkSpace, + sizeInBytes, + &beta, + dest_desc, + output_data)); CHECK_SYNC("hl_convolution_forward failed"); } void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, real* bias_data, hl_tensor_descriptor output, - real* output_data) -{ - CHECK_NOTNULL(bias); - CHECK_NOTNULL(output); - CHECK_NOTNULL(bias_data); - CHECK_NOTNULL(output_data); - - cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); - real alpha = 1.0f; - real beta = 1.0f; - - CHECK_CUDNN(dynload::cudnnAddTensor( - t_resource.cudnn_handle, + real* output_data) { + CHECK_NOTNULL(bias); + CHECK_NOTNULL(output); + CHECK_NOTNULL(bias_data); + CHECK_NOTNULL(output_data); + + cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); + real alpha = 1.0f; + real beta = 1.0f; + + CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle, #if CUDNN_VERSION < 4000 - CUDNN_ADD_SAME_C, + CUDNN_ADD_SAME_C, #endif - &alpha, - bias_desc, - bias_data, - &beta, - output_desc, - output_data)); + &alpha, + bias_desc, + bias_data, + &beta, + output_desc, + output_data)); CHECK_SYNC("hl_convolution_forward_add_bias failed"); } void hl_convolution_backward_bias(hl_tensor_descriptor bias, real* bias_grad_data, hl_tensor_descriptor output, - real* output_grad_data) -{ - CHECK_NOTNULL(bias); - 
CHECK_NOTNULL(output); - CHECK_NOTNULL(bias_grad_data); - CHECK_NOTNULL(output_grad_data); - - real alpha = 1.0f; - real beta = 1.0f; - cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); - CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias( - t_resource.cudnn_handle, - &alpha, - diff_desc, - output_grad_data, - &beta, - bias_desc, - bias_grad_data)); + real* output_grad_data) { + CHECK_NOTNULL(bias); + CHECK_NOTNULL(output); + CHECK_NOTNULL(bias_grad_data); + CHECK_NOTNULL(output_grad_data); + + real alpha = 1.0f; + real beta = 1.0f; + cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); + CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle, + &alpha, + diff_desc, + output_grad_data, + &beta, + bias_desc, + bias_grad_data)); CHECK_SYNC("hl_convolution_backward_bias failed"); } @@ -844,38 +778,37 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input, void* gpuWorkSpace, size_t sizeInBytes, int convBwdFilterAlgo) { + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(filter); + CHECK_NOTNULL(conv); + CHECK_NOTNULL(input_data); + CHECK_NOTNULL(output_grad_data); + CHECK_NOTNULL(filter_grad_data); - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(filter); - CHECK_NOTNULL(conv); - CHECK_NOTNULL(input_data); - CHECK_NOTNULL(output_grad_data); - CHECK_NOTNULL(filter_grad_data); - - real alpha = 1.0f; - real beta = 1.0f; - cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter); - - CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter( - t_resource.cudnn_handle, - &alpha, - src_desc, - input_data, - diff_desc, - output_grad_data, - conv_desc, + real alpha = 1.0f; + real beta = 1.0f; + cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter); + + CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter( + t_resource.cudnn_handle, + &alpha, + src_desc, + input_data, + diff_desc, + output_grad_data, + conv_desc, #if CUDNN_VERSION >= 4000 - static_cast(convBwdFilterAlgo), - gpuWorkSpace, - sizeInBytes, + static_cast(convBwdFilterAlgo), + gpuWorkSpace, + sizeInBytes, #endif - &beta, - grad_desc, - filter_grad_data)); + &beta, + grad_desc, + filter_grad_data)); CHECK_SYNC("hl_convolution_backward_filter failed"); } @@ -889,121 +822,111 @@ void hl_convolution_backward_data(hl_tensor_descriptor input, void* gpuWorkSpace, size_t sizeInBytes, int convBwdDataAlgo) { - real alpha = 1.0f; - real beta = 1.0f; - cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); - cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - - CHECK_CUDNN(dynload::cudnnConvolutionBackwardData( - t_resource.cudnn_handle, - &alpha, - filter_desc, - filter_data, - diff_desc, - output_grad_data, - conv_desc, + real alpha = 1.0f; + real beta = 1.0f; + cudnnFilterDescriptor_t filter_desc = 
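// The #if CUDNN_VERSION guards above bridge an API break: before cuDNN v4,
// cudnnAddTensor took an extra mode argument, and the backward calls did not
// accept an explicit algorithm/workspace. A sketch of the same guard pattern
// in isolation, assuming the v2/v4 signatures used elsewhere in this file:
#include <cudnn.h>

cudnnStatus_t add_bias_compat(cudnnHandle_t h, const void* alpha,
                              cudnnTensorDescriptor_t bias_desc, const void* bias,
                              const void* beta,
                              cudnnTensorDescriptor_t out_desc, void* out) {
#if CUDNN_VERSION < 4000
  return cudnnAddTensor(h, CUDNN_ADD_SAME_C, alpha, bias_desc, bias,
                        beta, out_desc, out);
#else
  return cudnnAddTensor(h, alpha, bias_desc, bias, beta, out_desc, out);
#endif
}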
GET_FILTER_DESCRIPTOR(filter); + cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + + CHECK_CUDNN(dynload::cudnnConvolutionBackwardData( + t_resource.cudnn_handle, + &alpha, + filter_desc, + filter_data, + diff_desc, + output_grad_data, + conv_desc, #if CUDNN_VERSION >= 4000 - static_cast(convBwdDataAlgo), - gpuWorkSpace, - sizeInBytes, + static_cast(convBwdDataAlgo), + gpuWorkSpace, + sizeInBytes, #endif - &beta, - grad_desc, - input_data_grad)); + &beta, + grad_desc, + input_data_grad)); CHECK_SYNC("hl_convolution_backward_data failed"); } - -void hl_softmax_forward(real *input, - real *output, - int height, - int width) -{ +void hl_softmax_forward(real* input, real* output, int height, int width) { #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor( - t_resource.cudnn_desc, - CUDNN_TENSOR_NCHW, - data_type, - height, - width, - 1, - 1)); - - real alpha = 1.0f; - real beta = 0.0f; - CHECK_CUDNN(dynload::cudnnSoftmaxForward( - t_resource.cudnn_handle, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - t_resource.cudnn_desc, - input, - &beta, - t_resource.cudnn_desc, - output)); + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc, + CUDNN_TENSOR_NCHW, + data_type, + height, + width, + 1, + 1)); + + real alpha = 1.0f; + real beta = 0.0f; + CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + t_resource.cudnn_desc, + input, + &beta, + t_resource.cudnn_desc, + output)); CHECK_SYNC("hl_softmax_forward failed"); } -void hl_softmax_backward(real *output_value, - real *output_grad, +void hl_softmax_backward(real* output_value, + real* output_grad, int height, - int width) -{ + int width) { #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor( - t_resource.cudnn_desc, - CUDNN_TENSOR_NCHW, - data_type, - height, - width, - 1, - 1)); - - real alpha = 1.0f; - real beta = 0.0f; - CHECK_CUDNN(dynload::cudnnSoftmaxBackward( - t_resource.cudnn_handle, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - t_resource.cudnn_desc, - output_value, - t_resource.cudnn_desc, - output_grad, - &beta, - t_resource.cudnn_desc, - output_grad)); + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc, + CUDNN_TENSOR_NCHW, + data_type, + height, + width, + 1, + 1)); + + real alpha = 1.0f; + real beta = 0.0f; + CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + t_resource.cudnn_desc, + output_value, + t_resource.cudnn_desc, + output_grad, + &beta, + t_resource.cudnn_desc, + output_grad)); CHECK_SYNC("hl_softmax_backward failed"); } void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, + real* scale, + real* 
bias, double factor, - real *runningMean, - real *runningInvVar, + real* runningMean, + real* runningInvVar, double epsilon, - real *savedMean, - real *savedVar) { + real* savedMean, + real* savedVar) { #if CUDNN_VERSION >= 4007 if ((NULL != runningMean && NULL == runningInvVar) || (NULL == runningMean && NULL != runningInvVar)) { LOG(FATAL) << "runningMean and runningInvVar can be NULL " - << "but only at the same time."; + << "but only at the same time."; } if ((NULL != savedMean && NULL == savedVar) || (NULL == savedMean && NULL != savedVar)) { @@ -1017,10 +940,24 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, real alpha = 1.0f; real beta = 1.0f; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; - CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardTraining( - t_resource.cudnn_handle, mode, &alpha, &beta, xDesc, - input, yDesc, output, bnDesc, scale, bias, factor, - runningMean, runningInvVar, epsilon, savedMean, savedVar)); + CHECK_CUDNN( + dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle, + mode, + &alpha, + &beta, + xDesc, + input, + yDesc, + output, + bnDesc, + scale, + bias, + factor, + runningMean, + runningInvVar, + epsilon, + savedMean, + savedVar)); CHECK_SYNC("hl_batch_norm_forward_training failed"); #else @@ -1030,15 +967,15 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, } void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, - real *input, - hl_tensor_descriptor outputDesc, - real *output, - hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, - real *estimatedMean, - real *estimatedInvVar, - double epsilon) { + real* input, + hl_tensor_descriptor outputDesc, + real* output, + hl_tensor_descriptor bnParamDesc, + real* scale, + real* bias, + real* estimatedMean, + real* estimatedInvVar, + double epsilon) { #if CUDNN_VERSION >= 4007 cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc); @@ -1046,10 +983,21 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, real alpha = 1.0f; real beta = 1.0f; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; - CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardInference( - t_resource.cudnn_handle, mode, &alpha, &beta, xDesc, - input, yDesc, output, bnDesc, scale, bias, - estimatedMean, estimatedInvVar, epsilon)); + CHECK_CUDNN( + dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle, + mode, + &alpha, + &beta, + xDesc, + input, + yDesc, + output, + bnDesc, + scale, + bias, + estimatedMean, + estimatedInvVar, + epsilon)); CHECK_SYNC("hl_batch_norm_forward_inference failed"); #else @@ -1059,18 +1007,18 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, } void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outGradDesc, - real *outGrad, + real* outGrad, hl_tensor_descriptor inGradDesc, - real *inGrad, + real* inGrad, hl_tensor_descriptor dBnParamDesc, - real *scale, - real *scaleGrad, - real *biasGrad, + real* scale, + real* scaleGrad, + real* biasGrad, double epsilon, - real *savedMean, - real *savedInvVar) { + real* savedMean, + real* savedInvVar) { #if CUDNN_VERSION >= 4007 if ((NULL != savedMean && NULL == savedInvVar) || (NULL == savedMean && NULL != savedInvVar)) { @@ -1085,12 +1033,25 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, real alpha = 1.0f; real beta = 1.0f; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; - 
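// The `factor` forwarded to cudnnBatchNormalizationForwardTraining above is
// cuDNN's exponential-average weight for the running statistics. Per the
// documented convention (scalar sketch):
//   running = (1 - factor) * running + factor * batch_statistic
inline double update_running_stat(double running, double batch_stat, double factor) {
  return (1.0 - factor) * running + factor * batch_stat;
}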
CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward( - t_resource.cudnn_handle, mode, &alpha, &beta, - &alpha, &beta, - xDesc, input, dyDesc, outGrad, dxDesc, inGrad, - bnDesc, scale, scaleGrad, biasGrad, epsilon, - savedMean, savedInvVar)); + CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle, + mode, + &alpha, + &beta, + &alpha, + &beta, + xDesc, + input, + dyDesc, + outGrad, + dxDesc, + inGrad, + bnDesc, + scale, + scaleGrad, + biasGrad, + epsilon, + savedMean, + savedInvVar)); CHECK_SYNC("hl_batch_norm_backward failed"); #else diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index e9fe9f1c117a0573643c81f061bb36399523b38d..745be35b56278ed2e0033d5fd2806320d3164d7c 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include @@ -27,7 +26,7 @@ limitations under the License. */ namespace dynload { std::once_flag curand_dso_flag; -void* curand_dso_handle = nullptr; +void *curand_dso_handle = nullptr; /** * The following macro definition can generate structs @@ -37,34 +36,35 @@ void* curand_dso_handle = nullptr; * note: default dynamic linked libs */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... args) { \ - typedef curandStatus_t (*curandFunc)(Args...); \ - std::call_once(curand_dso_flag, GetCurandDsoHandle, \ - &curand_dso_handle); \ - void* p_##__name = dlsym(curand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... args) { \ + typedef curandStatus_t (*curandFunc)(Args...); \ + std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \ + void *p_##__name = dlsym(curand_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ #else -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ +#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ #endif /* include all needed curand functions in HPPL */ +// clang-format off #define CURAND_RAND_ROUTINE_EACH(__macro) \ __macro(curandCreateGenerator) \ __macro(curandSetStream) \ __macro(curandSetPseudoRandomGeneratorSeed)\ __macro(curandGenerateUniform) \ __macro(curandGenerateUniformDouble) +// clang-format on CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) @@ -72,7 +72,7 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) #undef DYNAMIC_LOAD_CURAND_WRAP std::once_flag cudart_dso_flag; -void* cudart_dso_handle = nullptr; +void *cudart_dso_handle = nullptr; /** * The following macro definition can generate structs @@ -82,48 +82,28 @@ void* cudart_dso_handle = nullptr; * note: default dynamic linked libs */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDART_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cudaError_t operator()(Args... 
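// A hand-unrolled sketch of what the DYNAMIC_LOAD_*_WRAP macros around this
// hunk generate for one routine. GetCudartDsoHandle is replaced here by a
// plain dlopen, and the global flag/handle carry _example suffixes, so this
// is a standalone illustration rather than the real expansion.
#include <dlfcn.h>
#include <mutex>
#include <cuda_runtime.h>

std::once_flag cudart_dso_flag_example;
void *cudart_dso_handle_example = nullptr;

struct DynLoad_cudaSetDevice_example {
  template <typename... Args>
  auto operator()(Args... args) -> decltype(cudaSetDevice(args...)) {
    using func_t = decltype(cudaSetDevice(args...)) (*)(Args...);
    // resolve libcudart once; later calls reuse the cached handle
    std::call_once(cudart_dso_flag_example, [] {
      cudart_dso_handle_example = dlopen("libcudart.so", RTLD_LAZY);
    });
    void *sym = dlsym(cudart_dso_handle_example, "cudaSetDevice");
    return reinterpret_cast<func_t>(sym)(args...);
  }
} cudaSetDevice_example;  // cudaSetDevice_example(0) behaves like cudaSetDevice(0)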
args) { \ - typedef cudaError_t (*cudartFunc)(Args...); \ - std::call_once(cudart_dso_flag, GetCudartDsoHandle, \ - &cudart_dso_handle); \ - void* p_##__name = dlsym(cudart_dso_handle, #__name); \ - return reinterpret_cast<cudartFunc>(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ -#else -#define DYNAMIC_LOAD_CUDART_WRAP(__name) \ - struct DynLoad__##__name { \ - template <typename... Args> \ - cudaError_t operator()(Args... args) { \ - return __name(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ -#endif - -#ifdef PADDLE_USE_DSO - struct DynLoad__cudaGetErrorString { - template <typename... Args> - const char* operator()(Args... args) { - typedef const char* (*cudaFunc)(Args...); - std::call_once(cudart_dso_flag, GetCudartDsoHandle, - &cudart_dso_handle); - void* p_func = dlsym(cudart_dso_handle, "cudaGetErrorString"); - return reinterpret_cast<cudaFunc>(p_func)(args...); - } - } cudaGetErrorString; /* struct DynLoad__cudaGetErrorString */ +#define DYNAMIC_LOAD_CUDART_WRAP(__name) \ + struct DynLoad__##__name { \ + template <typename... Args> \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudart_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \ + void *p_##__name = dlsym(cudart_dso_handle, #__name); \ + return reinterpret_cast<cudart_func>(p_##__name)(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ #else -struct DynLoad__cudaGetErrorString { - template <typename... Args> - const char* operator()(Args... args) { - return cudaGetErrorString(args...); - } -} cudaGetErrorString; /* struct DynLoad__cudaGetErrorString */ +#define DYNAMIC_LOAD_CUDART_WRAP(__name) \ + struct DynLoad__##__name { \ + template <typename... Args> \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ #endif /* include all needed cuda functions in HPPL */ +// clang-format off #define CUDA_ROUTINE_EACH(__macro) \ __macro(cudaMalloc) \ __macro(cudaHostAlloc) \ @@ -152,58 +132,59 @@ struct DynLoad__cudaGetErrorString { __macro(cudaSetDeviceFlags) \ __macro(cudaGetLastError) \ __macro(cudaFuncSetCacheConfig) \ - __macro(cudaRuntimeGetVersion) + __macro(cudaRuntimeGetVersion) \ + __macro(cudaGetErrorString) +// clang-format on CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP) #undef CUDA_ROUNTINE_EACH #undef DYNAMIC_LOAD_CUDART_WRAP -} /* namespace dynload */ +} /* namespace dynload */ /** * @brief global resource. */ -int g_system_device_num = 0; /* system device number */ -int device_num = 0; /* use device number */ -hl_device_prop *g_device; /* device info table */ -__thread thread_device_resources *t_device; /* device resources table */ +int g_system_device_num = 0; /* system device number */ +int device_num = 0; /* use device number */ +hl_device_prop *g_device; /* device info table */ +__thread thread_device_resources *t_device; /* device resources table */ int g_cuda_lib_version = 0; /* number of global stream */ -#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1) +#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1) /* number of thread stream */ -#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1) +#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1) /* sizeof of device memory */ -#define HPPL_GPU_MEMORY_SIZE (256*4) +#define HPPL_GPU_MEMORY_SIZE (256 * 4) /** * Check built-in CUDA functions using glog; note that this macro **does not** * support the << operator for appending more detailed error info.
*/ -#define CHECK_CUDA(cudaFunc) \ - do { \ - cudaError_t cudaStat = cudaFunc; \ - CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ - << dynload::cudaGetErrorString(cudaStat); \ +#define CHECK_CUDA(cudaFunc) \ + do { \ + cudaError_t cudaStat = cudaFunc; \ + CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ + << dynload::cudaGetErrorString(cudaStat); \ } while (0) /** * @brief thread resource. */ -__thread _hl_thread_resource t_resource = { - {0}, /* stream */ - 0, /* handle */ - 0, /* gen */ - 0, /* cudnn_handle */ - 0, /* cudnn_desc */ - NULL, /* gen_mutex */ - NULL, /* gpu_mem */ - NULL, /* cpu_mem */ - 0, /* event */ - -1, /* device */ - 0, /* major */ - false}; /* is_init */ +__thread _hl_thread_resource t_resource = {{0}, /* stream */ + 0, /* handle */ + 0, /* gen */ + 0, /* cudnn_handle */ + 0, /* cudnn_desc */ + NULL, /* gen_mutex */ + NULL, /* gpu_mem */ + NULL, /* cpu_mem */ + 0, /* event */ + -1, /* device */ + 0, /* major */ + false}; /* is_init */ __thread cudaStream_t default_stream = 0; __thread bool g_sync_flag = true; @@ -217,18 +198,17 @@ inline pid_t gettid() { uint64_t tid; pthread_threadid_np(NULL, &tid); #else - #ifndef __NR_gettid - #define __NR_gettid 224 - #endif +#ifndef __NR_gettid +#define __NR_gettid 224 +#endif pid_t tid = syscall(__NR_gettid); #endif - CHECK_NE(tid, -1); - return tid; + CHECK_NE((int)tid, -1); + return tid; } void hl_init(int device) { - CHECK(hl_start_flag) - << "[Init failed] hl_start() did not succeed."; + CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed."; /* thread has been initialized */ if (true == t_resource.is_init) { @@ -239,16 +219,16 @@ void hl_init(int device) { /* create thread devcie resources */ char *tmp; thread_device_resources device_res; - tmp = (char *)malloc(g_system_device_num*sizeof(thread_device_resources*) + - device_num*sizeof(_thread_device_resources)); + tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) + + device_num * sizeof(_thread_device_resources)); CHECK_NOTNULL(tmp); - t_device = (thread_device_resources*)tmp; - device_res = (thread_device_resources)((char*)tmp + - g_system_device_num*sizeof(thread_device_resources*)); - memset(t_device, 0, g_system_device_num*sizeof(thread_device_resources*)); + t_device = (thread_device_resources *)tmp; + device_res = (thread_device_resources)( + (char *)tmp + g_system_device_num * sizeof(thread_device_resources *)); + memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *)); - char *tmp_stream = (char *) - malloc(device_num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t)); + char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM * + sizeof(cudaStream_t)); CHECK_NOTNULL(tmp_stream); int num = 0; @@ -258,8 +238,9 @@ void hl_init(int device) { } t_device[dev] = &device_res[num]; - t_device[dev]->stream = (cudaStream_t*)(tmp_stream + - num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t)); + t_device[dev]->stream = + (cudaStream_t *)(tmp_stream + + num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t)); hl_create_thread_resources(dev, t_device[dev]); num++; @@ -285,14 +266,14 @@ void hl_fini() { t_resource.stream[i] = 0; } - char* tmp = (char*)t_device; - char* tmp_stream = NULL; + char *tmp = (char *)t_device; + char *tmp_stream = NULL; for (int dev = 0; dev < g_system_device_num; dev++) { if (!t_device[dev]) { continue; } if (!tmp_stream) { - tmp_stream = (char*)t_device[dev]->stream; + tmp_stream = (char *)t_device[dev]->stream; } for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { 
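// A minimal sketch of the single-allocation layout hl_init builds above: one
// malloc carries the pointer table followed by the payload structs, so the
// whole block is freed as a unit. `Payload` is a hypothetical stand-in type.
#include <cstdlib>
#include <cstring>

struct Payload { int data; };

Payload **alloc_pointer_table(int table_len, int payload_count) {
  char *base = (char *)malloc(table_len * sizeof(Payload *) +
                              payload_count * sizeof(Payload));
  if (base == NULL) return NULL;
  Payload **table = (Payload **)base;
  // payload structs live immediately after the pointer table
  Payload *slots = (Payload *)(base + table_len * sizeof(Payload *));
  memset(table, 0, table_len * sizeof(Payload *));
  for (int i = 0; i < payload_count && i < table_len; ++i) table[i] = &slots[i];
  return table;  // a single free(table) releases everything
}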
CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j])); @@ -309,9 +290,7 @@ void hl_fini() { t_resource.is_init = false; } -int hl_get_device_count() { - return device_num; -} +int hl_get_device_count() { return device_num; } void hl_set_device(int device) { if (device == t_resource.device) { @@ -319,7 +298,7 @@ void hl_set_device(int device) { } CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device: " << device << " is not specified in startup."; + << "Device: " << device << " is not specified in startup."; CHECK_CUDA(dynload::cudaSetDevice(device)); @@ -331,11 +310,11 @@ void hl_set_device(int device) { if (true == t_resource.is_init) { for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) { t_resource.stream[i] = - t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM]; + t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM]; } t_resource.gpu_mem = t_device[device]->gpu_mem; t_resource.cpu_mem = t_device[device]->cpu_mem; - t_resource.event = t_device[device]->mem_event; + t_resource.event = t_device[device]->mem_event; } t_resource.handle = g_device[device]->device_resources->handle; @@ -353,11 +332,11 @@ int hl_get_device() { return device; } -void* hl_malloc_device(size_t size) { +void *hl_malloc_device(size_t size) { void *dest_d; CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA(dynload::cudaMalloc((void**)&dest_d, size)); + CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size)); return dest_d; } @@ -367,14 +346,15 @@ void hl_free_mem_device(void *dest_d) { cudaError_t err = dynload::cudaFree(dest_d); CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) - << hl_get_device_error_string(); + << hl_get_device_error_string(); } -void* hl_malloc_host(size_t size) { +void *hl_malloc_host(size_t size) { void *dest_h; CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA(dynload::cudaHostAlloc((void**)&dest_h, size, cudaHostAllocDefault)); + CHECK_CUDA( + dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault)); return dest_h; } @@ -383,8 +363,8 @@ void hl_free_mem_host(void *dest_h) { CHECK_NOTNULL(dest_h); cudaError_t err = dynload::cudaFreeHost(dest_h); - CHECK (cudaSuccess == err || cudaErrorCudartUnloading == err) - << hl_get_device_error_string(); + CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) + << hl_get_device_error_string(); } void hl_memcpy(void *dst, void *src, size_t size) { @@ -406,8 +386,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { } CHECK_NOTNULL(src_h); CHECK_NOTNULL(dest_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, - cudaMemcpyHostToDevice)); + CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); } void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { @@ -416,8 +395,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { } CHECK_NOTNULL(dest_h); CHECK_NOTNULL(src_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, - cudaMemcpyDeviceToHost)); + CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); } void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { @@ -426,8 +404,8 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { } CHECK_NOTNULL(dest_d); CHECK_NOTNULL(src_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_d, size, - cudaMemcpyDeviceToDevice)); + CHECK_CUDA( + dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice)); } void 
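// Usage sketch for the allocation helpers above (hypothetical buffer size;
// assumes the hl_* headers are included and hl_start() has run). hl_malloc_host
// returns page-locked memory via cudaHostAlloc, which is what lets the
// asynchronous copy path below overlap transfers with compute.
void stage_buffer_example() {
  const size_t n = 1024;
  real *dev_buf = (real *)hl_malloc_device(n * sizeof(real));
  real *host_buf = (real *)hl_malloc_host(n * sizeof(real));
  // ... fill host_buf on the CPU ...
  hl_memcpy_host2device(dev_buf, host_buf, n * sizeof(real));
  hl_free_mem_host(host_buf);
  hl_free_mem_device(dev_buf);
}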
hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { @@ -441,8 +419,8 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { CHECK_LT(stream, HPPL_STREAM_END); cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, - cu_stream)); + CHECK_CUDA( + dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream)); } void hl_start() { @@ -453,8 +431,8 @@ void hl_start() { bool hl_device_can_access_peer(int device, int peerDevice) { int canAccessPeer; - CHECK_CUDA(dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, - peerDevice)); + CHECK_CUDA( + dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice)); if (canAccessPeer == 1) { return true; @@ -496,32 +474,32 @@ void hl_create_global_resources(hl_device_prop device_prop) { /* create curand gen */ CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen, - CURAND_RNG_PSEUDO_DEFAULT), CURAND_STATUS_SUCCESS) - << "[Start failed] Curand init failed."; + CURAND_RNG_PSEUDO_DEFAULT), + CURAND_STATUS_SUCCESS) + << "[Start failed] Curand init failed."; - CHECK_EQ(dynload::curandSetStream(device_res->gen, - device_res->stream[0]), CURAND_STATUS_SUCCESS) - << "[Start failed] Curand set stream failed!"; + CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]), + CURAND_STATUS_SUCCESS) + << "[Start failed] Curand set stream failed!"; /* create cudnn handle */ hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]); int seed = gettid(); - CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed( - device_res->gen, seed+device), CURAND_STATUS_SUCCESS); + CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen, + seed + device), + CURAND_STATUS_SUCCESS); - device_res->gen_mutex = - (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t))); + device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t))); pthread_mutex_init(device_res->gen_mutex, NULL); CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version)); } -int hl_get_cuda_version() { - return g_cuda_lib_version; -} +int hl_get_cuda_version() { return g_cuda_lib_version; } -void hl_create_thread_resources(int device, thread_device_resources device_res) { +void hl_create_thread_resources(int device, + thread_device_resources device_res) { CHECK_CUDA(dynload::cudaSetDevice(device)); /* create thread stream */ @@ -530,15 +508,15 @@ void hl_create_thread_resources(int device, thread_device_resources device_res) } /* allocation device memory */ - device_res->gpu_mem = (real*)hl_malloc_device(HPPL_GPU_MEMORY_SIZE); + device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE); /* allocation host memory */ - device_res->cpu_mem = (real*)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); + device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event)); } -void hl_specify_devices_start(int* device, int number) { +void hl_specify_devices_start(int *device, int number) { if (hl_start_flag) return; /* 1. get the number of devices */ @@ -550,20 +528,19 @@ void hl_specify_devices_start(int* device, int number) { /* 2. check device & create device property table */ CHECK_LE(number, g_system_device_num) - << "[Start failed] System does not have enough device. " - << "Device number: " << g_system_device_num - << "Input number: " << number; + << "[Start failed] System does not have enough device. 
" + << "Device number: " << g_system_device_num << "Input number: " << number; char *tmp; hl_device_prop device_prop; - tmp = (char *)malloc(g_system_device_num*sizeof(hl_device_prop*) + - number*sizeof(_hl_device_prop)); + tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) + + number * sizeof(_hl_device_prop)); CHECK(tmp) << "[Start failed] System memory is not enough."; - g_device = (hl_device_prop*)tmp; - device_prop = (hl_device_prop)((char*)tmp + - g_system_device_num*sizeof(hl_device_prop*)); - memset(g_device, 0, g_system_device_num*sizeof(hl_device_prop*)); + g_device = (hl_device_prop *)tmp; + device_prop = (hl_device_prop)( + (char *)tmp + g_system_device_num * sizeof(hl_device_prop *)); + memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *)); int num = 0; for (int i = 0; i < number; i++) { int dev; @@ -574,13 +551,13 @@ void hl_specify_devices_start(int* device, int number) { } CHECK_LT(dev, g_system_device_num) - << "[Start failed] The specified device number is " - << "out of range. Max device number: " << g_system_device_num - 1 - << " Specified devcie number: "<< dev; + << "[Start failed] The specified device number is " + << "out of range. Max device number: " << g_system_device_num - 1 + << " Specified devcie number: " << dev; if (g_device[dev]) { /* Warning */ - LOG(WARNING) <<"[Warning] Repeat specify device: " << dev; + LOG(WARNING) << "[Warning] Repeat specify device: " << dev; continue; } @@ -591,11 +568,11 @@ void hl_specify_devices_start(int* device, int number) { device_num = num; /* 3. create global device resources */ - char *tmp_res = (char *)malloc(device_num*sizeof(_global_device_resources)); + char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources)); CHECK_NOTNULL(tmp_res); - char *tmp_stream = - (char *)malloc(device_num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t)); + char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM * + sizeof(cudaStream_t)); CHECK_NOTNULL(tmp_stream); num = 0; @@ -604,10 +581,11 @@ void hl_specify_devices_start(int* device, int number) { continue; } - g_device[i]->device_resources = (global_device_resources)(tmp_res + - num*sizeof(_global_device_resources)); - g_device[i]->device_resources->stream = (cudaStream_t*)(tmp_stream + - num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t)); + g_device[i]->device_resources = (global_device_resources)( + tmp_res + num * sizeof(_global_device_resources)); + g_device[i]->device_resources->stream = + (cudaStream_t *)(tmp_stream + + num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t)); hl_create_global_resources(g_device[i]); num++; @@ -617,9 +595,9 @@ void hl_specify_devices_start(int* device, int number) { hl_start_flag = true; /* set default device */ if (device == NULL) { - hl_set_device(0); + hl_set_device(0); } else { - hl_set_device(device[0]); + hl_set_device(device[0]); } } @@ -627,35 +605,31 @@ void hl_rand(real *dest_d, size_t num) { pthread_mutex_lock(t_resource.gen_mutex); CHECK_EQ( #ifndef PADDLE_TYPE_DOUBLE - dynload::curandGenerateUniform(t_resource.gen, dest_d, num), + dynload::curandGenerateUniform(t_resource.gen, dest_d, num), #else - dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num), + dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num), #endif - CURAND_STATUS_SUCCESS); + CURAND_STATUS_SUCCESS); pthread_mutex_unlock(t_resource.gen_mutex); CHECK_SYNC("hl_rand failed"); } void hl_srand(unsigned int seed) { pthread_mutex_lock(t_resource.gen_mutex); - 
CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed( - t_resource.gen, seed), CURAND_STATUS_SUCCESS); + CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed), + CURAND_STATUS_SUCCESS); pthread_mutex_unlock(t_resource.gen_mutex); } -void hl_set_sync_flag(bool flag) { - g_sync_flag = flag; -} +void hl_set_sync_flag(bool flag) { g_sync_flag = flag; } -bool hl_get_sync_flag() { - return g_sync_flag; -} +bool hl_get_sync_flag() { return g_sync_flag; } void hl_stream_synchronize(hl_stream_t stream) { cudaStream_t cu_stream; - CHECK_LT(stream, HPPL_STREAM_END) - << __func__ <<": the parameter stream is error."; + CHECK_LT(stream, HPPL_STREAM_END) << __func__ + << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream)); @@ -664,8 +638,8 @@ void hl_stream_synchronize(hl_stream_t stream) { void hl_create_event(hl_event_t *event) { CHECK_NOTNULL(event); - struct _hl_event_st* st_event = - (struct _hl_event_st*)malloc(sizeof(struct _hl_event_st)); + struct _hl_event_st *st_event = + (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st)); CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event)); @@ -677,8 +651,8 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) { CHECK_NOTNULL(start); CHECK_NOTNULL(end); - CHECK_CUDA(dynload::cudaEventElapsedTime(&time, - start->cu_event, end->cu_event)); + CHECK_CUDA( + dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event)); return time; } @@ -686,24 +660,22 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) { cudaStream_t cu_stream; CHECK_NOTNULL(event); - CHECK_LT(stream, HPPL_STREAM_END) - << __func__ <<": the parameter stream is error."; + CHECK_LT(stream, HPPL_STREAM_END) << __func__ + << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaEventRecord( - event->cu_event, cu_stream)); + CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream)); } void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { cudaStream_t cu_stream; CHECK_NOTNULL(event); - CHECK_LT(stream, HPPL_STREAM_END) - << __func__ <<": the parameter stream is error."; + CHECK_LT(stream, HPPL_STREAM_END) << __func__ + << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaStreamWaitEvent( - cu_stream, event->cu_event, 0)); + CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); } void hl_destroy_event(hl_event_t event) { @@ -722,15 +694,15 @@ void hl_event_synchronize(hl_event_t event) { void hl_get_device_name(char *name, int len, int device) { CHECK_NOTNULL(name); CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device("<< device <<") is not specified in startup."; + << "Device(" << device << ") is not specified in startup."; - strncpy(name, g_device[device]->device_name , len); + strncpy(name, g_device[device]->device_name, len); } void hl_get_device_memory(size_t *mem_size, int device) { CHECK_NOTNULL(mem_size); CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device("<< device <<") is not specified in startup."; + << "Device(" << device << ") is not specified in startup."; *mem_size = g_device[device]->device_mem; } @@ -739,31 +711,26 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) { CHECK_NOTNULL(major); CHECK_NOTNULL(minor); CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device("<< device << ") is not 
specified in startup."; + << "Device(" << device << ") is not specified in startup."; *major = g_device[device]->major; *minor = g_device[device]->minor; } -int hl_get_device_last_error() { - return (int)dynload::cudaGetLastError(); -} +int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); } -const char* hl_get_device_error_string() { +const char *hl_get_device_error_string() { cudaError_t err = dynload::cudaGetLastError(); return dynload::cudaGetErrorString(err); } -const char* hl_get_device_error_string(size_t err) { +const char *hl_get_device_error_string(size_t err) { return dynload::cudaGetErrorString((cudaError_t)err); } -void hl_device_synchronize() { - CHECK_CUDA(dynload::cudaDeviceSynchronize()); -} +void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); } void hl_set_device_flags_block() { - CHECK_CUDA(dynload::cudaSetDeviceFlags( - cudaDeviceScheduleBlockingSync)); + CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); } bool hl_cuda_event_is_ready(hl_event_t event) { diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 067e68c41e11986fd740ea1a524763f8b1bd4c0c..0b7cd3375671d58464dac93458ec6659add8b730 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -18,8 +18,10 @@ limitations under the License. */ #include "hl_matrix_ops.cuh" #include "hl_matrix_apply.cuh" #include "hl_sequence.h" +#include "hl_sparse.ph" #include "paddle/utils/Logging.h" #include "hl_device_functions.cuh" +#include "hl_gpu_matrix_kernel.cuh" DEFINE_MATRIX_UNARY_OP(Zero, a = 0); DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); @@ -316,6 +318,85 @@ void hl_matrix_classification_error(real* A_d, CHECK_SYNC("hl_matrix_classification_error"); } +__global__ void KeMatrixMultiBinaryCrossEntropy(real* output, + real* entropy, + int* row, + int* col, + int dimM, + int dimN) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < dimM) { + for (int i = 0; i < dimN; i ++) { + entropy[index] -= log(1 - output[index * dimN + i]); + } + int *row_col = col + row[index]; + int col_num = row[index + 1] - row[index]; + for (int i = 0; i < col_num; i ++) { + real o = output[index * dimN + row_col[i]]; + entropy[index] -= log(o / (1 - o)); + } + } +} + +void hl_matrix_multi_binary_cross_entropy(real* output, + real* entropy, + hl_sparse_matrix_s csr_mat, + int dimM, + int dimN) { + CHECK_NOTNULL(output); + CHECK_NOTNULL(entropy); + CHECK_NOTNULL(csr_mat); + CHECK_EQ(csr_mat->format, HL_SPARSE_CSR); + int n_threads = 1024; + int blocks = (dimM + n_threads - 1) / n_threads; + dim3 threads(n_threads); + dim3 grid(blocks); + hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); + KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>> + (output, entropy, mat->csr_row, mat->csr_col, dimM, dimN); + CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed"); +} + +__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output, + real* grad, + int* row, + int* col, + int dimM, + int dimN) { + int row_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (row_idx < dimM) { + for (int i = 0; i < dimN; i ++) { + int index = row_idx * dimN + i; + grad[index] += 1.0 / (1 - output[index]); + } + int col_num = row[row_idx + 1] - row[row_idx]; + int *row_col = col + row[row_idx]; + for (int i = 0; i < col_num; i ++) { + int index = row_idx * dimN + row_col[i]; + grad[index] -= 1.0 / (output[index] * (1 - output[index])); + } + } +} + +void 
hl_matrix_multi_binary_cross_entropy_bp(real* output, + real* grad, + hl_sparse_matrix_s csr_mat, + int dimM, + int dimN) { + CHECK_NOTNULL(output); + CHECK_NOTNULL(grad); + CHECK_NOTNULL(csr_mat); + CHECK_EQ(csr_mat->format, HL_SPARSE_CSR); + int n_threads = 1024; + int blocks = (dimM + n_threads - 1) / n_threads; + dim3 threads(n_threads); + dim3 grid(blocks); + hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix); + KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>> + (output, grad, mat->csr_row, mat->csr_col, dimM, dimN); + CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed"); +} + __global__ void KeMatrixCrossEntropy(real* O, real* E, int* label, @@ -673,3 +754,89 @@ void hl_cossim_derivative(real* grad, input1_height, input2_height, scale); CHECK_SYNC("hl_cossim_derivate failed"); } + +__global__ void KeMatrixAddSharedBias(real* A, + real* B, + const int channel, + const int M, + const int N, + real scale) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int dim = N / channel; + if (index < M * N) { + int i = index % N; + i = i / dim; + A[index] += scale * B[i]; + } +} + +void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale) { + const int blocks = 512; + const int grids = DIVUP(dimM * dimN, blocks); + KeMatrixAddSharedBias<<>> + (A_d, B_d, channel, dimM, dimN, scale); + CHECK_SYNC("hl_matrix_add_shared_bias failed"); +} + + +template +__global__ void KeMatrixCollectSharedBias(real *B, + real *A, + const int channel, + const int M, + const int N, + const int dim, + const int limit, + real scale) { + if (dim < limit) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < channel) { + real sum = 0.0; + for (int i = 0; i < M; ++i) { + for (int j = 0; j < dim; ++j) { + sum += A[i * N + index * dim + j]; + } + } + B[index] += scale * sum; + } + } else { + const int tid = threadIdx.x; + const int bid = blockIdx.x; + __shared__ real smem[blockSize]; + real sum = 0.0; + for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) { + int n = j * blockSize + tid; + int m = n / dim; + int w = n % dim; + smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; + __syncthreads(); + simpleReduce(smem, tid, blockSize); + sum += smem[0]; + } + if (tid == 0) { + B[bid] += scale * sum; + } + } +} + +void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale) { + const int dim = dimN / channel; + const int blocks = 256; + const int limit = 64; + int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; + + KeMatrixCollectSharedBias + <<< grids, blocks, 0, STREAM_DEFAULT>>> + (B_d, A_d, channel, dimM, dimN, dim, limit, scale); + CHECK_SYNC("hl_matrix_collect_shared_bias failed"); +} diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh index c3b98f4ebc38db055e3ac90691021665cbd97ced..9cf2d5a843343075c33d19bf34d9ed315299de83 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cuh +++ b/paddle/cuda/src/hl_cuda_sparse.cuh @@ -908,24 +908,6 @@ int findIndex(int* indice, int num, int index) { return (end - 1); } -/** - * @brief sum reduction - * - * @param[in,out] smem input data, better to use __shared__ memory. - * @param[in] tid local thread index. - * @param[in] blockDimX the size of blockDim.x. - * - * note: return smem[0]: the sum of each elements of smem. 
- */ -__device__ __forceinline__ -void reduce(real* smem, int tid, int blockDimX) { - for (unsigned int s = blockDimX / 2; s > 0; s >>= 1) { - if (tid < s) { - smem[tid] += smem[tid + s]; - } - __syncthreads(); - } -} /** * @brief sum columns of csr sparse matrix (csr_val), then add to a_val. diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc index 27bbd03bc328293d978867c6badddc13a754ece2..ff6b830b7addc5c87af0d55070260c279a046a75 100644 --- a/paddle/cuda/src/hl_cudart_wrap.cc +++ b/paddle/cuda/src/hl_cudart_wrap.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifdef PADDLE_USE_DSO #include @@ -29,26 +28,26 @@ limitations under the License. */ namespace dynload { extern std::once_flag cudart_dso_flag; -extern void* cudart_dso_handle; +extern void *cudart_dso_handle; /** * The following macro definition can generate structs * (for each function) to dynamic load cuda routine * via operator overloading. **/ -#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \ - struct DynLoad__##__name { \ - template \ - __type operator()(Args... args) { \ - typedef __type (*cudartFunc)(Args...); \ - std::call_once(cudart_dso_flag, GetCudartDsoHandle, \ - &cudart_dso_handle); \ - void* p_##__name = dlsym(cudart_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ +#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \ + struct DynLoad__##__name { \ + template \ + __type operator()(Args... args) { \ + typedef __type (*cudartFunc)(Args...); \ + std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \ + void *p_##__name = dlsym(cudart_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ /* include all needed cuda functions in HPPL */ +// clang-format off #define CUDA_ROUTINE_EACH(__macro) \ __macro(cudaLaunch, cudaError_t) \ __macro(cudaSetupArgument, cudaError_t) \ @@ -61,16 +60,17 @@ extern void* cudart_dso_handle; __macro(__cudaInitModule, char) \ __macro(__cudaRegisterTexture, void) \ __macro(__cudaRegisterSurface, void) +// clang-format on CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP) #if CUDART_VERSION >= 7000 - DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t) +DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t) #endif #undef CUDA_ROUNTINE_EACH -} /* namespace dynload */ +} /* namespace dynload */ #if CUDART_VERSION >= 7000 __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, @@ -78,131 +78,120 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 blockDim, void **args, size_t sharedMem, - cudaStream_t stream) -{ - return dynload::cudaLaunchKernel(func, gridDim, blockDim, args, sharedMem, stream); + cudaStream_t stream) { + return dynload::cudaLaunchKernel( + func, gridDim, blockDim, args, sharedMem, stream); } #endif /* CUDART_VERSION >= 7000 */ - -__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) -{ +__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) { return dynload::cudaLaunch(func); } __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, - size_t offset) -{ + size_t offset) { return dynload::cudaSetupArgument(arg, size, offset); } __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, - cudaStream_t stream) -{ - return 
dynload::cudaConfigureCall(gridDim, blockDim, - sharedMem, stream); + cudaStream_t stream) { + return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream); } extern "C" { -void** CUDARTAPI __cudaRegisterFatBinary( - void *fatCubin -) -{ +void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { return dynload::__cudaRegisterFatBinary(fatCubin); - } -void CUDARTAPI __cudaUnregisterFatBinary( - void **fatCubinHandle -) -{ +void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) { return dynload::__cudaUnregisterFatBinary(fatCubinHandle); } -void CUDARTAPI __cudaRegisterFunction( - void **fatCubinHandle, - const char *hostFun, - char *deviceFun, - const char *deviceName, - int thread_limit, - uint3 *tid, - uint3 *bid, - dim3 *bDim, - dim3 *gDim, - int *wSize -) { - return dynload::__cudaRegisterFunction( - fatCubinHandle, hostFun, deviceFun, deviceName, - thread_limit, tid, bid, bDim, gDim, wSize); +void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle, + const char *hostFun, + char *deviceFun, + const char *deviceName, + int thread_limit, + uint3 *tid, + uint3 *bid, + dim3 *bDim, + dim3 *gDim, + int *wSize) { + return dynload::__cudaRegisterFunction(fatCubinHandle, + hostFun, + deviceFun, + deviceName, + thread_limit, + tid, + bid, + bDim, + gDim, + wSize); } -void CUDARTAPI __cudaRegisterVar( - void **fatCubinHandle, - char *hostVar, - char *deviceAddress, - const char *deviceName, - int ext, - int size, - int constant, - int global -) { - return dynload::__cudaRegisterVar( - fatCubinHandle, hostVar, deviceAddress, - deviceName, ext, size, constant, global); +void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle, + char *hostVar, + char *deviceAddress, + const char *deviceName, + int ext, + int size, + int constant, + int global) { + return dynload::__cudaRegisterVar(fatCubinHandle, + hostVar, + deviceAddress, + deviceName, + ext, + size, + constant, + global); } - - -extern void CUDARTAPI __cudaRegisterManagedVar( - void **fatCubinHandle, - void **hostVarPtrAddress, - char *deviceAddress, - const char *deviceName, - int ext, - int size, - int constant, - int global -) { - return dynload::__cudaRegisterManagedVar( - fatCubinHandle, hostVarPtrAddress, deviceAddress, - deviceName, ext, size, constant, global); +extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle, + void **hostVarPtrAddress, + char *deviceAddress, + const char *deviceName, + int ext, + int size, + int constant, + int global) { + return dynload::__cudaRegisterManagedVar(fatCubinHandle, + hostVarPtrAddress, + deviceAddress, + deviceName, + ext, + size, + constant, + global); } -char CUDARTAPI __cudaInitModule( - void **fatCubinHandle -) { +char CUDARTAPI __cudaInitModule(void **fatCubinHandle) { return dynload::__cudaInitModule(fatCubinHandle); } -void CUDARTAPI __cudaRegisterTexture( - void **fatCubinHandle, - const struct textureReference *hostVar, - const void **deviceAddress, - const char *deviceName, - int dim, - int norm, - int ext -) { +void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle, + const struct textureReference *hostVar, + const void **deviceAddress, + const char *deviceName, + int dim, + int norm, + int ext) { return dynload::__cudaRegisterTexture( - fatCubinHandle, hostVar, deviceAddress, - deviceName, dim, norm, ext); + fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext); } -void CUDARTAPI __cudaRegisterSurface( - void **fatCubinHandle, - const struct surfaceReference *hostVar, - const void **deviceAddress, - const char *deviceName, - int dim, 
- int ext -) { +void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle, + const struct surfaceReference *hostVar, + const void **deviceAddress, + const char *deviceName, + int dim, + int ext) { return dynload::__cudaRegisterSurface( - fatCubinHandle, hostVar, deviceAddress, - deviceName, dim, ext); + fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext); } } /* extern "C" */ diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index eee9984e07326668a49fd2627e361804a6aacd7b..1a3ce08619fc3a5787576b30e9f4c13336990e74 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -12,27 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_dso_loader.h" -#include "paddle/utils/Logging.h" #include "paddle/utils/CommandLineParser.h" +#include "paddle/utils/Logging.h" -P_DEFINE_string(cudnn_dir, "", +P_DEFINE_string(cudnn_dir, + "", "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib64. If empty [default], dlopen will search " - "cudnn from LD_LIBRARY_PATH"); + "/usr/local/cudnn/lib. If empty [default], dlopen " + "will search cudnn from LD_LIBRARY_PATH"); -P_DEFINE_string(cuda_dir, "", +P_DEFINE_string(cuda_dir, + "", "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. " - "(Note: libcudart can not be specified by cuda_dir, since some " + "libcurand. For instance, /usr/local/cuda/lib64. (Note: " + "libcudart can not be specified by cuda_dir, since some " "build-in function in cudart already ran before main entry). " - "If empty [default], dlopen will search cuda from LD_LIBRARY_PATH"); + "If default, dlopen will search cuda from LD_LIBRARY_PATH"); -static inline std::string join(const std::string& part1, const std::string& part2) { +static inline std::string join(const std::string& part1, + const std::string& part2) { // directory separator const char sep = '/'; - if (!part2.empty() && part2.front() == sep) { return part2; } @@ -46,63 +47,115 @@ static inline std::string join(const std::string& part1, const std::string& part return ret; } -static inline void GetDsoHandleWithSearchPath( - const std::string& search_root, - const std::string& dso_path, - void** dso_handle) { - int dynload_flags = RTLD_LAZY | RTLD_LOCAL; - *dso_handle = nullptr; - - std::string dlPath = dso_path; - if (search_root.empty()) { - // default search xxx.so from LD_LIBRARY_PATH - *dso_handle = dlopen(dlPath.c_str(), dynload_flags); - } else { - // search xxx.so from custom path - dlPath = join(search_root, dso_path); - *dso_handle = dlopen(dlPath.c_str(), dynload_flags); - // then, search xxx.so from LD_LIBRARY_PATH - if (nullptr == *dso_handle) { - *dso_handle = dlopen(dso_path.c_str(), dynload_flags); - } +static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, + void** dso_handle, + int dynload_flags) { + VLOG(3) << "Try to find cuda library: " << dso_path + << " from default system path."; + // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + +// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to +// bring System Integrity Projection (SIP), if dso_handle +// is null, search from default package path in Mac OS. 
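The loader refactor above splits library lookup into two stages: first try the user-supplied directory (e.g. --cuda_dir), then fall back to the dynamic linker's default search path. Below is a minimal stand-alone sketch of that two-stage dlopen pattern, assuming a POSIX dlfcn environment; OpenSharedLibrary is an illustrative name, not PaddlePaddle API.

#include <dlfcn.h>

#include <iostream>
#include <string>

// Try <search_root>/<dso_name> first, then fall back to the linker's
// default lookup (LD_LIBRARY_PATH on Linux, DYLD_LIBRARY_PATH on macOS).
static void* OpenSharedLibrary(const std::string& search_root,
                               const std::string& dso_name) {
  const int flags = RTLD_LAZY | RTLD_LOCAL;
  if (!search_root.empty()) {
    std::string full_path = search_root + "/" + dso_name;
    if (void* handle = dlopen(full_path.c_str(), flags)) {
      return handle;
    }
    std::cerr << "Failed to find library: " << full_path << "\n";
  }
  return dlopen(dso_name.c_str(), flags);
}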
+#if defined(__APPLE__) || defined(__OSX__) + if (nullptr == *dso_handle) { + dso_path = join("/usr/local/cuda/lib/", dso_path); + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + if (nullptr == *dso_handle) { + if (dso_path == "libcudnn.dylib") { + LOG(FATAL) + << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" // NOLINT + << "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C " // NOLINT + << "/usr/local \n sudo chmod a+r " + "/usr/local/cuda/include/cudnn.h " // NOLINT + << "/usr/local/cuda/lib/libcudnn*"; + } } + } +#endif +} + +static inline void GetDsoHandleFromSearchPath(const std::string& search_root, + const std::string& dso_name, + void** dso_handle) { + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; + *dso_handle = nullptr; + + std::string dlPath = dso_name; + if (search_root.empty()) { + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } else { + // search xxx.so from custom path + dlPath = join(search_root, dso_name); + *dso_handle = dlopen(dlPath.c_str(), dynload_flags); + // if not found, search from default path + if (nullptr == *dso_handle) { + LOG(WARNING) << "Failed to find cuda library: " << dlPath; + dlPath = dso_name; + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); + } + } - CHECK(nullptr != *dso_handle) - << "For Gpu version of PaddlePaddle, it couldn't find CUDA library: " - << dlPath.c_str() << ". Please make sure you already specify its path. " - << "Note: for training data on Cpu using Gpu version of PaddlePaddle, " - << "you must specify libcudart via export LD_LIBRARY_PATH for Linux or " - << "export DYLD_LIBRARY_PATH for MAC OS."; + CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath + << std::endl + << "Please specify its path correctly using " + "one of the following ways: \n" // NOLINT + + << "Method 1. set cuda and cudnn lib path at " + "runtime. " + << "http://www.paddlepaddle.org/doc/ui/" + "cmd_argument/" + "argument_outline.html \n" // NOLINT + << "For instance, issue command: paddle train " + "--use_gpu=1 " + << "--cuda_dir=/usr/local/cuda/lib64 " + "--cudnn_dir=/usr/local/cudnn/lib " + "...\n" // NOLINT + + << "Method 2. set environment variable " + "LD_LIBRARY_PATH on Linux or " + << "DYLD_LIBRARY_PATH on Mac OS. \n" + << "For instance, issue command: export " + "LD_LIBRARY_PATH=... \n" + + << "Note: After Mac OS 10.11, using the " + "DYLD_LIBRARY_PATH is impossible " + << "unless System Integrity Protection (SIP) " + "is disabled. 
However, " + "method 1 " // NOLINT + << "always work well."; } void GetCublasDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); #else - GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); #endif } void GetCudnnDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); #else - GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); #endif } void GetCudartDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle); + GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle); #else - GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle); + GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle); #endif } void GetCurandDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); #else - GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); #endif } diff --git a/paddle/cuda/src/hl_math.cc b/paddle/cuda/src/hl_math.cc index 76d48c4a9b94d402cf84c57bd240e03a1a83b1a0..f4bf888bab4e92dd940714ef1b7aeee9242eb817 100644 --- a/paddle/cuda/src/hl_math.cc +++ b/paddle/cuda/src/hl_math.cc @@ -12,24 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "avx_mathfun.h" namespace hppl { -__m256 exp(__m256 a) { - return exp256_ps(a); -} +__m256 exp(__m256 a) { return exp256_ps(a); } -__m256 log(__m256 a) { - return log256_ps(a); -} +__m256 log(__m256 a) { return log256_ps(a); } -__m256 sin(__m256 a) { - return sin256_ps(a); -} +__m256 sin(__m256 a) { return sin256_ps(a); } -__m256 cos(__m256 a) { - return cos256_ps(a); -} +__m256 cos(__m256 a) { return cos256_ps(a); } } // namespace hppl diff --git a/paddle/cuda/src/hl_time.cc b/paddle/cuda/src/hl_time.cc index adc88d60dd8d547cedcae5fd088b2fa581d8e5be..d52b2a1df07374f632def12eb52e10e10ca86028 100644 --- a/paddle/cuda/src/hl_time.cc +++ b/paddle/cuda/src/hl_time.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include @@ -21,8 +20,7 @@ limitations under the License. 
*/ using std::chrono::high_resolution_clock; int64_t getCurrentTimeStick() { - high_resolution_clock::time_point tp = high_resolution_clock::now(); - high_resolution_clock::duration dtn = tp.time_since_epoch(); - return dtn.count(); + high_resolution_clock::time_point tp = high_resolution_clock::now(); + high_resolution_clock::duration dtn = tp.time_since_epoch(); + return dtn.count(); } - diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 9ac4d210f6d376639df20800b6782f1f8c03d6aa..a066f80c221ee8ab4383ee6463f7b111984b58ff 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -50,7 +50,7 @@ if(NOT WITH_PYTHON) endif() if(WITH_GPU) - add_paddle_culib(paddle_gserver ${GSERVER_SOURCES}) + cuda_add_library(paddle_gserver ${GSERVER_SOURCES}) else() add_library(paddle_gserver STATIC ${GSERVER_SOURCES}) diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index 9918d20d9082ae6c07684ce05eba68c4989dd5d5..f1bb94216c44b3e915f87a3ae49bdfd3ef812916 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -51,12 +51,14 @@ static ClassRegistrar gActivationRegistrar; * @brief Macro for registering a derived activation class */ #define END_DEFINE_ACTIVATION(ACTIVATION_NAME) \ - }; \ + } \ + ; \ const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \ #ACTIVATION_NAME; \ static InitFunction __reg_activation__##ACTIVATION_NAME([] { \ - gActivationRegistrar.registerClass< \ - ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(#ACTIVATION_NAME); \ + gActivationRegistrar \ + .registerClass( \ + #ACTIVATION_NAME); \ }); /** @@ -111,14 +113,22 @@ void backward(Argument& act) { outputG->softmaxBackward(*outputV); } else { SetDevice device(act.deviceId); - Matrix::resizeOrCreate(sftMaxDot_, outputG->getHeight(), + Matrix::resizeOrCreate(sftMaxDot_, + outputG->getHeight(), outputG->getWidth(), - /* trans */ false, useGpu(act.deviceId)); - Matrix::resizeOrCreate(sftMaxSum_, outputG->getHeight(), 1, - /* trans */ false, useGpu(act.deviceId)); + /* trans */ false, + useGpu(act.deviceId)); + Matrix::resizeOrCreate(sftMaxSum_, + outputG->getHeight(), + 1, + /* trans */ false, + useGpu(act.deviceId)); if (!one_ || one_->getWidth() != outputG->getWidth()) { - Matrix::resizeOrCreate(one_, 1, outputG->getWidth(), - /* trans */ false, useGpu(act.deviceId)); + Matrix::resizeOrCreate(one_, + 1, + outputG->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); one_->one(); } @@ -130,7 +140,6 @@ void backward(Argument& act) { } END_DEFINE_ACTIVATION(softmax) - /** * @brief Sequence_softmax Activation * @note Softmax on all frames of one sequence. 
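The END_DEFINE_ACTIVATION rewrite above only changes layout for clang-format; the underlying idiom is unchanged: each macro expansion registers a factory under the activation's name in a ClassRegistrar, which create() and the new getAllRegisteredTypes() consult. A simplified sketch of that registrar follows, with stand-in types rather than the real Paddle classes.

#include <functional>
#include <map>
#include <memory>
#include <string>

struct Activation {
  virtual ~Activation() {}
};

class Registrar {
public:
  // Map an activation name to a factory for the concrete class.
  template <class T>
  void registerClass(const std::string& name) {
    creators_[name] = [] { return std::unique_ptr<Activation>(new T()); };
  }

  std::unique_ptr<Activation> createByType(const std::string& name) const {
    auto it = creators_.find(name);
    return it == creators_.end() ? nullptr : it->second();
  }

  // Counterpart of the forEachType() walk behind getAllRegisteredTypes().
  template <class Fn>
  void forEachType(Fn fn) const {
    for (auto& kv : creators_) fn(kv.first);
  }

private:
  std::map<std::string, std::function<std::unique_ptr<Activation>()>> creators_;
};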
@@ -146,10 +155,16 @@ void forward(Argument& act) { CHECK_EQ(act.value->getWidth(), 1UL); if (!argument_.value) { - argument_.value = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, useGpu(act.deviceId)); - argument_.grad = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, useGpu(act.deviceId)); + argument_.value = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + useGpu(act.deviceId)); + argument_.grad = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + useGpu(act.deviceId)); } auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId)); @@ -267,8 +282,11 @@ END_DEFINE_ACTIVATION(softrelu) BEGIN_DEFINE_ACTIVATION(abs) void forward(Argument& act) { SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(), - /* trans */ false, useGpu(act.deviceId)); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); act.in->copyFrom(*act.value); act.value->abs(*act.value); @@ -286,8 +304,11 @@ END_DEFINE_ACTIVATION(abs) BEGIN_DEFINE_ACTIVATION(square) void forward(Argument& act) { SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(), - /* trans */ false, useGpu(act.deviceId)); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); act.in->copyFrom(*act.value); act.value->square(*act.value); @@ -295,6 +316,7 @@ void forward(Argument& act) { void backward(Argument& act) { act.grad->squareDerivative(*act.in); } END_DEFINE_ACTIVATION(square) + /** * @brief Exponential Activation. * \f[ @@ -307,8 +329,37 @@ void forward(Argument& act) { act.value->exp(*act.value); } void backward(Argument& act) { act.grad->expDerivative(*act.value); } END_DEFINE_ACTIVATION(exponential) +/** + * @brief Logarithm Activation. + * \f[ + * f(z) = log(z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(log) +void forward(Argument& act) { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); + + act.in->copyFrom(*act.value); + act.value->log(*act.value); +} + +void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); } +END_DEFINE_ACTIVATION(log) + ActivationFunction* ActivationFunction::create(const std::string& type) { return gActivationRegistrar.createByType(type); } +std::vector ActivationFunction::getAllRegisteredTypes() { + std::vector types; + gActivationRegistrar.forEachType( + [&](const std::string& type) { types.push_back(type); }); + return types; +} + } // namespace paddle diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h index 29860b4a736c37dee70c56731820a4197ea4cdbe..e9ed5c619ab5e4dd9c52c0dac24478c2a57aa1bf 100644 --- a/paddle/gserver/activations/ActivationFunction.h +++ b/paddle/gserver/activations/ActivationFunction.h @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include +#include namespace paddle { @@ -32,6 +32,7 @@ struct Argument; class ActivationFunction { public: static ActivationFunction* create(const std::string& type); + static std::vector getAllRegisteredTypes(); ActivationFunction() {} diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index 8cefbb30ada46d1ff1b0a4952dde0aeafb5419b1..e6cc4a246a8494d287f8638674f4ae213f38f657 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "DataProvider.h" #include "paddle/utils/Util.h" @@ -57,7 +56,7 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) { } } -DoubleBuffer::DoubleBuffer(DataProvider *dataPool, +DoubleBuffer::DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize) { batchSize_ = batchSize; @@ -131,9 +130,10 @@ void DoubleBuffer::asyncLoadBatch() { taskReadySem_.wait(); if (stopping_) break; - while (batchSize_ == 0) { + while (batchSize_ == 0 && !stopping_) { usleep(5); } + if (stopping_) break; do { DataBatch newBatch; @@ -154,7 +154,7 @@ void DoubleBuffer::startAsyncLoad() { } ClassRegistrar -DataProvider::registrar_; + DataProvider::registrar_; DataProvider* DataProvider::create(const DataConfig& config, const ModelConfig& modelConfig, @@ -181,7 +181,8 @@ int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) { for (int i = 0; i < config_.constant_slots_size(); ++i) { MemoryHandlePtr handle = constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr; - Matrix::resizeOrCreate(constantSlots[i], batchSize, + Matrix::resizeOrCreate(constantSlots[i], + batchSize, 1, // = width false, // = trans useGpu_); // = useGpu @@ -215,7 +216,8 @@ void DataProvider::initAsyncLoader() { } SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config, - bool useGpu, bool withInfo) + bool useGpu, + bool withInfo) : DataProvider(config, useGpu) { /* initialize the size of a sample, and the buffer */ sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1); @@ -336,7 +338,8 @@ int64_t SimpleDataProviderBase::fillBuffer() { sampleNumInBuf_ = n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_, hInputLabelBuf_->getData() + n, - hInputInfoBuf_->getData() + n, bufferCapacity_ - n); + hInputInfoBuf_->getData() + n, + bufferCapacity_ - n); /* for stachastic gradient training */ if (!skipShuffle_) { @@ -356,11 +359,14 @@ SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu) SimpleDataProvider::~SimpleDataProvider() {} -int64_t SimpleDataProvider::fillBufferImp(real* data, int* label, int* info, +int64_t SimpleDataProvider::fillBufferImp(real* data, + int* label, + int* info, int64_t size) { (void)info; int64_t n = std::min(labels_.size() - currentSampleIndex_, size); - memcpy(data, &data_[currentSampleIndex_ * sampleDim_], + memcpy(data, + &data_[currentSampleIndex_ * sampleDim_], n * sampleDim_ * sizeof(real)); memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n); currentSampleIndex_ += n; diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 112e45de1cb232097ed63b120d5ac631b37952e9..8b7fb27f821a47d830413eced79b3352a6969c90 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h 
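Note the shutdown fix folded into DoubleBuffer::asyncLoadBatch() above: the inner wait now also tests stopping_, and the loop breaks out once a stop is requested instead of spinning on an empty batch size. A self-contained sketch of the corrected wait, with simplified stand-ins for the DoubleBuffer members:

#include <unistd.h>

#include <atomic>

struct AsyncLoader {
  std::atomic<bool> stopping_{false};
  std::atomic<long> batchSize_{0};

  void asyncLoadLoop() {
    while (!stopping_) {
      // Before the fix, this inner wait tested only batchSize_, so a stop
      // request that arrived while batchSize_ was still 0 left the loader
      // thread spinning here forever.
      while (batchSize_ == 0 && !stopping_) {
        usleep(5);
      }
      if (stopping_) break;
      // ... produce one batch of batchSize_ samples ...
    }
  }
};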
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -44,15 +43,15 @@ namespace paddle { * @brief Macro for registering a data provider. The class type should contain * a consturctor with parameter (DataConfig, bool). */ -#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\ - static InitFunction __reg_type_##__type_name([]() {\ - DataProvider::registrar_.registerClass(\ - #__type_name, \ - [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \ - DataProvider* dp = new __class_name (conf, useGpu);\ - return dp;\ - });\ -}) +#define REGISTER_DATA_PROVIDER(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([]() { \ + DataProvider::registrar_.registerClass( \ + #__type_name, \ + [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \ + DataProvider* dp = new __class_name(conf, useGpu); \ + return dp; \ + }); \ + }) /** * @def REGISTER_DATA_PROVIDER_EX @@ -61,8 +60,8 @@ namespace paddle { */ #define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \ static InitFunction __reg_type_##__type_name([] { \ - DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ -}) + DataProvider::registrar_.registerClass<__class_name>(#__type_name); \ + }) class DataBatch; class BufferBatch; @@ -181,7 +180,8 @@ public: * @param[in] size DataBatch.getSize() * @param[in] dataId sub dataprovider id (in MultiDataProvider) */ - void appendArguments(const std::vector& argus, int size, + void appendArguments(const std::vector& argus, + int size, int dataId) { size_ += size; for (const auto& argu : argus) { @@ -259,9 +259,7 @@ typedef Queue BufferBatchQueue; class DoubleBuffer { public: - DoubleBuffer(DataProvider* dataPool, - bool useGpu, - int64_t batchSize = 0); + DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0); virtual ~DoubleBuffer(); void removeOneBatch(DataBatch* dataBatch); @@ -310,7 +308,7 @@ public: /** * @brief create only used for unittest. */ - inline static DataProvider* create(const DataConfig &config, + inline static DataProvider* create(const DataConfig& config, bool useGpu = FLAGS_use_gpu) { return create(config, ModelConfig(), useGpu); } @@ -462,7 +460,9 @@ protected: * * label[n] is the label for the n-th sample. */ - virtual int64_t fillBufferImp(real* data, int* label, int* info, + virtual int64_t fillBufferImp(real* data, + int* label, + int* info, int64_t size) = 0; }; @@ -475,7 +475,9 @@ public: protected: void loadData(const std::string& fileName); void loadDataFile(const std::string& fileName); - virtual int64_t fillBufferImp(real* data, int* label, int* info, + virtual int64_t fillBufferImp(real* data, + int* label, + int* info, int64_t size); protected: diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h index 0689f90f3e7dd3d3e1df19f3958c821d53e69700..6c178e29ee714a6bd7f58861d7cf15716fee848d 100644 --- a/paddle/gserver/dataproviders/DataProviderGroup.h +++ b/paddle/gserver/dataproviders/DataProviderGroup.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "DataProvider.h" @@ -65,8 +64,8 @@ void DataProviderGroup::reset() { provider_ = nullptr; // shuffle file list - std::shuffle(fileList_.begin(), fileList_.end(), - ThreadLocalRandomEngine::get()); + std::shuffle( + fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get()); startLoader(); DataProvider::reset(); @@ -113,8 +112,9 @@ void DataProviderGroup::startLoader() { size_t endPos = std::min(fileList_.size(), startPos + loadFileCount); std::vector fileVec(fileList_.begin() + startPos, fileList_.begin() + endPos); - loader_->addJob([this, fileVec]() - -> ProviderPtrType { return this->loadFile(fileVec); }); + loader_->addJob([this, fileVec]() -> ProviderPtrType { + return this->loadFile(fileVec); + }); } loader_->stopAddJob(); } diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp index 8e4f53978a0451f3bb6cd5da30f017708448f9ac..51fb1f26668c55dc1c2aecd5389f327e2569a52f 100644 --- a/paddle/gserver/dataproviders/MultiDataProvider.cpp +++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "MultiDataProvider.h" #include "paddle/utils/Logging.h" @@ -59,10 +58,8 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config, "MultiDataProvider"; subConfig.set_async_load_data(false); } - subDataProviders_[i] = - std::unique_ptr(DataProvider::create(subConfig, - modelConfig, - useGpu_)); + subDataProviders_[i] = std::unique_ptr( + DataProvider::create(subConfig, modelConfig, useGpu_)); } } diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h index b498ba6516c4320566b1b3cc2bd557ae016d7c39..876467c04f074cf37e48fdfa9b24f236fcfe8ba1 100644 --- a/paddle/gserver/dataproviders/MultiDataProvider.h +++ b/paddle/gserver/dataproviders/MultiDataProvider.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "DataProvider.h" diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp index 344644755f24045443b8cb3ebd08004a4b1cdcb5..0a7ff802461f2ded0e6e842c088bddf218361f79 100644 --- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp +++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ProtoDataProvider.h" #include "paddle/utils/Util.h" #include "paddle/utils/StringUtil.h" @@ -23,7 +22,8 @@ limitations under the License. 
*/ #include "paddle/utils/Logging.h" #include "DataProviderGroup.h" -P_DEFINE_double(memory_threshold_on_load_data, 1.0, +P_DEFINE_double(memory_threshold_on_load_data, + 1.0, "stop loading data when memory is not sufficient"); namespace paddle { @@ -32,7 +32,8 @@ REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup); REGISTER_DATA_PROVIDER(proto_sequence_group, DataProviderGroup); -ProtoDataProvider::ProtoDataProvider(const DataConfig& config, bool useGpu, +ProtoDataProvider::ProtoDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll) : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) { if (loadDataAll) { @@ -279,7 +280,8 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) { } slot.sparseNonValueData.resize(slot.indices.back() + slotSize); const unsigned int* ids = sample.vector_slots(i).ids().data(); - memcpy(slot.sparseNonValueData.data() + slot.indices.back(), ids, + memcpy(slot.sparseNonValueData.data() + slot.indices.back(), + ids, sizeof(*ids) * slotSize); slot.indices.push_back(slot.indices.back() + slotSize); if (subSlotSize) { @@ -318,10 +320,11 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) { slot.varDenseData[oldSize].data.resize(varDim); const float* values = sample.vector_slots(i).values().data(); #ifdef PADDLE_TYPE_DOUBLE - std::copy(values, values + varDim, - slot.varDenseData[oldSize].data.data()); + std::copy( + values, values + varDim, slot.varDenseData[oldSize].data.data()); #else - memcpy(slot.varDenseData[oldSize].data.data(), values, + memcpy(slot.varDenseData[oldSize].data.data(), + values, sizeof(real) * varDim); #endif slot.varDenseData[oldSize].dims.resize( @@ -374,8 +377,9 @@ void ProtoDataProvider::reset() { } void ProtoDataProvider::shuffle() { - std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(), - ThreadLocalRandomEngine::get()); + std::shuffle(shuffledSequenceIds_.begin(), + shuffledSequenceIds_.end(), + ThreadLocalRandomEngine::get()); } /* @@ -502,7 +506,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, if (!iidData()) { ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions, - numSequences + 1, /* useGpu= */ false); + numSequences + 1, + /* useGpu= */ false); int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false); int pos = 0; int i = 0; @@ -530,7 +535,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, switch (slotType) { case SlotDef::VECTOR_DENSE: { - Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim, + Matrix::resizeOrCreate(cpuArguments[slot].value, + size, + dim, false, // trans = false false); // useGpu = false real* buf = cpuArguments[slot].value->getData(); @@ -543,19 +550,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, } case SlotDef::VECTOR_SPARSE_NON_VALUE: { if (!(cpuArguments[slot].value)) { - cpuArguments[slot].value = Matrix::createSparseMatrix( - size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE, SPARSE_CSR, - false, useGpu_); + cpuArguments[slot].value = + Matrix::createSparseMatrix(size, + dim, + size /*DEFAULT_AVG_WIDTH = 1*/, + NO_VALUE, + SPARSE_CSR, + false, + useGpu_); } auto mat = cpuArguments[slot].value; mat->resize(size, dim); if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(dataPos.data(), slots_[slot].indices.data(), - slots_[slot].sparseNonValueData.data(), HPPL_STREAM_1); + ->copyFrom(dataPos.data(), + slots_[slot].indices.data(), + slots_[slot].sparseNonValueData.data(), + HPPL_STREAM_1); } else if 
(std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(dataPos.data(), slots_[slot].indices.data(), + ->copyFrom(dataPos.data(), + slots_[slot].indices.data(), slots_[slot].sparseNonValueData.data()); } else { LOG(FATAL) << "Not Supported"; @@ -571,19 +586,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, } case SlotDef::VECTOR_SPARSE_VALUE: { if (!(cpuArguments[slot].value)) { - cpuArguments[slot].value = Matrix::createSparseMatrix( - size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, FLOAT_VALUE, - SPARSE_CSR, false, useGpu_); + cpuArguments[slot].value = + Matrix::createSparseMatrix(size, + dim, + size /*DEFAULT_AVG_WIDTH = 1*/, + FLOAT_VALUE, + SPARSE_CSR, + false, + useGpu_); } auto mat = cpuArguments[slot].value; mat->resize(size, dim); if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), slots_[slot].indices.data(), - slots_[slot].sparseFloatValueData.data(), HPPL_STREAM_1); + std::dynamic_pointer_cast(mat) + ->copyFrom(dataPos.data(), + slots_[slot].indices.data(), + slots_[slot].sparseFloatValueData.data(), + HPPL_STREAM_1); } else if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(dataPos.data(), slots_[slot].indices.data(), + ->copyFrom(dataPos.data(), + slots_[slot].indices.data(), slots_[slot].sparseFloatValueData.data()); } else { LOG(FATAL) << "Not Supported"; @@ -591,7 +614,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, break; } case SlotDef::INDEX: { - IVector::resizeOrCreate(cpuArguments[slot].ids, size, + IVector::resizeOrCreate(cpuArguments[slot].ids, + size, /* useGpu= */ false); int* buf = cpuArguments[slot].ids->getData(); for (int i = 0; i < size; ++i) { @@ -621,7 +645,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, if (oldWidth < height) { totalDim = width * height * depth; } - Matrix::resizeOrCreate(cpuArguments[slot].value, size, totalDim, + Matrix::resizeOrCreate(cpuArguments[slot].value, + size, + totalDim, false, // trans = false false); // useGpu = false real* buf = cpuArguments[slot].value->getData(); @@ -637,13 +663,13 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, } } } else { - memcpy(buf, slots_[slot].varDenseData[dataPos[0]].data.data(), + memcpy(buf, + slots_[slot].varDenseData[dataPos[0]].data.data(), sizeof(real) * totalDim); } - ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].sequenceStartPositions, - size + 1, /* size == 1 currently */ - /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, + size + 1, /* size == 1 currently */ + /* useGpu= */ false); int* bufStarts = cpuArguments[slot].sequenceStartPositions->getMutableData(false); bufStarts[0] = 0; @@ -653,16 +679,17 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, case SlotDef::VAR_MDIM_INDEX: { CHECK_EQ(size, 1); size_t totalDim = slots_[slot].varIndices[dataPos[0]].size(); - IVector::resizeOrCreate(cpuArguments[slot].ids, totalDim, + IVector::resizeOrCreate(cpuArguments[slot].ids, + totalDim, /* useGpu= */ false); int* buf = cpuArguments[slot].ids->getData(); - memcpy(buf, slots_[slot].varIndices[dataPos[0]].data(), + memcpy(buf, + slots_[slot].varIndices[dataPos[0]].data(), sizeof(int) * totalDim); - ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].sequenceStartPositions, - size + 1, /* size == 1 currently */ - /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, + size + 1, /* size == 1 currently */ + /* useGpu= */ 
false); int* bufStarts = cpuArguments[slot].sequenceStartPositions->getMutableData(false); bufStarts[0] = 0; @@ -700,8 +727,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, gpuArguments[i].sequenceStartPositions = cpuArguments[i].sequenceStartPositions; } else { - gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_, - HPPL_STREAM_1); + gpuArguments[i].resizeAndCopyFrom( + cpuArguments[i], useGpu_, HPPL_STREAM_1); } } hl_stream_synchronize(HPPL_STREAM_1); @@ -746,10 +773,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, sampleLoop(op, size); // current slot: sequenceStartPositions - ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].sequenceStartPositions, - size + 1, - /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, + size + 1, + /* useGpu= */ false); switch (slotType) { case SlotDef::VECTOR_SPARSE_VALUE: @@ -821,10 +847,10 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, }; int subSize = subSampleLoop(op, size, slot); ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].subSequenceStartPositions, subSize + 1, - false); + cpuArguments[slot].subSequenceStartPositions, subSize + 1, false); int* currPosOfArgumentSubSeqStart = - cpuArguments[slot].subSequenceStartPositions->getMutableData(false); + cpuArguments[slot].subSequenceStartPositions->getMutableData( + false); int64_t* subSeqs = dataSubPos.data(); int64_t* subIndexs = slots_[slot].subIndices.data(); int allSubSequenceLength = 0; @@ -849,7 +875,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, } case SlotDef::INDEX: { // label slot - IVector::resizeOrCreate(cpuArguments[slot].ids, size, + IVector::resizeOrCreate(cpuArguments[slot].ids, + size, /* useGpu= */ false); // fill labels int* buf = cpuArguments[slot].ids->getData(); @@ -863,7 +890,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, case SlotDef::VECTOR_DENSE: { // copy values size_t dim = header_.slot_defs(slot).dim(); - Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim, + Matrix::resizeOrCreate(cpuArguments[slot].value, + size, + dim, false, // trans = false false); // useGpu = false real* buf = cpuArguments[slot].value->getData(); @@ -887,8 +916,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, gpuArguments.resize(cpuArguments.size()); gpuBatch.setSize(size); for (size_t i = 0; i < cpuArguments.size(); ++i) { - gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_, - HPPL_STREAM_1); + gpuArguments[i].resizeAndCopyFrom( + cpuArguments[i], useGpu_, HPPL_STREAM_1); } hl_stream_synchronize(HPPL_STREAM_1); *batch = gpuBatch; diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h index 846dd7673abe8b836be1b728bb690daa0e8acc20..ffdcc8fdc977f53e29dc9f03fa3cf7af56acb92f 100644 --- a/paddle/gserver/dataproviders/ProtoDataProvider.h +++ b/paddle/gserver/dataproviders/ProtoDataProvider.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -48,7 +47,8 @@ namespace paddle { */ class ProtoDataProvider : public DataProvider { public: - ProtoDataProvider(const DataConfig& config, bool useGpu, + ProtoDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll = true); virtual void reset(); @@ -161,14 +161,16 @@ protected: }; /** - * @brief Special use for Proto data: instances should contain sparse-non-value slots + * @brief Special use for Proto data: instances should contain sparse-non-value + * slots * and label. * * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE */ class ProtoSequenceDataProvider : public ProtoDataProvider { public: - ProtoSequenceDataProvider(const DataConfig& config, bool useGpu, + ProtoSequenceDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll = true); ~ProtoSequenceDataProvider() {} virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h index 3b1eb7e9ef03c42df31c6efc9f0e0240d64e78df..b8fca3cd7f3c5efaea35dc8e09f7ca0ec250830f 100644 --- a/paddle/gserver/dataproviders/ProtoReader.h +++ b/paddle/gserver/dataproviders/ProtoReader.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -138,7 +137,8 @@ protected: * * @note this code depends on protobuf 2.4.0. There is nothing like * CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many - * bytes has the object readed so far. Therefore, we calculated bytes ourselves. + * bytes has the object readed so far. Therefore, we calculated bytes + * ourselves. */ int approximateReadedBytes_; }; diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp index 1332c0ab635b6ebec05f25fd77b9703b39227bc1..bee6ca14a2ec3995a3b432fc5a39419a5dd8a8ce 100644 --- a/paddle/gserver/dataproviders/PyDataProvider.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider.cpp @@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "PyDataProvider.h" #include "paddle/utils/PythonUtil.h" #include #include "paddle/utils/Util.h" #include "paddle/utils/Excepts.h" - namespace paddle { #ifndef PADDLE_NO_PYTHON REGISTER_DATA_PROVIDER(py, PyDataProvider); #endif -PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu, +PyDataProvider::PyDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll) : DataProvider(config, useGpu), batchSize_(0) { PyGuard guard; @@ -50,8 +49,8 @@ void PyDataProvider::loadData(const std::vector& fileList) { classInstance_ = createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_); CHECK(classInstance_) << "Create class instance failed."; - PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), - const_cast("getHeader"), NULL)); + PyObjectPtr obj(PyObject_CallMethod( + classInstance_.get(), const_cast("getHeader"), NULL)); CHECK_PY(obj) << "Call function getHeader failed."; std::string headerInfo = std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); @@ -90,7 +89,8 @@ void PyDataProvider::resetSlots() { } } -void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillDenseSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { unsigned int dim = slot.dim; slot.sampleNum = readT(data, dataEnd); @@ -102,14 +102,17 @@ void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data, float* dat = reinterpret_cast(data); std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin()); #else - memcpyWithCheck(slot.denseData.data(), data, - sizeof(real) * dim * slot.sampleNum, dataEnd); + memcpyWithCheck(slot.denseData.data(), + data, + sizeof(real) * dim * slot.sampleNum, + dataEnd); #endif // PyDataProvider always provide data in float data += sizeof(float) * dim * slot.sampleNum; } -void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { slot.sampleNum = readT(data, dataEnd); unsigned int* indexPtr = (unsigned int*)data; @@ -121,12 +124,15 @@ void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data, length = readT(data, dataEnd); slot.indices.push_back(length); slot.sparseNonValueData.resize(length); - memcpyWithCheck(slot.sparseNonValueData.data(), data, - sizeof(unsigned int) * length, dataEnd); + memcpyWithCheck(slot.sparseNonValueData.data(), + data, + sizeof(unsigned int) * length, + dataEnd); data += sizeof(unsigned int) * length; } -void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { slot.sampleNum = readT(data, dataEnd); unsigned int* indexPtr = (unsigned int*)data; @@ -153,7 +159,8 @@ void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data, } } -void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillIndexSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { slot.sampleNum = readT(data, dataEnd); CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd) @@ -163,7 +170,8 @@ void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data, data += sizeof(unsigned int) * slot.sampleNum; } -void PyDataProvider::fillStringSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillStringSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { slot.sampleNum = readT(data, dataEnd); for (unsigned int i = 0; i < slot.sampleNum; ++i) { @@ -225,9 +233,8 @@ void 
PyDataProvider::fillSlotsByStr(const std::string& samples) { } for (size_t i = 0; i < sequenceNum; ++i) { size_t begin = slot.sequenceStartPositions[i]; - size_t end = (i < sequenceNum - 1) - ? slot.sequenceStartPositions[i + 1] - : slot.sampleNum; + size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1] + : slot.sampleNum; for (size_t ii = begin; ii < end; ++ii) { slot.sampleSequenceIdVec.push_back(ii); } @@ -255,8 +262,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) { void PyDataProvider::reset() { { // Invoke PyDataProvider Reset PyGuard guard; - PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), - const_cast("reset"), NULL)); + PyObjectPtr obj(PyObject_CallMethod( + classInstance_.get(), const_cast("reset"), NULL)); CHECK_PY(obj) << "Call function reset failed."; } @@ -270,15 +277,18 @@ void PyDataProvider::reset() { void PyDataProvider::shuffle() { // py shuffle PyGuard guard; - PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), - const_cast("shuffle"), NULL)); + PyObjectPtr obj(PyObject_CallMethod( + classInstance_.get(), const_cast("shuffle"), NULL)); CHECK_PY(obj) << "Call function shuffle failed."; } -void PyDataProvider::handleDenseSlot(ProtoSlot& slot, size_t slotIndex, +void PyDataProvider::handleDenseSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments) { unsigned int dim = slot.dim; - Matrix::resizeOrCreate(cpuArguments[slotIndex].value, slot.sampleNum, dim, + Matrix::resizeOrCreate(cpuArguments[slotIndex].value, + slot.sampleNum, + dim, false, // trans = false false); // useGpu = false real* buf = cpuArguments[slotIndex].value->getData(); @@ -294,19 +304,27 @@ void PyDataProvider::handleSparseNonValueSlot( ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) { unsigned int dim = slot.dim; if (!(cpuArguments[slotIndex].value)) { - cpuArguments[slotIndex].value = Matrix::createSparseMatrix( - slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE, - SPARSE_CSR, false, useGpu_); + cpuArguments[slotIndex].value = + Matrix::createSparseMatrix(slot.sampleNum, + dim, + slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, + NO_VALUE, + SPARSE_CSR, + false, + useGpu_); } auto mat = cpuArguments[slotIndex].value; mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR); if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(), - slot.sparseNonValueData.data(), HPPL_STREAM_1); + ->copyFrom(slot.sampleSequenceIdVec.data(), + slot.indices.data(), + slot.sparseNonValueData.data(), + HPPL_STREAM_1); } else if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(), + ->copyFrom(slot.sampleSequenceIdVec.data(), + slot.indices.data(), slot.sparseNonValueData.data()); } else { LOG(FATAL) << "Not Supported"; @@ -317,28 +335,38 @@ void PyDataProvider::handleSparseValueSlot( ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) { unsigned int dim = slot.dim; if (!(cpuArguments[slotIndex].value)) { - cpuArguments[slotIndex].value = Matrix::createSparseMatrix( - slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, - FLOAT_VALUE, SPARSE_CSR, false, useGpu_); + cpuArguments[slotIndex].value = + Matrix::createSparseMatrix(slot.sampleNum, + dim, + slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, + FLOAT_VALUE, + SPARSE_CSR, + false, + useGpu_); } auto mat = cpuArguments[slotIndex].value; mat->resize(slot.sampleNum, dim, slot.sampleNum, 
FLOAT_VALUE, SPARSE_CSR); if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(), - slot.sparseFloatValueData.data(), HPPL_STREAM_DEFAULT); + ->copyFrom(slot.sampleSequenceIdVec.data(), + slot.indices.data(), + slot.sparseFloatValueData.data(), + HPPL_STREAM_DEFAULT); } else if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(), + ->copyFrom(slot.sampleSequenceIdVec.data(), + slot.indices.data(), slot.sparseFloatValueData.data()); } else { LOG(FATAL) << "Not Supported"; } } -void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex, +void PyDataProvider::handleIndexSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments) { - IVector::resizeOrCreate(cpuArguments[slotIndex].ids, slot.sampleNum, + IVector::resizeOrCreate(cpuArguments[slotIndex].ids, + slot.sampleNum, /*useGpu_*/ false); int* buf = cpuArguments[slotIndex].ids->getData(); for (size_t i = 0; i < slot.sampleNum; ++i) { @@ -346,7 +374,8 @@ void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex, } } -void PyDataProvider::handleStringSlot(ProtoSlot& slot, size_t slotIndex, +void PyDataProvider::handleStringSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments) { if (cpuArguments[slotIndex].strs) { cpuArguments[slotIndex].strs->resize(slot.sampleNum); @@ -364,7 +393,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) { PyGuard guard; PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), const_cast("getNextBatch"), - const_cast("i"), size)); + const_cast("i"), + size)); CHECK_PY(obj) << "Call function getNextBatch failed."; const std::string& samples = std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); @@ -381,23 +411,24 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) { if (!iidData()) { for (size_t j = 0; j < slotNum_; ++j) { auto& slot = slots_[j]; - ICpuGpuVector::resizeOrCreate( - cpuArguments[j].sequenceStartPositions, - slot.sequenceNum + 1, /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions, + slot.sequenceNum + 1, + /* useGpu= */ false); int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false); std::copy(slot.sequenceStartPositions.begin(), - slot.sequenceStartPositions.end(), buf); + slot.sequenceStartPositions.end(), + buf); buf[slot.sequenceStartPositions.size()] = slot.sampleNum; if (slot.subSequenceStartPositions.size()) { - ICpuGpuVector::resizeOrCreate( - cpuArguments[j].subSequenceStartPositions, - slot.subSequenceNum + 1, - /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions, + slot.subSequenceNum + 1, + /* useGpu= */ false); int* buf = - cpuArguments[j].subSequenceStartPositions->getMutableData(false); + cpuArguments[j].subSequenceStartPositions->getMutableData(false); std::copy(slot.subSequenceStartPositions.begin(), - slot.subSequenceStartPositions.end(), buf); + slot.subSequenceStartPositions.end(), + buf); buf[slot.subSequenceNum] = slot.sampleNum; // check subSequenceStartPositions and sequenceStartPositions cpuArguments[j].checkSubset(); @@ -452,8 +483,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) { cpuArguments[i].subSequenceStartPositions; } } else { - gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_, - HPPL_STREAM_1); + gpuArguments[i].resizeAndCopyFrom( + 
cpuArguments[i], useGpu_, HPPL_STREAM_1); } } hl_stream_synchronize(HPPL_STREAM_1); diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h index 939d9cf725c2fe6e4989c17e1e768c9f8aedfc95..6bb7c831fdd451abc5241199d6a4d1b1ad814517 100644 --- a/paddle/gserver/dataproviders/PyDataProvider.h +++ b/paddle/gserver/dataproviders/PyDataProvider.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -25,7 +24,8 @@ namespace paddle { class PyDataProvider : public DataProvider { public: - PyDataProvider(const DataConfig& config, bool useGpu, + PyDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll = true); virtual void reset(); @@ -48,21 +48,27 @@ protected: void parseHeaderData(const std::string& headerData); void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd); - void fillSparseNonValueSlot(ProtoSlot& slot, char*& data, + void fillSparseNonValueSlot(ProtoSlot& slot, + char*& data, const char* dataEnd); void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd); void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd); void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd); void fillSlotsByStr(const std::string& samples); - void handleDenseSlot(ProtoSlot& slot, size_t slotIndex, + void handleDenseSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); - void handleSparseNonValueSlot(ProtoSlot& slot, size_t slotIndex, + void handleSparseNonValueSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); - void handleSparseValueSlot(ProtoSlot& slot, size_t slotIndex, + void handleSparseValueSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); - void handleIndexSlot(ProtoSlot& slot, size_t slotIndex, + void handleIndexSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); - void handleStringSlot(ProtoSlot& slot, size_t slotIndex, + void handleStringSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); void resetSlots(); void loadData(const std::vector& fileList); diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index c464d01fdefd1fbbd65a7798441c53b6aed89ce6..967fc9026a39967477d606862e060b680512901a 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -14,11 +14,11 @@ limitations under the License. */ #ifndef PADDLE_NO_PYTHON +#include #include #include #include #include -#include #include #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #include @@ -34,7 +34,7 @@ namespace paddle { namespace unittest { static std::unique_ptr> - OnPoolFilled; + OnPoolFilled; namespace pydp2 { @@ -43,15 +43,11 @@ void setOnPoolFilledHook(const std::function& callback) { *OnPoolFilled = callback; } -void clearOnPoolFilledHook() { - OnPoolFilled.reset(); -} +void clearOnPoolFilledHook() { OnPoolFilled.reset(); } } // namespace pydp2 } // namespace unittest - - /** * Slot type */ @@ -65,17 +61,13 @@ enum SlotType { /** * Sequence type */ -enum SeqType { - SQT_NONE = 0, - SQT_SEQ, - SQT_SUBSEQ -}; +enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ }; /** * Cache Type. */ enum CacheType { - NO_CACHE = 0, // Each pass will load data from PyDataProvider2. + NO_CACHE = 0, // Each pass will load data from PyDataProvider2. 
CACHE_PASS_IN_MEM = 1, // First pass will load data from PyDataProvider2, // then cache all data in memory. Load data from // memory in rest passes. @@ -87,8 +79,8 @@ struct SlotHeader { // Slot Header will parse from python object's slots field. SeqType seqType; }; -inline std::ostream& operator << (std::ostream& os, const SlotHeader& header) { - os <<"Dim = " << header.dim << " Type = " << header.slotType +inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) { + os << "Dim = " << header.dim << " Type = " << header.slotType << " SeqType = " << header.seqType; return os; } @@ -158,7 +150,6 @@ protected: SlotHeader* headerPtr_; }; - /** * Py Data Provider Cache Interface. */ @@ -209,17 +200,13 @@ public: PyDataProvider2(const DataConfig& config, const ModelConfig& modelConfig, bool useGpu) - :DataProvider(config, useGpu), - callingContextCreated_(2) { - if (PyArray_API == NULL) - import_array(); + : DataProvider(config, useGpu), callingContextCreated_(2) { + if (PyArray_API == NULL) import_array(); auto& args = config.load_data_args(); PyObjectPtr kwargs = PyObjectPtr(PyDict_New()); if (!args.empty()) { kwargs = callPythonFuncRetPyObj( - "paddle.trainer.PyDataProvider2", - "deserialize_args", - {args}); + "paddle.trainer.PyDataProvider2", "deserialize_args", {args}); } py::DictHelper kwargsDict(kwargs); @@ -245,40 +232,38 @@ public: * Dtor * @note will stop loading thread when destructing */ - virtual ~PyDataProvider2() { - resetImpl(false); - } + virtual ~PyDataProvider2() { resetImpl(false); } private: void createPyDataObj(const std::string& model, const std::string& className, const std::string& fileListName, - PyObjectPtr && kwargs) { - LOG(INFO) << "loading dataprovider " << model <<"::" << className; + PyObjectPtr&& kwargs // NOLINT + ) { + LOG(INFO) << "loading dataprovider " << model << "::" << className; PyObjectPtr module = py::import(model); PyObjectPtr moduleDict(PyModule_GetDict(module.get())); CHECK_PY(moduleDict) << "Invoke module.__dict__ error"; - PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), - className.c_str())); + PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str())); CHECK_PY(cls) << "load class " << className.c_str() << "error"; // If there are multiple python instance share same module, the PyObjectPtr // only for instance will make python reference-count error. // // So here, we increase reference count manually. - if (gModuleClsPtrs_.find((uintptr_t) module.get()) - != gModuleClsPtrs_.end()) { + if (gModuleClsPtrs_.find((uintptr_t)module.get()) != + gModuleClsPtrs_.end()) { // Multi instance use same module Py_XINCREF(module.get()); Py_XINCREF(moduleDict.get()); } else { - gModuleClsPtrs_.insert((uintptr_t) module.get()); + gModuleClsPtrs_.insert((uintptr_t)module.get()); } - if (gModuleClsPtrs_.find((uintptr_t) cls.get()) != gModuleClsPtrs_.end()) { + if (gModuleClsPtrs_.find((uintptr_t)cls.get()) != gModuleClsPtrs_.end()) { Py_XINCREF(cls.get()); } else { - gModuleClsPtrs_.insert((uintptr_t) cls.get()); + gModuleClsPtrs_.insert((uintptr_t)cls.get()); } PyObjectPtr fileListInPy = loadPyFileLists(fileListName); @@ -294,8 +279,8 @@ private: py::ObjectHelper self(this->instance_); bool ok; - this->skipShuffle_ = !self.getBoolAttr("should_shuffle", - &ok /*isBoolType*/); + this->skipShuffle_ = + !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/); if (!ok) { this->skipShuffle_ = testing; // shuffle when is training, skip shuffle // when is testing. 
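An aside on the manual Py_XINCREF calls in createPyDataObj above: PyDict_GetItemString returns a *borrowed* reference, so an owning smart pointer must take its own reference before adopting the object, or a second provider sharing the same module would trigger one Py_DECREF too many. A minimal sketch of the pattern; the PyObjectPtr alias here is a hypothetical stand-in for Paddle's own smart-pointer typedef, whose deleter is assumed to drop one reference:

```cpp
#include <Python.h>
#include <memory>

// Hypothetical stand-in for Paddle's PyObjectPtr: an owning pointer whose
// deleter drops one CPython reference on destruction.
struct PyDecRef {
  void operator()(PyObject* p) const { Py_XDECREF(p); }
};
using PyObjectPtr = std::unique_ptr<PyObject, PyDecRef>;

// Adopt a *borrowed* reference (e.g. from PyDict_GetItemString) safely:
// bump the refcount first so the eventual Py_XDECREF is balanced.
PyObjectPtr adoptBorrowed(PyObject* borrowed) {
  Py_XINCREF(borrowed);
  return PyObjectPtr(borrowed);
}
// Usage: PyObjectPtr cls = adoptBorrowed(PyDict_GetItemString(dict, "Cls"));
```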
@@ -335,12 +320,12 @@ private: PyObjectPtr headerPtrWrap(hdPtr); py::ObjectHelper hd(headerPtrWrap); header.dim = hd.getIntAttrWithError("dim"); - header.seqType = (SeqType) hd.getIntAttrWithError("seq_type"); - header.slotType = (SlotType) hd.getIntAttrWithError("type"); + header.seqType = (SeqType)hd.getIntAttrWithError("seq_type"); + header.slotType = (SlotType)hd.getIntAttrWithError("type"); } DBG << "Data header size " << headers_.size(); - for (auto & header : headers_) { + for (auto& header : headers_) { DBG << header; } cache_.reset(IPyDataProviderCache::create( @@ -351,8 +336,7 @@ private: loadFileList(fileListName, fileLists_); PyObject* lst = PyList_New(fileLists_.size()); for (size_t i = 0; i < fileLists_.size(); ++i) { - PyList_SET_ITEM(lst, i, - PyString_FromString(fileLists_[i].c_str())); + PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str())); } return PyObjectPtr(lst); } @@ -414,11 +398,12 @@ private: CHECK(ok) << "CalcBatchSize must return int or long"; } - if (this->loadThread_){ // wait poolActualSize < poolSize; + if (this->loadThread_) { // wait poolActualSize < poolSize; std::unique_lock l(mtx_); - pushCV_.wait(l, [this, additionalBatchSize] { - return this->poolActualSize_ < poolSize_; - }); + pushCV_.wait(l, + [this, additionalBatchSize] { + return this->poolActualSize_ < poolSize_; + }); } { @@ -433,26 +418,34 @@ private: inline void resetImpl(bool startNewThread) { DBG << "Reseting " << startNewThread; + exit_.store(true); if (loadThread_) { // is loading. - exit_.store(true); loadThread_->join(); loadThread_.reset(); } { PyGuard g; callingContexts_.clear(); + this->pullCV_.notify_one(); + } + + std::lock_guard guard(mutexForReset_); + { + PyGuard g; dataPool_.clear(); } poolActualSize_ = 0; - exit_ = false; + if (startNewThread && cache_->reset()) { DBG << "Start new thread."; loadThread_.reset(new std::thread([this] { + exit_ = false; loadThread(); })); callingContextCreated_.wait(); } DBG << "Reset done"; + exit_ = false; } private: @@ -465,6 +458,8 @@ private: std::condition_variable pullCV_; std::mutex mtx_; + std::mutex mutexForReset_; + ThreadBarrier callingContextCreated_; std::unique_ptr cache_; @@ -477,14 +472,14 @@ private: std::vector fileLists_; std::vector headers_; static PyObjectPtr zeroTuple_; - static std::unordered_set gModuleClsPtrs_; + static std::unordered_set gModuleClsPtrs_; class PositionRandom { public: - inline explicit PositionRandom(bool skipRand): - eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {} + inline explicit PositionRandom(bool skipRand) + : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {} - inline size_t operator() (size_t len) { + inline size_t operator()(size_t len) { if (!skipRand_) { if (!dist_ || dist_->b() != len - 1) { dist_.reset(new std::uniform_int_distribution(0, len - 1)); @@ -515,31 +510,31 @@ public: * Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random * select data from datapool. */ - void shuffle() { - } + void shuffle() {} /** * Not limited size. */ - int64_t getSize() { - return -1; - } + int64_t getSize() { return -1; } /** * Loading a batch of data. */ - int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) { + int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) { + std::lock_guard guard(mutexForReset_); REGISTER_TIMER("PyDP2.getNextBatchInternal") CHECK_GE(size_, 0); - size_t size = (size_t) size_; + size_t size = (size_t)size_; if (loadThread_) { // loading from thread should wait for data pool ready. 
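The reworked resetImpl above is more than a reformat: the exit_ flag is now raised before the join, the pull condition variable is notified so a reader blocked in getNextBatchInternal can wake up and observe the flag, and the new mutexForReset_ keeps reset and batch fetching from interleaving. A self-contained sketch of that shutdown ordering, with hypothetical names mirroring the diff:

```cpp
#include <atomic>
#include <condition_variable>
#include <mutex>
#include <thread>

std::atomic<bool> exitFlag{false};  // mirrors exit_
std::mutex mtx;                     // mirrors mtx_
std::condition_variable pullCV;     // mirrors pullCV_

void reader() {
  std::unique_lock<std::mutex> lock(mtx);
  // A predicate wait never sleeps through a flag that is already set.
  pullCV.wait(lock, [] { return exitFlag.load(); });
}

int main() {
  std::thread t(reader);
  exitFlag.store(true);  // 1. raise the flag *first*
  pullCV.notify_one();   // 2. wake any blocked reader
  t.join();              // 3. only then join
  return 0;
}
```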
// but, loading from cache, cache object should ensure // data pool ready. std::unique_lock l(mtx_); - pullCV_.wait(l, [this, &size] { - return this->poolActualSize_ >= std::max(size, this->minPoolSize_) - || callingContexts_.empty(); - }); + pullCV_.wait(l, + [this, &size] { + return this->poolActualSize_ >= + std::max(size, this->minPoolSize_) || + callingContexts_.empty(); + }); if (unittest::OnPoolFilled) { (*unittest::OnPoolFilled)(this->poolActualSize_); @@ -554,6 +549,10 @@ public: } else { // loading from cache. poolPtr = this->cache_->load(); } + if (exit_) { + // PyDataProvider is destructing. + return 0; + } CHECK(poolPtr != nullptr); std::deque& pool = *poolPtr; @@ -618,35 +617,35 @@ public: cpuBatch.setSize(bsize); auto& inArgs = cpuBatch.getStreams(); inArgs.resize(headers_.size()); - std::vector > scanners; + std::vector> scanners; scanners.reserve(headers_.size()); for (auto& header : headers_) { scanners.emplace_back(IFieldScanner::create(&header)); } DBG << "Scanner created."; - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->startPrepare(inArgs[i]); } - for (auto & d : data) { + for (auto& d : data) { py::SequenceHelper s(d); - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->prepare(inArgs[i], s[i]); } } - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->finishPrepare(inArgs[i]); } - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->startFill(inArgs[i]); } - for (auto & d : data) { + for (auto& d : data) { py::SequenceHelper s(d); for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->fill(inArgs[i], s[i]); } } - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->finishFill(inArgs[i]); } @@ -664,8 +663,8 @@ public: gpuArguments.resize(cpuArguments.size()); gpuBatch.setSize(size); for (size_t i = 0; i < headers_.size(); ++i) { - gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_, - HPPL_STREAM_1); + gpuArguments[i].resizeAndCopyFrom( + cpuArguments[i], useGpu_, HPPL_STREAM_1); } hl_stream_synchronize(HPPL_STREAM_1); } else { @@ -675,31 +674,28 @@ public: } }; -std::unordered_set PyDataProvider2::gModuleClsPtrs_; +std::unordered_set PyDataProvider2::gModuleClsPtrs_; PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0)); REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2); - /** * Scanner for dense slot. */ -class DenseScanner: public IFieldScanner { +class DenseScanner : public IFieldScanner { public: - explicit DenseScanner(SlotHeader* ptr):IFieldScanner(ptr), height_(0) {} + explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {} /** * Prepare. * @param argument target argument * @param obj each timestep of a sample. 
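The scanner loops in getNextBatchInternal above follow a two-pass protocol: a prepare pass walks the batch once so each field can size its destination (startPrepare/prepare/finishPrepare), then a fill pass walks it again to copy values (startFill/fill/finishFill), so no buffer ever reallocates mid-copy. A toy scanner under the same contract, independent of the Paddle types and offered only as a sketch:

```cpp
#include <cstddef>
#include <vector>

// Toy analogue of IFieldScanner's two-pass contract: size first, fill second.
class CountingScanner {
 public:
  void startPrepare() { count_ = 0; }
  void prepare(const std::vector<int>& row) { count_ += row.size(); }
  void finishPrepare(std::vector<int>& out) { out.resize(count_); }

  void startFill() { pos_ = 0; }
  void fill(std::vector<int>& out, const std::vector<int>& row) {
    for (int v : row) out[pos_++] = v;  // no reallocation: sized in pass one
  }

 private:
  std::size_t count_ = 0;
  std::size_t pos_ = 0;
};

int main() {
  std::vector<std::vector<int>> batch = {{1, 2}, {3}, {4, 5, 6}};
  std::vector<int> flat;
  CountingScanner s;
  s.startPrepare();
  for (auto& r : batch) s.prepare(r);
  s.finishPrepare(flat);
  s.startFill();
  for (auto& r : batch) s.fill(flat, r);
  return flat.size() == 6 ? 0 : 1;
}
```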
*/ - virtual void prepare(Argument &argument, PyObject *obj) { - ++height_; - } + virtual void prepare(Argument& argument, PyObject* obj) { ++height_; } - virtual void finishPrepare(Argument &argument) { - Matrix::resizeOrCreate(argument.value, height_, headerPtr_->dim, - false, false); + virtual void finishPrepare(Argument& argument) { + Matrix::resizeOrCreate( + argument.value, height_, headerPtr_->dim, false, false); height_ = 0; } @@ -708,24 +704,23 @@ public: * @param argument * @param obj */ - virtual void fill(Argument &argument, PyObject *obj) { + virtual void fill(Argument& argument, PyObject* obj) { real* dat = argument.value->getData() + height_ * headerPtr_->dim; if (PyArray_Check(obj)) { - auto dtype = PyArray_DTYPE((PyArrayObject*)obj); - if (dtype->type == 'f' && dtype->elsize == sizeof(real)) { - real * data = (real*)PyArray_DATA((PyArrayObject*)obj); - auto sz = PyArray_SIZE((PyArrayObject*)obj); - std::copy(data, data + sz, dat); - } else { - LOG(FATAL) << "You should yield float" << sizeof(real) * 8 - << " array"; - } - } else { - py::SequenceHelper s(obj); - // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. - for (size_t i=0; i < headerPtr_->dim; ++i) { - dat[i] = (real) s.getDouble(i); - } + auto dtype = PyArray_DTYPE((PyArrayObject*)obj); + if (dtype->type == 'f' && dtype->elsize == sizeof(real)) { + real* data = (real*)PyArray_DATA((PyArrayObject*)obj); + auto sz = PyArray_SIZE((PyArrayObject*)obj); + std::copy(data, data + sz, dat); + } else { + LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array"; + } + } else { + py::SequenceHelper s(obj); + // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. + for (size_t i = 0; i < headerPtr_->dim; ++i) { + dat[i] = (real)s.getDouble(i); + } } ++height_; } @@ -737,20 +732,18 @@ private: /** * Scanner for index slot */ -class IndexScanner: public IFieldScanner { +class IndexScanner : public IFieldScanner { public: - explicit IndexScanner(SlotHeader* ptr):IFieldScanner(ptr), cnt_(0) {} + explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {} /** * Prepare memory space. * * @note obj is a single timestep of sample */ - virtual void prepare(Argument &argument, PyObject *obj) { - ++cnt_; - } + virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; } - virtual void finishPrepare(Argument &argument) { + virtual void finishPrepare(Argument& argument) { IVector::resizeOrCreate(argument.ids, cnt_, false); cnt_ = 0; } @@ -758,9 +751,9 @@ public: /** * Fill one index to argument. */ - virtual void fill(Argument &argument, PyObject *obj) { + virtual void fill(Argument& argument, PyObject* obj) { bool ok; - argument.ids->getData()[cnt_++] = py::castInt(obj, &ok); + argument.ids->getData()[cnt_++] = py::castInt(obj, &ok); CHECK(ok) << "Cannot cast int " << py::repr(obj); } @@ -770,27 +763,25 @@ private: class SparseNonValueScanner : public IFieldScanner { public: - explicit SparseNonValueScanner(SlotHeader* ptr): IFieldScanner(ptr), - nnz_(0), - height_(0) {} + explicit SparseNonValueScanner(SlotHeader* ptr) + : IFieldScanner(ptr), nnz_(0), height_(0) {} /** * Prepare memory space * @note obj is a timestep of one sample. 
*/ - virtual void prepare(Argument &argument, PyObject *obj) { + virtual void prepare(Argument& argument, PyObject* obj) { ++height_; nnz_ += py::SequenceHelper(obj).size(); } - virtual void finishPrepare(Argument &argument) { - Matrix::resizeOrCreateSparseMatrix(argument.value, height_, - headerPtr_->dim, - nnz_, NO_VALUE); + virtual void finishPrepare(Argument& argument) { + Matrix::resizeOrCreateSparseMatrix( + argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE); } - virtual void startFill(Argument & argument) { - auto smat = (CpuSparseMatrix*) (argument.value.get()); + virtual void startFill(Argument& argument) { + auto smat = (CpuSparseMatrix*)(argument.value.get()); smat->getRows()[0] = 0; nnz_ = 0; height_ = 1; @@ -803,14 +794,14 @@ public: virtual void fill(Argument& argument, PyObject* obj) { py::SequenceHelper s(obj); auto sz = s.size(); - auto smat = (CpuSparseMatrix*) (argument.value.get()); + auto smat = (CpuSparseMatrix*)(argument.value.get()); int* row = smat->getRows(); int* col = smat->getCols(); real* dat = smat->getData(); - row[height_] = row[height_-1] + (int)sz; + row[height_] = row[height_ - 1] + (int)sz; for (decltype(sz) i = 0; i < sz; ++i) { - setData(col+nnz_, dat+nnz_, s[i]); + setData(col + nnz_, dat + nnz_, s[i]); ++nnz_; } ++height_; @@ -824,7 +815,7 @@ protected: * @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong. * For sparse_value is a Tuple (int, float). */ - virtual void setData(int* col, real * dat, PyObject* obj) { + virtual void setData(int* col, real* dat, PyObject* obj) { bool ok; *col = py::castInt(obj, &ok); CHECK(ok); @@ -836,26 +827,25 @@ protected: class SparseValueScanner : public SparseNonValueScanner { public: - explicit SparseValueScanner(SlotHeader *ptr) : SparseNonValueScanner(ptr) {} + explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {} - virtual void finishPrepare(Argument &argument) { - Matrix::resizeOrCreateSparseMatrix(argument.value, height_, - headerPtr_->dim, - nnz_, FLOAT_VALUE); + virtual void finishPrepare(Argument& argument) { + Matrix::resizeOrCreateSparseMatrix( + argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE); } protected: - virtual void setData(int *col, real *dat, PyObject *obj) { + virtual void setData(int* col, real* dat, PyObject* obj) { py::SequenceHelper s(obj); SparseNonValueScanner::setData(col, dat, s[0]); - *dat = (real) s.getDouble(1); + *dat = (real)s.getDouble(1); } }; /** * Sequence Scanner. Scanner for sequence or sub-sequence. */ -class SequenceScanner: public IFieldScanner { +class SequenceScanner : public IFieldScanner { public: /** * Ctor @@ -864,15 +854,18 @@ public: * return a sequence start position or a sub-sequence * start position. */ - SequenceScanner(std::unique_ptr&& innerScanner, - const std::function& getSeqStartPos) - : IFieldScanner(nullptr), inner_(std::move(innerScanner)), - cnt_(0), getSeqStartPos_(getSeqStartPos) {} + SequenceScanner( + std::unique_ptr&& innerScanner, + const std::function& getSeqStartPos) + : IFieldScanner(nullptr), + inner_(std::move(innerScanner)), + cnt_(0), + getSeqStartPos_(getSeqStartPos) {} /** * Start prepare. Invoke inner->startPrepare too. */ - virtual void startPrepare(Argument &argument) { + virtual void startPrepare(Argument& argument) { inner_->startPrepare(argument); } @@ -880,10 +873,10 @@ public: * Prepare. obj is a list or tuple. it will invoke inner_->prepare for each * element of sequence obj. 
*/ - virtual void prepare(Argument &argument, PyObject *obj) { + virtual void prepare(Argument& argument, PyObject* obj) { py::SequenceHelper s(obj); ++cnt_; - for (size_t i=0; i < s.size(); ++i) { + for (size_t i = 0; i < s.size(); ++i) { inner_->prepare(argument, s[i]); } } @@ -891,7 +884,7 @@ public: /** * Finish prepare. invoke inner_->finishPrepare too. */ - virtual void finishPrepare(Argument &argument) { + virtual void finishPrepare(Argument& argument) { ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false); inner_->finishPrepare(argument); } @@ -899,7 +892,7 @@ public: /** * Start fill. invoke inner->startFill too. */ - virtual void startFill(Argument &argument) { + virtual void startFill(Argument& argument) { getSeqStartPos_(argument)->getMutableData(false)[0] = 0; cnt_ = 1; inner_->startFill(argument); @@ -910,13 +903,13 @@ public: * sequence obj. And set seqStartPos at same time. The seqStartPos will be * calculated by getSeqStartPos callback passed in ctor. */ - virtual void fill(Argument &argument, PyObject *obj) { + virtual void fill(Argument& argument, PyObject* obj) { getSeqStartPos_(argument)->getMutableData(false)[cnt_] = - getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] + - (int)getSize(obj); + getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] + + (int)getSize(obj); py::SequenceHelper s(obj); ++cnt_; - for (size_t i=0; i < s.size(); ++i) { + for (size_t i = 0; i < s.size(); ++i) { inner_->fill(argument, s[i]); } } @@ -924,9 +917,7 @@ public: /** * Finish fill. will invoke inner->finishFill too. */ - virtual void finishFill(Argument &argument) { - inner_->finishFill(argument); - } + virtual void finishFill(Argument& argument) { inner_->finishFill(argument); } protected: size_t getSize(PyObject* obj) { @@ -934,7 +925,7 @@ protected: auto sc = dynamic_cast(inner_.get()); if (sc) { size_t sum = 0; - for (size_t i=0; i < s.size(); ++i) { + for (size_t i = 0; i < s.size(); ++i) { sum += sc->getSize(s[i]); } return sum; @@ -949,8 +940,7 @@ private: std::function getSeqStartPos_; }; - -IFieldScanner* IFieldScanner::create(SlotHeader *header) { +IFieldScanner* IFieldScanner::create(SlotHeader* header) { IFieldScanner* retv = nullptr; switch (header->slotType) { case ST_DENSE: @@ -974,15 +964,15 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) { break; case SQT_SUBSEQ: retv = new SequenceScanner(std::unique_ptr(retv), - [](Argument& arg) -> ICpuGpuVectorPtr& { - return arg.subSequenceStartPositions; - }); - // fall through, not break; + [](Argument& arg) -> ICpuGpuVectorPtr& { + return arg.subSequenceStartPositions; + }); + // fall through, not break; case SQT_SEQ: retv = new SequenceScanner(std::unique_ptr(retv), - [](Argument& arg) -> ICpuGpuVectorPtr& { - return arg.sequenceStartPositions; - }); + [](Argument& arg) -> ICpuGpuVectorPtr& { + return arg.sequenceStartPositions; + }); break; default: LOG(FATAL) << "Not implemented"; @@ -995,19 +985,13 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) { * No Cache Strategy. Will destruct old data immediately and load data from * python every pass. 
*/ -class NoCacheStrategy: public IPyDataProviderCache { +class NoCacheStrategy : public IPyDataProviderCache { public: - virtual bool reset() { - return true; - } + virtual bool reset() { return true; } - virtual void drop(std::deque *data) { - data->clear(); - } + virtual void drop(std::deque* data) { data->clear(); } - virtual std::deque* load() { - return nullptr; - } + virtual std::deque* load() { return nullptr; } }; /** @@ -1018,9 +1002,9 @@ public: */ class CacheOnePassInMemory : public IPyDataProviderCache { public: - CacheOnePassInMemory() : objPool_(new std::deque()), - droppedPool_(new std::deque()) - {} + CacheOnePassInMemory() + : objPool_(new std::deque()), + droppedPool_(new std::deque()) {} virtual bool reset() { if (objPool_->empty() && droppedPool_->empty()) { @@ -1033,25 +1017,22 @@ public: } } - virtual void drop(std::deque *data) { + virtual void drop(std::deque* data) { size_t orgSize = droppedPool_->size(); droppedPool_->resize(orgSize + data->size()); - for (size_t i=0; i < data->size(); ++i) { + for (size_t i = 0; i < data->size(); ++i) { std::swap((*droppedPool_)[orgSize + i], (*data)[i]); } data->clear(); } - virtual std::deque* load() { - return objPool_.get(); - } + virtual std::deque* load() { return objPool_.get(); } private: - std::unique_ptr > objPool_; - std::unique_ptr > droppedPool_; + std::unique_ptr> objPool_; + std::unique_ptr> droppedPool_; }; - IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) { switch (ct) { case NO_CACHE: diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp index e397c71c877dce8c34aefac12481373a037510f6..8f7d2fb80e9b6f2b4c83d90a04dab5219435d344 100644 --- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp +++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
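CacheOnePassInMemory above keeps two pools: drop() swaps consumed objects into droppedPool_ instead of destroying them, and reset() lets every pass after the first replay from memory with no Python calls. A stripped-down analogue; note the pool swap inside reset() sits in context lines outside this hunk, so that part is an assumption based on the surrounding logic:

```cpp
#include <cstddef>
#include <deque>
#include <memory>
#include <utility>

// Minimal analogue of the two-pool, one-pass cache (element type simplified).
class OnePassCache {
 public:
  OnePassCache()
      : pool_(new std::deque<int>()), dropped_(new std::deque<int>()) {}

  // true  -> caller must (re)load from the original source (first pass);
  // false -> pool_ was refilled from dropped_ and can be replayed.
  bool reset() {
    if (pool_->empty() && dropped_->empty()) return true;
    std::swap(pool_, dropped_);
    return false;
  }

  // Consumed items are parked, not destroyed (swap avoids deep copies).
  void drop(std::deque<int>* data) {
    std::size_t orig = dropped_->size();
    dropped_->resize(orig + data->size());
    for (std::size_t i = 0; i < data->size(); ++i)
      std::swap((*dropped_)[orig + i], (*data)[i]);
    data->clear();
  }

  std::deque<int>* load() { return pool_.get(); }

 private:
  std::unique_ptr<std::deque<int>> pool_;
  std::unique_ptr<std::deque<int>> dropped_;
};

int main() {
  OnePassCache cache;
  bool firstPass = cache.reset();  // true: nothing cached yet
  std::deque<int> batch = {1, 2, 3};
  cache.drop(&batch);              // consumed, parked for replay
  bool secondPass = cache.reset(); // false: replay from memory
  return (firstPass && !secondPass && cache.load()->size() == 3) ? 0 : 1;
}
```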
*/ - #include "Evaluator.h" #include "paddle/gserver/gradientmachines/NeuralNetwork.h" @@ -33,7 +32,8 @@ private: str.clear(); int prevLabel = -1; for (std::vector::const_iterator label = path.begin(); - label != path.end(); label++) { + label != path.end(); + label++) { if (*label != blank_ && (str.empty() || *label != str.back() || prevLabel == blank_)) { str.push_back(*label); @@ -58,8 +58,11 @@ private: /* "sp, dp, ip" is the weighting parameter of "substitution, deletion, * insertion" * in edit-distance error */ - real stringAlignment(std::vector& gtStr, std::vector& recogStr, - bool backtrace = true, real sp = 1.0, real dp = 1.0, + real stringAlignment(std::vector& gtStr, + std::vector& recogStr, + bool backtrace = true, + real sp = 1.0, + real dp = 1.0, real ip = 1.0) { std::vector> matrix; int substitutions, deletions, insertions; @@ -165,8 +168,8 @@ private: return distance / maxLen; } - real editDistance(real* output, int numTimes, int numClasses, int* labels, - int labelsLen) { + real editDistance( + real* output, int numTimes, int numClasses, int* labels, int labelsLen) { numTimes_ = numTimes; numClasses_ = numClasses; blank_ = numClasses_ - 1; @@ -207,7 +210,8 @@ public: real err = 0; err = editDistance( output.value->getData() + output.value->getWidth() * outputStarts[i], - outputStarts[i+1] - outputStarts[i], output.value->getWidth(), + outputStarts[i + 1] - outputStarts[i], + output.value->getWidth(), label.ids->getData() + labelStarts[i], labelStarts[i + 1] - labelStarts[i]); @@ -240,7 +244,7 @@ public: seqClassficationError_ = 0; } - virtual void printStats(std::ostream& os) { + virtual void printStats(std::ostream& os) const { os << config_.name() << "=" << (numSequences_ ? totalScore_ / numSequences_ : 0); os << " deletions error" diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp index 22579891f397afe58d5b4285f0aece944d8b753c..923e77fc9df919794902daed6113792e7f89a552 100644 --- a/paddle/gserver/evaluators/ChunkEvaluator.cpp +++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp @@ -114,7 +114,7 @@ public: numCorrect_ = 0; } - virtual void printStats(std::ostream& os) { + virtual void printStats(std::ostream& os) const { double precision = (double)numCorrect_ / numOutputSegments_; double recall = (double)numCorrect_ / numLabelSegments_; double f1 = @@ -144,7 +144,8 @@ public: size_t numSequences = sequenceStartPositions->getSize() - 1; const int* starts = sequenceStartPositions->getData(); for (size_t i = 0; i < numSequences; ++i) { - eval1(output->getData() + starts[i], label->getData() + starts[i], + eval1(output->getData() + starts[i], + label->getData() + starts[i], starts[i + 1] - starts[i]); } return 0; diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp index 7bdcdaae53c638c93e567a2943586dcc27d75ded..f5df2b18dedde9022d04b034912e59be00f15413 100644 --- a/paddle/gserver/evaluators/Evaluator.cpp +++ b/paddle/gserver/evaluators/Evaluator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Stat.h" #include "paddle/gserver/evaluators/Evaluator.h" @@ -74,17 +73,19 @@ public: } const MatrixPtr errorMat = Matrix::create(output->getHeight(), - 1, /* trans= */ false, useGpu(arguments[0].deviceId)); + 1, + /* trans= */ false, + useGpu(arguments[0].deviceId)); errorMat->zeroMem(); if (label != nullptr) { errorMat->classificationError(output, label); } else if (dynamic_cast(multiBinaryLabel.get()) || dynamic_cast(multiBinaryLabel.get())) { - errorMat->classificationErrorMulti(*output, *multiBinaryLabel, - config_.classification_threshold()); + errorMat->classificationErrorMulti( + *output, *multiBinaryLabel, config_.classification_threshold()); } else { - errorMat->binaryClassificationError(0, *output, *multiBinaryLabel, - config_.classification_threshold()); + errorMat->binaryClassificationError( + 0, *output, *multiBinaryLabel, config_.classification_threshold()); } if (supportWeight) { @@ -126,8 +127,8 @@ public: int errCounter = 0; CpuVector errorVec(0, nullptr); for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) { - errorVec.subVecFrom(errorMat->getData(), starts[i], - starts[i + 1] - starts[i]); + errorVec.subVecFrom( + errorMat->getData(), starts[i], starts[i + 1] - starts[i]); if (errorVec.getSum() > 0) { errCounter += 1; } @@ -315,7 +316,7 @@ public: return 0; } - virtual void printStats(std::ostream& os) { + virtual void printStats(std::ostream& os) const { CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0) << "column index [" << colIdx_ << "] out of range [-" << colNum_ << ", " << colNum_ << ")"; @@ -330,8 +331,8 @@ public: } void distributeEval(ParameterClient2* client) { - client->reduce(sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, - 0); + client->reduce( + sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0); client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0); } @@ -379,8 +380,11 @@ real AucEvaluator::evalImp(std::vector& arguments) { } if (dynamic_cast(output.get())) { - Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, - /* trans=*/false, /* useGpu=*/false); + Matrix::resizeOrCreate(cpuOutput_, + insNum, + outputDim, + /* trans=*/false, + /* useGpu=*/false); cpuOutput_->copyFrom(*output); IVector::resizeOrCreate(cpuLabel_, insNum, false); cpuLabel_->copyFrom(*label); @@ -421,7 +425,7 @@ void AucEvaluator::distributeEval(ParameterClient2* client) { client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0); } -double AucEvaluator::calcAuc() { +double AucEvaluator::calcAuc() const { double totPos = 0.0; double totNeg = 0.0; double totPosPrev = 0.0; @@ -479,19 +483,24 @@ real RankAucEvaluator::evalImp(std::vector& arguments) { for (size_t i = 0; i < batchNum; ++i) { int beginPos = startPosData[i]; int endPos = startPosData[i + 1]; - batchAuc += calcRankAuc(outputData + beginPos, clickData + beginPos, - pvData + beginPos, endPos - beginPos); + batchAuc += calcRankAuc(outputData + beginPos, + clickData + beginPos, + pvData + beginPos, + endPos - beginPos); } return batchAuc; } -double RankAucEvaluator::calcRankAuc(real* outputData, real* clickData, - real* pvData, size_t size) { +double RankAucEvaluator::calcRankAuc(real* outputData, + real* clickData, + real* pvData, + size_t size) { outputPair_.clear(); for (size_t i = 0; i < size; ++i) { outputPair_.push_back(std::make_pair(outputData[i], i)); } - std::sort(outputPair_.begin(), outputPair_.end(), + std::sort(outputPair_.begin(), + outputPair_.end(), [](const std::pair& a, const std::pair& b) { 
return a.first > b.first; }); @@ -584,7 +593,7 @@ real PrecisionRecallEvaluator::evalImp(std::vector& arguments) { return 0; } -void PrecisionRecallEvaluator::printStats(std::ostream& os) { +void PrecisionRecallEvaluator::printStats(std::ostream& os) const { int label = config_.positive_label(); if (label != -1) { CHECK(label >= 0 && label < (int)statsInfo_.size()) @@ -790,8 +799,12 @@ real PnpairEvaluator::evalImp(std::vector& arguments) { return 0; } -void PnpairEvaluator::stat(size_t start, size_t end, PredictionResult* answers, - double& pos, double& neg, double& spe) { +void PnpairEvaluator::stat(size_t start, + size_t end, + PredictionResult* answers, + double& pos, + double& neg, + double& spe) { for (size_t i = start; i < end; i++) { for (size_t j = i + 1; j < end; j++) { CHECK_EQ(answers[i].queryid, answers[j].queryid); @@ -817,7 +830,8 @@ void PnpairEvaluator::stat(size_t start, size_t end, PredictionResult* answers, } void PnpairEvaluator::calc(std::vector& predictArray) { - std::sort(predictArray.begin(), predictArray.end(), + std::sort(predictArray.begin(), + predictArray.end(), [](const PredictionResult& x, const PredictionResult& y) { return x.queryid < y.queryid; }); @@ -828,11 +842,16 @@ void PnpairEvaluator::calc(std::vector& predictArray) { auto start = predictArray.begin(); while (start != predictArray.end()) { auto end = std::find_if( - start + 1, predictArray.end(), + start + 1, + predictArray.end(), [=](const PredictionResult& x) { return x.queryid != start->queryid; }); CHECK(end != start); - stat(start - predictArray.begin(), end - predictArray.begin(), - predictArray.data(), pos, neg, special); + stat(start - predictArray.begin(), + end - predictArray.begin(), + predictArray.data(), + pos, + neg, + special); start = end; } @@ -1120,8 +1139,8 @@ public: auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) { if (src && src->useGpu()) { - Matrix::resizeOrCreate(dest, src->getHeight(), src->getWidth(), false, - false); + Matrix::resizeOrCreate( + dest, src->getHeight(), src->getWidth(), false, false); dest->copyFrom(*src); } else { dest = src; diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h index b79a539384e9f7620c118d14b915c3f76a9a43af..732abb6079523b1cce8d0727c94ef65581842b4c 100644 --- a/paddle/gserver/evaluators/Evaluator.h +++ b/paddle/gserver/evaluators/Evaluator.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/pserver/ParameterClient2.h" @@ -99,19 +98,19 @@ public: * @brief print the statistics of evaluate result * @note finish() should be called before printStats */ - virtual void printStats(std::ostream& os) { + virtual void printStats(std::ostream& os) const { os << config_.name() << "=" << (numSamples_ ? 
totalScore_ / numSamples_ : 0); } friend std::ostream& operator<<(std::ostream& os, - Evaluator& evaluator) { + const Evaluator& evaluator) { evaluator.printStats(os); return os; } - friend std::ostream&& operator<<(std::ostream&& os, // NOLINT - Evaluator& evaluator) { + friend std::ostream&& operator<<(std::ostream&& os, // NOLINT + const Evaluator& evaluator) { evaluator.printStats(os); return std::move(os); } @@ -135,7 +134,7 @@ public: return -1; } virtual void finish() {} - virtual void printStats(std::ostream&) {} + virtual void printStats(std::ostream&) const {} }; /** * @brief evaluate AUC using colIdx-th column as prediction. @@ -165,7 +164,7 @@ public: virtual real evalImp(std::vector& arguments); - virtual void printStats(std::ostream& os) { + virtual void printStats(std::ostream& os) const { os << config_.name() << "=" << calcAuc(); } @@ -184,12 +183,14 @@ private: AucEvaluator() {} - inline static double trapezoidArea(double X1, double X2, double Y1, + inline static double trapezoidArea(double X1, + double X2, + double Y1, double Y2) { return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; } - double calcAuc(); + double calcAuc() const; }; /** @@ -218,7 +219,9 @@ private: MatrixPtr pv_; std::vector> outputPair_; - double calcRankAuc(real* outputData, real* clickData, real* pvData, + double calcRankAuc(real* outputData, + real* clickData, + real* pvData, size_t size); }; /** @@ -244,7 +247,7 @@ public: virtual real evalImp(std::vector& arguments); - virtual void printStats(std::ostream& os); + virtual void printStats(std::ostream& os) const; virtual void distributeEval(ParameterClient2* client); @@ -269,10 +272,12 @@ private: IVectorPtr cpuLabel_; MatrixPtr cpuWeight_; - void calcStatsInfo(const MatrixPtr& output, const IVectorPtr& label, + void calcStatsInfo(const MatrixPtr& output, + const IVectorPtr& label, const MatrixPtr& weight); - void calcStatsInfoMulti(const MatrixPtr& output, const MatrixPtr& label, + void calcStatsInfoMulti(const MatrixPtr& output, + const MatrixPtr& label, const MatrixPtr& weight); inline static double calcPrecision(double TP, double FP) { @@ -333,13 +338,17 @@ public: } } - void stat(size_t start, size_t end, PredictionResult* answers, double& pos, - double& neg, double& spe); + void stat(size_t start, + size_t end, + PredictionResult* answers, + double& pos, + double& neg, + double& spe); void calc(std::vector& predictArray); virtual void finish() { calc(predictArray_); } - virtual void printStats(std::ostream& os) { + virtual void printStats(std::ostream& os) const { os << " pos/neg" << "=" << pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]); } diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp index b20525f66431e16544ce1e05a617286bd5975cfc..3761fda5f370e3b1aef0e394286c49d8ec831694 100644 --- a/paddle/gserver/gradientmachines/GradientMachine.cpp +++ b/paddle/gserver/gradientmachines/GradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "GradientMachine.h" #include "paddle/utils/Logging.h" @@ -29,7 +28,8 @@ limitations under the License. 
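The sweep of printStats signatures to const above has a purpose: once operator<< takes const Evaluator&, every override it dispatches to must be a const member, and a non-const version would silently hide rather than override the virtual. A compact illustration with hypothetical class names (the diff's own code predates the override keyword, which is used here only to make the point at compile time):

```cpp
#include <iostream>

class Stats {
 public:
  virtual ~Stats() {}
  // Must be const so it can be invoked through a const reference.
  virtual void printStats(std::ostream& os) const { os << "base"; }
};

class AucStats : public Stats {
 public:
  // 'override' would reject a non-const signature here at compile time.
  void printStats(std::ostream& os) const override { os << "auc=0.5"; }
};

std::ostream& operator<<(std::ostream& os, const Stats& s) {
  s.printStats(os);
  return os;
}

int main() {
  AucStats a;
  std::cout << a << "\n";  // prints "auc=0.5" via virtual dispatch
  return 0;
}
```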
*/ namespace paddle { GradientMachine* GradientMachine::create( - const ModelConfig& config, int mode, + const ModelConfig& config, + int mode, const std::vector& parameterTypes) { if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) { return gm; @@ -49,10 +49,11 @@ GradientMachine* GradientMachine::create( /* single thread calculate */ nn = NeuralNetwork::create(config); } - ParamInitCallback testParamInitCb = - [](int paramId, Parameter* para) { para->enableType(PARAMETER_VALUE); }; - nn->init(config, mode == kTesting ? testParamInitCb : nullptr, - parameterTypes); + ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) { + para->enableType(PARAMETER_VALUE); + }; + nn->init( + config, mode == kTesting ? testParamInitCb : nullptr, parameterTypes); return nn; } LOG(FATAL) << "Unknown model type: " << config.type(); diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h index 986a1ee71dbb00781c6af93a06f3e16d6639c307..27cdf7f7890673673d5be63fecdd61d5d2a11447 100644 --- a/paddle/gserver/gradientmachines/GradientMachine.h +++ b/paddle/gserver/gradientmachines/GradientMachine.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -84,10 +83,11 @@ public: * Parameter will have parameterTypes */ static GradientMachine* create( - const ModelConfig& config, int mode = kNormal, + const ModelConfig& config, + int mode = kNormal, const std::vector& parameterTypes = - std::vector{PARAMETER_VALUE, PARAMETER_GRADIENT, - PARAMETER_MOMENTUM}); + std::vector{ + PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}); /** * Create a gradient machine from the merged model file. @@ -137,7 +137,8 @@ public: * @note: if passType==PASS_TEST, then backward() should not be called */ virtual void forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType) = 0; + std::vector* outArgs, + PassType passType) = 0; /** * @brief Backward propagation. @@ -211,7 +212,7 @@ public: * @note This function will only been implemented and used in a * multithreaded environment. */ - virtual void start(const TrainerConfig& config, + virtual void start(const TrainerConfig& config, DataProviderPtr dataProvider) { (void)config; (void)dataProvider; @@ -246,7 +247,6 @@ public: */ virtual void restart() {} - /// Set the gradient of the output from outside. virtual void setOutputGrad(const std::vector& args) { LOG(FATAL) << "Not implemented!"; diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.h b/paddle/gserver/gradientmachines/GradientMachineMode.h index 9aff9c616cf514d53b5017dfdb6250a7cbce0198..f2f55a70671858145572e4a5c0f1c4b609145f98 100644 --- a/paddle/gserver/gradientmachines/GradientMachineMode.h +++ b/paddle/gserver/gradientmachines/GradientMachineMode.h @@ -23,10 +23,10 @@ public: virtual ~IGradientMachineMode() {} public: // interfaces - /** - * @brief create current mode's gradient machine by model config. - * @param config model config - */ + /** + * @brief create current mode's gradient machine by model config. + * @param config model config + */ virtual GradientMachine* create(const ModelConfig& config) = 0; /** @@ -37,11 +37,10 @@ public: // interfaces * @param isGpu is using gpu. * @return true if mode should be this mode. 
*/ - virtual bool shouldBeMe( - const std::string& algo, - size_t trainerCount, - bool isLocal, - bool isGpu) const = 0; + virtual bool shouldBeMe(const std::string& algo, + size_t trainerCount, + bool isLocal, + bool isGpu) const = 0; /** * @brief Is data must be in cpu even if using gpu mode. @@ -57,13 +56,13 @@ public: // interfaces virtual bool needTrainWholeDataInOneBatch() const = 0; public: // static methods. - /** - * @brief register a custom gradient machine mode. - * @note For user to register a custom gradient machine mode, id should >= - * kCustom. - * @param mode mode id. - * @param ptr mode description object. - */ + /** + * @brief register a custom gradient machine mode. + * @note For user to register a custom gradient machine mode, id should >= + * kCustom. + * @param mode mode id. + * @param ptr mode description object. + */ static void regGradientMachineMode( int32_t mode, std::unique_ptr&& ptr) { modes_.insert(std::make_pair(mode, std::move(ptr))); @@ -102,9 +101,11 @@ public: // static methods. * @param [in] isGpu using gpu or not. * @return true if there is a custom mode fit these conditions. */ - static bool tryGetMode(int* mode, const std::string& algo, + static bool tryGetMode(int* mode, + const std::string& algo, int32_t trainerCount, - bool isLocal, bool isGpu) { + bool isLocal, + bool isGpu) { for (auto it = modes_.begin(); it != modes_.end(); ++it) { if (it->second->shouldBeMe(algo, trainerCount, isLocal, isGpu)) { *mode = it->first; @@ -130,8 +131,8 @@ public: // static methods. * @brief try to create gradient machine by mode & config. * @return nullptr if we cannot create a gradient machine by such mode. */ - static GradientMachine* tryCreateGradientMachine( - int32_t mode, const ModelConfig& config) { + static GradientMachine* tryCreateGradientMachine(int32_t mode, + const ModelConfig& config) { auto m = IGradientMachineMode::mode(mode); if (m) { return m->create(config); @@ -142,7 +143,7 @@ public: // static methods. private: static std::unordered_map> - modes_; + modes_; }; } // namespace paddle diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp index 0ded30eeb44e95b50ff91722ef96a9f24c81c16d..148451f18dceb0c470dadab01ff91915f994c68f 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "MultiGradientMachine.h" #include "paddle/utils/Logging.h" @@ -22,7 +21,8 @@ limitations under the License. 
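IGradientMachineMode above is a classic static registry: custom modes self-register into a map keyed by mode id, and creation consults the registry before falling back to the built-in machines. The skeleton of that pattern, reduced to hypothetical types:

```cpp
#include <memory>
#include <string>
#include <unordered_map>

struct Machine {
  virtual ~Machine() {}
};

struct IMode {
  virtual ~IMode() {}
  virtual Machine* create(const std::string& config) = 0;
};

class Registry {
 public:
  static void reg(int id, std::unique_ptr<IMode> mode) {
    modes().emplace(id, std::move(mode));
  }

  // nullptr when no custom mode claims this id; the caller falls back.
  static Machine* tryCreate(int id, const std::string& config) {
    auto it = modes().find(id);
    return it == modes().end() ? nullptr : it->second->create(config);
  }

 private:
  static std::unordered_map<int, std::unique_ptr<IMode>>& modes() {
    static std::unordered_map<int, std::unique_ptr<IMode>> m;
    return m;
  }
};

struct EchoMode : IMode {
  Machine* create(const std::string&) override { return new Machine(); }
};

int main() {
  Registry::reg(42, std::unique_ptr<IMode>(new EchoMode()));
  std::unique_ptr<Machine> m(Registry::tryCreate(42, "cfg"));
  return m ? 0 : 1;
}
```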
*/ #include "NeuralNetwork.h" #include "ParallelNeuralNetwork.h" -P_DEFINE_bool(allow_only_one_model_on_one_gpu, true, +P_DEFINE_bool(allow_only_one_model_on_one_gpu, + true, "If true, do not allow multiple models on one GPU device"); #ifdef PADDLE_METRIC_LEARNING P_DECLARE_bool(external); @@ -32,15 +32,15 @@ namespace paddle { // get types of the parameters which need to be merged after backward() static void fillMergeTypes(PassType passType, - std::vector* mergeTypes) { + std::vector* mergeTypes) { mergeTypes->clear(); if (passType != PASS_TEST) { mergeTypes->push_back(PARAMETER_GRADIENT); } } -MultiGradientMachine::MultiGradientMachine( - const ModelConfig& config, bool useGpu) +MultiGradientMachine::MultiGradientMachine(const ModelConfig& config, + bool useGpu) : useGpu_(useGpu), trainerBarrier_(FLAGS_trainer_count), allBarrier_(FLAGS_trainer_count + 1), @@ -65,13 +65,11 @@ MultiGradientMachine::MultiGradientMachine( if (para->useGpu()) return; if (para->isSparseRemoteUpdate()) { - para->enableType( - PARAMETER_VALUE, - FLAGS_loadsave_parameters_in_pserver - ? Parameter::MAT_SPARSE_ROW_PREFETCH - : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); - para->enableType( - PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW); + para->enableType(PARAMETER_VALUE, + FLAGS_loadsave_parameters_in_pserver + ? Parameter::MAT_SPARSE_ROW_PREFETCH + : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); + para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW); } else if (para->isGradSparseUpdate()) { para->enableType(PARAMETER_VALUE); para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_IDS); @@ -100,17 +98,16 @@ MultiGradientMachine::MultiGradientMachine( if (useGpu_) { numLogicalDevices_ = 1; - for (size_t pid = 0; pid < parameters_.size(); pid++) { + for (size_t pid = 0; pid < parameters_.size(); pid++) { if (parameters_[pid]->getConfig().device() + 1 > numLogicalDevices_) { numLogicalDevices_ = parameters_[pid]->getConfig().device() + 1; } } LOG(INFO) << "numLogicalDevices=" << numLogicalDevices_ - << " numThreads=" << numThreads_ - << " numDevices=" << numDevices_; + << " numThreads=" << numThreads_ << " numDevices=" << numDevices_; - if (numLogicalDevices_ * numThreads_ > numDevices_ - && FLAGS_allow_only_one_model_on_one_gpu) { + if (numLogicalDevices_ * numThreads_ > numDevices_ && + FLAGS_allow_only_one_model_on_one_gpu) { LOG(FATAL) << "trainer_count * num_devices_in_model " << "(" << numThreads_ << "*" << numLogicalDevices_ << ")" << "=" << numThreads_ * numLogicalDevices_ @@ -130,11 +127,7 @@ MultiGradientMachine::MultiGradientMachine( } for (int i = 0; i < numThreads_; ++i) { - threads_.emplace_back( - new TrainerThread( - config, - i, - this)); + threads_.emplace_back(new TrainerThread(config, i, this)); } bufferSizes_.resize(numLogicalDevices_, 0); @@ -162,7 +155,7 @@ MultiGradientMachine::MultiGradientMachine( // combination of all trainers mainPara into GradientMachine parameters hasNonstaticCpuParamters_ = false; - for (size_t pid = 0; pid < parameters_.size(); pid++) { + for (size_t pid = 0; pid < parameters_.size(); pid++) { if (parameters_[pid]->useGpu()) { parameters_[pid] = threads_[paraMainThread_[pid]]->getParameters()[pid]; } else if (!parameters_[pid]->isStatic()) { @@ -209,7 +202,7 @@ void MultiGradientMachine::allocGradBufs() { SetDevice device(logicalDeviceId2RealDeviceId(d, i)); for (size_t j = 0; j < mergeTypes_.size(); j++) { gradBufs_[i][d].bufs.push_back( - Vector::create(bufferSizes_[d], /* useGpu= */true)); + Vector::create(bufferSizes_[d], /* useGpu= */ 
true)); } } } @@ -249,18 +242,16 @@ void MultiGradientMachine::prefetch(const std::vector& inArgs) { } } -void MultiGradientMachine::forward( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { +void MultiGradientMachine::forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) { forwardImp(inArgs, outArgs, passType, TASK_FORWARD); } -void MultiGradientMachine::forwardImp( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - TaskType taskType) { +void MultiGradientMachine::forwardImp(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + TaskType taskType) { updateThreadParameters(); passType_ = passType; @@ -282,18 +273,16 @@ void MultiGradientMachine::backward(const UpdateCallback& callback) { backwardImp(callback); } -void MultiGradientMachine::forwardBackward( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback) { +void MultiGradientMachine::forwardBackward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback) { backwardCallback_ = callback; forwardImp(inArgs, outArgs, passType, TASK_FORWARD_BACKWARD); backwardImp(callback); } -void MultiGradientMachine::backwardImp( - const UpdateCallback& callback) { +void MultiGradientMachine::backwardImp(const UpdateCallback& callback) { for (size_t i = 0; i < parameters_.size(); i++) { if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue; REGISTER_TIMER("controller_dequeue"); @@ -349,9 +338,8 @@ void MultiGradientMachine::eval(Evaluator* evaluator) { } } -void MultiGradientMachine::getOutArgs( - std::vector* outArgs, - PassType passType) { +void MultiGradientMachine::getOutArgs(std::vector* outArgs, + PassType passType) { for (auto& thread : threads_) { REGISTER_TIMER("waitOutArgs"); thread->waitOutArgsReady(); @@ -375,7 +363,6 @@ void MultiGradientMachine::getOutArgs( *outArgs = outArgs_; } - void MultiGradientMachine::setOutputGrad(const std::vector& args) { CHECK_EQ(args.size(), outArgs_.size()); for (size_t i = 0; i < args.size(); i++) { @@ -390,10 +377,9 @@ void MultiGradientMachine::startTask(TaskType taskType) { } } -TrainerThread::TrainerThread( - const ModelConfig& config, - int threadId, - MultiGradientMachine* multiMachine) +TrainerThread::TrainerThread(const ModelConfig& config, + int threadId, + MultiGradientMachine* multiMachine) : multiMachine_(multiMachine), config_(config), threadId_(threadId), @@ -407,8 +393,9 @@ TrainerThread::TrainerThread( partnerId_ = mod(threadId_ - 1, numThreads); - deviceId_ = !multiMachine_->useGpu() ? -1 - : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_); + deviceId_ = !multiMachine_->useGpu() + ? 
-1 + : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_); SetDevice gpuDevice(deviceId_); NeuralNetwork* nn = nullptr; @@ -418,22 +405,20 @@ TrainerThread::TrainerThread( nn = new ParallelNeuralNetwork(); for (auto& paraConfig : *config_.mutable_parameters()) { if (paraConfig.device() != -1) { - paraConfig.set_device( - multiMachine_->logicalDeviceId2RealDeviceId( + paraConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId( paraConfig.device(), threadId_)); } } for (auto& layerConfig : *config_.mutable_layers()) { if (layerConfig.device() != -1) { - layerConfig.set_device( - multiMachine_->logicalDeviceId2RealDeviceId( + layerConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId( layerConfig.device(), threadId_)); } } } // Only GPU do not share parameter values with main paramters. - ParamInitCallback slaveParamInitCb = std::bind(parameterInitNN, _1, _2, - &mainParas); + ParamInitCallback slaveParamInitCb = + std::bind(parameterInitNN, _1, _2, &mainParas); nn->init(config_, slaveParamInitCb); gradientMachine_.reset(nn); parameters_ = gradientMachine_->getParameters(); @@ -443,9 +428,8 @@ TrainerThread::TrainerThread( } } - backwardCallback_ = std::bind( - &TrainerThread::backwardCallback, - this, std::placeholders::_1); + backwardCallback_ = + std::bind(&TrainerThread::backwardCallback, this, std::placeholders::_1); gradStream_ = HPPL_STREAM_2; valueStream_ = HPPL_STREAM_3; @@ -454,25 +438,21 @@ TrainerThread::TrainerThread( parameterUpdated_ = false; } -TrainerThread::~TrainerThread() { - stop(); -} +TrainerThread::~TrainerThread() { stop(); } void TrainerThread::start() { - gradientMachine_->start(*(TrainerConfig*)nullptr, (DataProviderPtr)nullptr); + gradientMachine_->start(*(TrainerConfig*)nullptr, (DataProviderPtr) nullptr); - computeThread_.reset(new std::thread( - [this](){ computeThread(); })); + computeThread_.reset(new std::thread([this]() { computeThread(); })); if (multiMachine_->useGpu()) { - gradCollectThread_.reset(new std::thread( - [this](){ gradCollectThread(); })); + gradCollectThread_.reset( + new std::thread([this]() { gradCollectThread(); })); - valueDispatchThread_.reset(new std::thread( - [this](){ valueDispatchThread(); })); + valueDispatchThread_.reset( + new std::thread([this]() { valueDispatchThread(); })); - copyThread_.reset(new std::thread( - [this](){ copyGradToBufferThread(); })); + copyThread_.reset(new std::thread([this]() { copyGradToBufferThread(); })); } } @@ -565,20 +545,14 @@ void TrainerThread::forward() { { REGISTER_TIMER("wait_value"); - valueReadyCond_.wait( - [this]() { - return !parameterUpdated_; - }); + valueReadyCond_.wait([this]() { return !parameterUpdated_; }); } - { - fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); - } + { fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); } { REGISTER_TIMER("thread_forward"); - gradientMachine_->forward( - inArgs_, &outArgs_, multiMachine_->getPassType()); + gradientMachine_->forward(inArgs_, &outArgs_, multiMachine_->getPassType()); } outArgsReadySem_.post(); } @@ -602,9 +576,8 @@ void TrainerThread::backwardCallback(Parameter* para) { if (multiMachine_->getNumThreads() == 1) { // no need to do merge if there is only one thread doCallback(paramId); - } else if (threadId_ == - mod(multiMachine_->paraMainThread(paramId) - 1, - multiMachine_->getNumThreads())) { + } else if (threadId_ == mod(multiMachine_->paraMainThread(paramId) - 1, + multiMachine_->getNumThreads())) { notifyCopyGradToBuffer(paramId); } else { notifyGradientCollect(paramId); @@ -625,7 +598,7 
@@ void TrainerThread::copyGradToBufferThread() {
     if (stopping_) break;

     int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
-        parameters_[pid]->getDeviceId(), threadId_);
+        parameters_[pid]->getDeviceId(), threadId_);

     auto& gradBuf = gradBufs[pdeviceId];
@@ -639,9 +612,9 @@ void TrainerThread::copyGradToBufferThread() {
       SetDevice setDevice(parameters_[pid]->getDeviceId());
       for (size_t i = 0; i < mergeTypes_.size(); ++i) {
         gradBuf.bufs[i]->resize(
-            parameters_[pid]->getBuf(mergeTypes_[i])->getSize());
-        gradBuf.bufs[i]->copyFrom(
-            *parameters_[pid]->getBuf(mergeTypes_[i]), gradStream_);
+            parameters_[pid]->getBuf(mergeTypes_[i])->getSize());
+        gradBuf.bufs[i]->copyFrom(*parameters_[pid]->getBuf(mergeTypes_[i]),
+                                  gradStream_);
       }
       hl_stream_synchronize(gradStream_);
     }
@@ -667,7 +640,7 @@ void TrainerThread::gradCollectThread() {
     if (++gradReadyCount[pid] < 2) continue;
     gradReadyCount[pid] = 0;
     int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
-        parameters_[pid]->getDeviceId(), threadId_);
+        parameters_[pid]->getDeviceId(), threadId_);

     auto& gradBuf = gradBufs[pdeviceId];
@@ -741,8 +714,7 @@ void TrainerThread::valueDispatchThread() {

 void TrainerThread::notifyValueReady(int paramId) {
   if (--updateCounter_ == 0) {
-    valueReadyCond_.notify_all(
-        [this] { parameterUpdated_ = false; });
+    valueReadyCond_.notify_all([this] { parameterUpdated_ = false; });
   }

   notifyValueDispatch(paramId);
@@ -750,7 +722,7 @@ void TrainerThread::notifyValueReady(int paramId) {

 void TrainerThread::copyInArgs() {
   const std::vector<Argument>& fullInArgs = multiMachine_->getInArgs();
-  int numThreads = multiMachine_->getAllThreads().size();
+  int numThreads = multiMachine_->getAllThreads().size();
   int32_t numSequences = fullInArgs[0].getNumSequences();
   int32_t startSeq = numSequences * threadId_ / numThreads;
   int32_t endSeq = numSequences * (threadId_ + 1) / numThreads;
@@ -767,9 +739,11 @@ void TrainerThread::copyInArgs() {
     return;
   }

-  for (size_t i=0; i < fullInArgs.size(); i++) {
+  for (size_t i = 0; i < fullInArgs.size(); i++) {
     inArgs_[i].resizeAndCopyFrom(
-        fullInArgs[i], startSeq, copySize,
+        fullInArgs[i],
+        startSeq,
+        copySize,
         FLAGS_parallel_nn ? false : multiMachine_->useGpu());
   }
 }
@@ -814,10 +788,8 @@ void TrainerThread::mergeGradSparse(
   std::vector<uint32_t>& ids = mainMat->getIds(threadId_);

   for (auto slaveParams : slaveParameters) {
-    SparseRowCpuMatrix* mat =
-        dynamic_cast<SparseRowCpuMatrix*>((*slaveParams)[pid]
-                                              ->getMat(PARAMETER_GRADIENT)
-                                              .get());
+    SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
+        (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get());
     mat->addTo(*mainMat, ids, threadId_, multiMachine_->getNumThreads());
     // we use a simple hash method (%) instead of range partition,
     // because range partition has balance issue sometimes,
@@ -847,9 +819,10 @@ void TrainerThread::mergeGradDense(
     Parameter* para,
     std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
   size_t pid = para->getID();
-  auto interval =
-      calcSplitArrayInterval(para->getSize(), (size_t)threadId_,
-                             multiMachine_->getNumThreads(), 8LU /*for avx*/);
+  auto interval = calcSplitArrayInterval(para->getSize(),
+                                         (size_t)threadId_,
+                                         multiMachine_->getNumThreads(),
+                                         8LU /*for avx*/);
   size_t startSeq = interval.first;
   size_t copySize = interval.second - interval.first;
@@ -861,8 +834,7 @@ void TrainerThread::mergeGradDense(
   CpuVector slaveGradSub(0, nullptr);
   for (auto slaveParams : slaveParameters) {
     slaveGradSub.subVecFrom(
-        *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT),
-        startSeq, copySize);
+        *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT), startSeq, copySize);
     destGrad.add(slaveGradSub);
   }
 }
@@ -876,7 +848,9 @@ void TrainerThread::copyOutputGrad() {
   int32_t copySize = endSeq - startSeq;
   outArgs_.resize(outputGradArgs.size());
   for (size_t i = 0; i < outputGradArgs.size(); i++) {
-    outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize,
+    outArgs_[i].resizeAndCopyFrom(outputGradArgs[i],
+                                  startSeq,
+                                  copySize,
                                   multiMachine_->useGpu(),
                                   HPPL_STREAM_DEFAULT);
   }
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index d13cf426c29e4e9f6806178f2362e8189fdb0dec..58c5486810cf280c48c62f2256480c1a4bb047bc 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-
 #pragma once

 #include
@@ -52,7 +51,8 @@ struct GradBuffer {
  *
  * It handles GPU and Cpu parameters differently. In GPU, one computing thread
  * generally corresponds to one GPU device. Thus, each thread keeps a separate
- * copy of the parameter in its own device's memory. In CPU, we only need to keep
+ * copy of the parameter in its own device's memory. In CPU, we only need to
+ * keep
  * one copy of the parameters in the main memory. After each computing thread
  * computes its own parameter gradient, the update process needs to accumulate
  * the parameter gradients from all the computing threads, and update the
@@ -66,16 +66,21 @@ struct GradBuffer {
  * computing thread so that the parameters in all the computing threads are
  * synchronized. The scatter and gather process are implemented by ring-style
  * communication. Assume we have N computing threads, its thread ids will be
- * 0, 1, ..., N-1. For each parameter, the id of the main thread is specified in
- * paraMainThread_[pid], where pid is the id of the parameter. Each thread i only
+ * 0, 1, ..., N-1. For each parameter, the id of the main thread is specified
+ * in
+ * paraMainThread_[pid], where pid is the id of the parameter. Each thread i
+ * only
  * sends data to its partner thread (i - 1) % N. For example, for a parameter
  * gradient that is computed in thread 4, and its main thread is 2. Its
- * traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the gradient
+ * traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the
+ * gradient
  * buffer is added to the local gradient, and the local gradient is then copied
  * to the gradient buffer of the next thread. At last, its main thread 2 will
  * get the accumulated parameter gradient. For the same parameter, after its
- * value is updated, the value's traveling process would be 2, 1, 0, N-1, ... 3.
- * At the end, all the computing threads would have the updated parameter value.
+ * value is updated, the value's traveling process would be 2, 1, 0, N-1, ...
+ * 3.
+ * At the end, all the computing threads would have the updated parameter
+ * value.
  *
  * A computing thread (TrainerThread) uses 4 threads to do different jobs:
  *
@@ -94,8 +99,10 @@ struct GradBuffer {
  *
  * Handling of sparse update
  * Currently, sparse update is only supported for CPU parameters.
- * Sparse updates refers to gradient caculation where the gradient is sparse. For
- * example, if the input argument to a 'fc' layer is sparse, the gradient of the
+ * Sparse update refers to gradient calculation where the gradient is sparse.
+ * For
+ * example, if the input argument to a 'fc' layer is sparse, the gradient of
+ * the
  * weight matrix of this layer will be sparse. It is usually more efficient to
  * treat the gradient explicitly as sparse vector during the parameter update.
@@ -104,7 +111,8 @@ struct GradBuffer {
  * For both types of sparse updates, there is one copy of parameter value and
  * gradient called main parameter value and gradient, and there is a copy of
- * parameter value and gradient for each computing thread called slave parameter
+ * parameter value and gradient for each computing thread called slave
+ * parameter
  * value and gradient. The slave parameter values are always shared with the
  * corresponding main parameter value. The slave parameter grad is a sparse row
  * matrix. The sparse pattern for slave parameter grads are different, because
@@ -124,7 +132,8 @@ struct GradBuffer {
  * (SparseAutoGrowRowCpuMatrix). It is a sparse row matrix.
  *
  * During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
- * gather all the non-zero gradient. And After backward(), they will be merged
+ * gather all the non-zero gradient. And after backward(), they will be
+ * merged
  * into main parameter grad (SparseRowIdsCpuMatrix), with indices indicating
  * which rows have nonzero gradient.
  *
@@ -136,9 +145,11 @@ struct GradBuffer {
  * parameter values that are prefetched is up-to-date.
  *
  * Main parameter grad type is MAT_SPARSE_ROW (SparseRowCpuMatrix).
- * And it shares sparse pattern with value by sharing indexDictHandle_, which
+ * And it shares sparse pattern with value by sharing indexDictHandle_,
+ * which
  * is an internal data structure used by SparseRowCpuMatrix to specify the
- * sparsity pattern of Slave parameter value shares with main parameter value.
+ * sparsity pattern. Slave parameter value is shared with the main parameter
+ * value.
  *
  * Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW
  * (SparsePrefetchRowCpuMatrix). It is a sparse row matrix
@@ -148,8 +159,10 @@ struct GradBuffer {
  * parameter server.
  *
  * During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
- * gather all the non-zero gradient. And After backward(), they will be merged
- * into main parameter grad (SparseRowCpuMatrix). And the framework will send
+ * gather all the non-zero gradient. And after backward(), they will be
+ * merged
+ * into main parameter grad (SparseRowCpuMatrix). And the framework will
+ * send
  * the merged gradient to parameter server.
  */
 class MultiGradientMachine : public GradientMachine {
@@ -165,18 +178,16 @@ public:
   virtual void prefetch(const std::vector<Argument>& inArgs);

-  virtual void forward(
-      const std::vector<Argument>& inArgs,
-      std::vector<Argument>* outArgs,
-      PassType passType);
+  virtual void forward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType);

   virtual void backward(const UpdateCallback& callback = nullptr);

-  void forwardBackward(
-      const std::vector<Argument>& inArgs,
-      std::vector<Argument>* outArgs,
-      PassType passType,
-      const UpdateCallback& callback);
+  void forwardBackward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType,
+                       const UpdateCallback& callback);

   virtual void onPassEnd();
@@ -186,9 +197,7 @@ public:

   virtual void eval(Evaluator* evaluator);

-  bool useGpu() const {
-    return useGpu_;
-  }
+  bool useGpu() const { return useGpu_; }

   /// @return whether to pass the gradients in outArgs_ to each thread.
   bool isPassGrad() { return isPassGrad_; }
@@ -203,9 +212,7 @@ protected:
   friend class TrainerThread;

-  std::vector<TrainerThreadPtr>& getAllThreads() {
-    return threads_;
-  }
+  std::vector<TrainerThreadPtr>& getAllThreads() { return threads_; }
   /// Calculate the real device id based on the logical device id and the
   /// thread id.
   int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
@@ -229,9 +236,7 @@ protected:

   std::vector<const std::vector<ParameterPtr>*> getSlaveParameters();

-  bool hasNonstaticCpuParamters() const {
-    return hasNonstaticCpuParamters_;
-  }
+  bool hasNonstaticCpuParamters() const { return hasNonstaticCpuParamters_; }

   /// Called by TrainerThread to wait before merging CPU parameter gradients.
void waitBeforeMerge() { trainerBarrier_.wait(); } @@ -244,59 +249,41 @@ protected: /// finishing void waitForCopyInArgs() { allBarrier_.wait(); } - TrainerThreadPtr& getThread(int threadId) { - return threads_[threadId]; - } + TrainerThreadPtr& getThread(int threadId) { return threads_[threadId]; } std::vector& getGradBuf(int threadId) { return gradBufs_[threadId]; } - PassType getPassType() const { - return passType_; - } + PassType getPassType() const { return passType_; } /// Called by TrainerThread to notify MultiGradientMachine that the gradient /// for paramId is ready void notifyGradientTransfer(int paramId); - const std::vector& getInArgs() { - return inArgs_; - } + const std::vector& getInArgs() { return inArgs_; } - TaskType getTaskType() const { - return taskType_; - } + TaskType getTaskType() const { return taskType_; } const UpdateCallback& getBackwardCallback() const { return backwardCallback_; } - int getNumDevices() const { - return numDevices_; - } + int getNumDevices() const { return numDevices_; } - int getNumLogicalDevices() const { - return numLogicalDevices_; - } + int getNumLogicalDevices() const { return numLogicalDevices_; } - int getNumThreads() const { - return numThreads_; - } + int getNumThreads() const { return numThreads_; } - int paraMainThread(int pid) const { - return paraMainThread_[pid]; - } + int paraMainThread(int pid) const { return paraMainThread_[pid]; } protected: - virtual void forwardImp( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - TaskType taskType); + virtual void forwardImp(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + TaskType taskType); - virtual void backwardImp( - const UpdateCallback& callback = NULL); + virtual void backwardImp(const UpdateCallback& callback = NULL); /// update all parameters void updateThreadParameters(); @@ -329,9 +316,9 @@ protected: /// ParameterType which needs to be merged from each GPU std::vector mergeTypes_; - int numDevices_; /* number of gpu devices */ + int numDevices_; /* number of gpu devices */ int numLogicalDevices_; // number of GPU used by one NN - int numThreads_; /* number of train threads */ + int numThreads_; /* number of train threads */ UpdateCallback backwardCallback_; @@ -350,38 +337,25 @@ protected: class TrainerThread { public: - TrainerThread( - const ModelConfig& config, - int threadId, - MultiGradientMachine* multiMachine); + TrainerThread(const ModelConfig& config, + int threadId, + MultiGradientMachine* multiMachine); ~TrainerThread(); void start(); - void onPassEnd() { - gradientMachine_->onPassEnd(); - } + void onPassEnd() { gradientMachine_->onPassEnd(); } - void waitOutArgsReady() { - outArgsReadySem_.wait(); - } + void waitOutArgsReady() { outArgsReadySem_.wait(); } - void notifyTaskReady() { - taskReadySem_.post(); - } + void notifyTaskReady() { taskReadySem_.post(); } - int getDeviceId() const { - return deviceId_; - } + int getDeviceId() const { return deviceId_; } - GradientMachine* getGradientMachine() { - return gradientMachine_.get(); - } + GradientMachine* getGradientMachine() { return gradientMachine_.get(); } - const std::vector& getParameters() { - return parameters_; - } + const std::vector& getParameters() { return parameters_; } void stop(); @@ -391,26 +365,18 @@ public: return parameters_[paramId]->getBuf(PARAMETER_VALUE); } - const std::vector& getOutArgs() { - return outArgs_; - } + const std::vector& getOutArgs() { return outArgs_; } void incUpdateCounter(int n = 1) { updateCounter_ += n; parameterUpdated_ 
= true; } - void notifyGradientCollect(int paramId) { - gradQueue_.enqueue(paramId); - } + void notifyGradientCollect(int paramId) { gradQueue_.enqueue(paramId); } - void notifyCopyGradToBuffer(int paramId) { - gradBufQueue_.enqueue(paramId); - } + void notifyCopyGradToBuffer(int paramId) { gradBufQueue_.enqueue(paramId); } - void notifyValueDispatch(int paramId) { - valueReadyQueue_.enqueue(paramId); - } + void notifyValueDispatch(int paramId) { valueReadyQueue_.enqueue(paramId); } void prefetch(); @@ -421,16 +387,16 @@ protected: void mergeCpuGradients(); void mergeGradSparse( - Parameter* para, - std::vector*>& slaveParameters); + Parameter* para, + std::vector*>& slaveParameters); void mergeGradSparseRemote( - Parameter* para, - std::vector*>& slaveParameters); + Parameter* para, + std::vector*>& slaveParameters); void mergeGradDense( - Parameter* para, - std::vector*>& slaveParameters); + Parameter* para, + std::vector*>& slaveParameters); void computeThread(); void valueDispatchThread(); @@ -499,5 +465,4 @@ protected: bool inArgsCopied_; }; - } // namespace paddle diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp index d30ca6f28e8647d3cb565a7899ac5f8ef879883a..e5be19cad6b450850de4cc5776017b79d3243681 100644 --- a/paddle/gserver/gradientmachines/MultiNetwork.cpp +++ b/paddle/gserver/gradientmachines/MultiNetwork.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" #include @@ -24,7 +23,8 @@ limitations under the License. */ namespace paddle { -void MultiNetwork::init(const ModelConfig& config, ParamInitCallback callback, +void MultiNetwork::init(const ModelConfig& config, + ParamInitCallback callback, const std::vector& parameterTypes, bool useGpu) { CHECK_GT(config.sub_models_size(), 1) << "sub_models_size should GT 1"; @@ -40,10 +40,10 @@ void MultiNetwork::init(const ModelConfig& config, ParamInitCallback callback, std::string subModelName = config.sub_models(i).name(); if (FLAGS_parallel_nn) { subNetworks_[i - 1] = std::unique_ptr( - new ParallelNeuralNetwork(subModelName, this)); + new ParallelNeuralNetwork(subModelName, this)); } else { subNetworks_[i - 1] = std::unique_ptr( - NeuralNetwork::newNeuralNetwork(subModelName, this)); + NeuralNetwork::newNeuralNetwork(subModelName, this)); } subNetworks_[i - 1]->init(config); } @@ -64,7 +64,8 @@ void MultiNetwork::prefetch(const std::vector& inArgs) { } void MultiNetwork::forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType) { + std::vector* outArgs, + PassType passType) { // split inArgs to several vectors std::vector> argumentGroups; Argument::splitByDataId(inArgs, &argumentGroups); @@ -154,7 +155,7 @@ public: return -1; } - virtual void printStats(std::ostream& os) { + virtual void printStats(std::ostream& os) const { for (auto& evaluator : evaluators_) { evaluator->printStats(os); os << ' '; diff --git a/paddle/gserver/gradientmachines/MultiNetwork.h b/paddle/gserver/gradientmachines/MultiNetwork.h index a162420c3bfe63fdca26dc5da0514dc7854df091..779a2267f55c8e1b5d120d9fd1e2a0d455cc5c59 100644 --- a/paddle/gserver/gradientmachines/MultiNetwork.h +++ b/paddle/gserver/gradientmachines/MultiNetwork.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "GradientMachine.h" @@ -27,19 +26,22 @@ public: explicit MultiNetwork(std::string subModelName = "") : NeuralNetwork(subModelName) {} - virtual void init(const ModelConfig& config, ParamInitCallback callback, + virtual void init(const ModelConfig& config, + ParamInitCallback callback, const std::vector& parameterTypes, bool useGpu); virtual void prefetch(const std::vector& inArgs); virtual void forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType); + std::vector* outArgs, + PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); void forwardBackward(const std::vector& inArgs, - std::vector* outArgs, PassType passType, + std::vector* outArgs, + PassType passType, const UpdateCallback& callback); virtual void onPassEnd(); @@ -52,8 +54,7 @@ public: return subNetworks_; } - virtual void start(const TrainerConfig& config, - DataProviderPtr dataProvider); + virtual void start(const TrainerConfig& config, DataProviderPtr dataProvider); virtual void finish(); diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 3127b4dd9a2fd3a3da26b90100763c4ec2470cae..9932ea655ebdceb2eb1ae8920f4d320163d14262 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "paddle/utils/Logging.h" @@ -26,7 +25,8 @@ limitations under the License. */ #include "paddle/gserver/layers/AgentLayer.h" namespace paddle { -void parameterInitNN(int paramId, Parameter* para, +void parameterInitNN(int paramId, + Parameter* para, std::vector* sharedParams) { // Create parameters values. if (!para->useGpu() && sharedParams) { @@ -35,10 +35,10 @@ void parameterInitNN(int paramId, Parameter* para, (*sharedParams)[paramId]->getMat(PARAMETER_VALUE)); } else { if (para->isSparseRemoteUpdate()) { - para->enableType( - PARAMETER_VALUE, FLAGS_loadsave_parameters_in_pserver - ? Parameter::MAT_SPARSE_ROW_PREFETCH - : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); + para->enableType(PARAMETER_VALUE, + FLAGS_loadsave_parameters_in_pserver + ? Parameter::MAT_SPARSE_ROW_PREFETCH + : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); } else { para->enableType(PARAMETER_VALUE); } @@ -65,7 +65,8 @@ NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) { std::map NeuralNetwork::dllInitMap; -void NeuralNetwork::init(const ModelConfig& config, ParamInitCallback callback, +void NeuralNetwork::init(const ModelConfig& config, + ParamInitCallback callback, const std::vector& parameterTypes, bool useGpu) { using std::placeholders::_1; @@ -89,12 +90,13 @@ void NeuralNetwork::init(const ModelConfig& config, ParamInitCallback callback, } else { parameters_.reserve(config.parameters_size()); for (const auto& para_config : config.parameters()) { - auto parameter = std::make_shared(para_config, useGpu, + auto parameter = std::make_shared(para_config, + useGpu, /*initialize=*/false); paramCallback(parameters_.size(), parameter.get()); if (!callback) { for (ParameterType type : - (parameter->isStatic() + (parameter->isStatic() ? 
std::vector{PARAMETER_VALUE} : parameterTypes)) { if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) { @@ -117,18 +119,19 @@ void NeuralNetwork::init(const ModelConfig& config, ParamInitCallback callback, layerMap_[layer->getName()] = layer; }; - auto subModelConfig = - std::find_if(config.sub_models().begin(), config.sub_models().end(), - [=](const SubModelConfig& sub_model) { - return sub_model.name() == subModelName_; - }); + auto subModelConfig = std::find_if(config.sub_models().begin(), + config.sub_models().end(), + [=](const SubModelConfig& sub_model) { + return sub_model.name() == subModelName_; + }); bool useSubModel = (subModelConfig != config.sub_models().end()); CHECK_EQ(useSubModel, !subModelName_.empty()); if (useSubModel) { layers_.reserve(subModelConfig->layer_names_size()); for (const auto& layer_name : subModelConfig->layer_names()) { auto layer_config = - std::find_if(config.layers().begin(), config.layers().end(), + std::find_if(config.layers().begin(), + config.layers().end(), [=](const LayerConfig& layer_config) { return layer_config.name() == layer_name; }); @@ -176,14 +179,16 @@ void NeuralNetwork::init(const ModelConfig& config, ParamInitCallback callback, } } -void NeuralNetwork::connect(LayerPtr agentLayer, LayerPtr realLayer, +void NeuralNetwork::connect(LayerPtr agentLayer, + LayerPtr realLayer, int height) { AgentLayer* agent = dynamic_cast(agentLayer.get()); CHECK_NOTNULL(agent); agent->setRealLayer(realLayer, height); } -void NeuralNetwork::connect(std::string agentLayerName, NeuralNetwork* srcNN, +void NeuralNetwork::connect(std::string agentLayerName, + NeuralNetwork* srcNN, std::string realLayerName) { connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName)); } @@ -195,7 +200,7 @@ void NeuralNetwork::prefetch(const std::vector& inArgs) { for (auto& para : parameters_) { if (para->isSparseRemoteUpdate()) { auto mat = dynamic_cast( - para->getMat(PARAMETER_VALUE).get()); + para->getMat(PARAMETER_VALUE).get()); para->clearGradient(); mat->clearIndices(); } @@ -217,10 +222,10 @@ void NeuralNetwork::prefetch(const std::vector& inArgs) { for (auto& para : parameters_) { if (para->isSparseRemoteUpdate()) { auto mat = dynamic_cast( - para->getMat(PARAMETER_VALUE).get()); + para->getMat(PARAMETER_VALUE).get()); mat->setupIndices(); auto matGrad = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); + para->getMat(PARAMETER_GRADIENT).get()); matGrad->reserveStore(); } } @@ -228,7 +233,8 @@ void NeuralNetwork::prefetch(const std::vector& inArgs) { } void NeuralNetwork::forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType) { + std::vector* outArgs, + PassType passType) { CHECK_EQ(inArgs.size(), dataLayers_.size()); outArgs->resize(outputLayers_.size()); for (size_t i = 0; i != dataLayers_.size(); ++i) { @@ -325,7 +331,7 @@ public: (void)arguments; return -1; } - virtual void printStats(std::ostream& os) { + virtual void printStats(std::ostream& os) const { for (auto& evaluator : evaluators_) { evaluator->printStats(os); os << ' '; @@ -344,11 +350,11 @@ protected: Evaluator* NeuralNetwork::makeEvaluator() { CombinedEvaluator* combinedEvaluator = new CombinedEvaluator(); - auto subModelConfig = - std::find_if(config_.sub_models().begin(), config_.sub_models().end(), - [=](const SubModelConfig& sub_model) { - return sub_model.name() == subModelName_; - }); + auto subModelConfig = std::find_if(config_.sub_models().begin(), + config_.sub_models().end(), + [=](const SubModelConfig& sub_model) { + return sub_model.name() == 
subModelName_; + }); bool useSubModel = (subModelConfig != config_.sub_models().end()); CHECK_EQ(useSubModel, !subModelName_.empty()); if (useSubModel) { @@ -356,7 +362,8 @@ Evaluator* NeuralNetwork::makeEvaluator() { for (int i = 0; i < subModelConfig->evaluator_names_size(); ++i) { // find evaluator by name auto thisEvalConfig = std::find_if( - config_.evaluators().begin(), config_.evaluators().end(), + config_.evaluators().begin(), + config_.evaluators().end(), [=](const EvaluatorConfig& ecfg) { return ecfg.name() == subModelConfig->evaluator_names(i); }); @@ -385,17 +392,17 @@ void NeuralNetwork::setOutputGrad(const std::vector& args) { } } -extern NeuralNetwork* newCustomNerualNetwork( - const std::string& name, NeuralNetwork* network) __attribute__((weak)); +extern NeuralNetwork* newCustomNerualNetwork(const std::string& name, + NeuralNetwork* network) + __attribute__((weak)); -NeuralNetwork* NeuralNetwork::newNeuralNetwork( - const std::string& name, - NeuralNetwork* rootNetwork) { - if (newCustomNerualNetwork) { - return newCustomNerualNetwork(name, rootNetwork); - } else { - return new NeuralNetwork(name, rootNetwork); - } +NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name, + NeuralNetwork* rootNetwork) { + if (newCustomNerualNetwork) { + return newCustomNerualNetwork(name, rootNetwork); + } else { + return new NeuralNetwork(name, rootNetwork); + } } } // namespace paddle diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h index 06c679a63cc79b68b9fd27dfb64dfa9add8a1078..55ef45c5eeddc770ec3bc8fd0055d561eaf3b754 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.h +++ b/paddle/gserver/gradientmachines/NeuralNetwork.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -52,14 +51,15 @@ namespace paddle { * GPU value: NORMAL * GPU param: NORMAL */ -void parameterInitNN(int paramId, Parameter* para, +void parameterInitNN(int paramId, + Parameter* para, std::vector* sharedParams); - class NeuralNetwork : public GradientMachine { public: virtual void init( - const ModelConfig& config, ParamInitCallback callback = nullptr, + const ModelConfig& config, + ParamInitCallback callback = nullptr, const std::vector& parameterTypes = std::vector{PARAMETER_VALUE, PARAMETER_GRADIENT, @@ -76,13 +76,15 @@ public: * @param agentLayer The up-submodel's input agent layer. 
*/ static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0); - void connect(std::string agentLayerName, NeuralNetwork* srcNN, + void connect(std::string agentLayerName, + NeuralNetwork* srcNN, std::string realLayerName); virtual void prefetch(const std::vector& inArgs); virtual void forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType); + std::vector* outArgs, + PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); @@ -117,16 +119,15 @@ public: */ template void forEachLayer(T callback) { - for (auto & l : layers_) { + for (auto& l : layers_) { if (callback(l)) { break; } } } - static NeuralNetwork* newNeuralNetwork(const std::string& name = "", - NeuralNetwork* rootNetwork = nullptr); + NeuralNetwork* rootNetwork = nullptr); protected: /** @@ -139,8 +140,7 @@ protected: */ NeuralNetwork(std::string subModelName = "", NeuralNetwork* rootNetwork = nullptr) - : subModelName_(subModelName), - rootNetwork_(rootNetwork) {} + : subModelName_(subModelName), rootNetwork_(rootNetwork) {} std::string subModelName_; ModelConfig config_; diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp index 952df60a7d78666c84d5fd9176c3113fdbdacdc9..9dbf418c31b0969eef7477a22b6f1bf63dab9b03 100644 --- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" @@ -24,10 +23,18 @@ limitations under the License. */ namespace paddle { void ParallelNeuralNetwork::init( - const ModelConfig& config, ParamInitCallback callback, - const std::vector& parameterTypes, bool useGpu) { + const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu) { NeuralNetwork::init(config, callback, parameterTypes, useGpu); + if (config.type() == "recurrent_nn") { + LOG(FATAL) + << "You can not add `--parallel_nn=true` on the command line, " + << "parallel_nn training mode does not support the recurrent_nn model."; + } + useGpu_ = useGpu; numDevices_ = 0; if (useGpu_) { @@ -48,8 +55,8 @@ void ParallelNeuralNetwork::addComputeThread(int deviceId) { } } - threads_.emplace_back(new ParallelThread(threads_.size(), deviceId, - deviceId >= 0 ? useGpu_ : false)); + threads_.emplace_back(new ParallelThread( + threads_.size(), deviceId, deviceId >= 0 ? useGpu_ : false)); } void ParallelNeuralNetwork::waitAllThread() { @@ -62,7 +69,8 @@ void ParallelNeuralNetwork::waitAllThread() { } } -void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId, LayerPtr layer, +void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId, + LayerPtr layer, TaskType task) { for (auto& thread : threads_) { if (thread->getDeviceId() == deviceId) { diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h index 2a3db654f4e16c0ecd4be91425330208046b4a6c..71488bc3b7a52d851d0e3fb77c48f3fd36bdce83 100644 --- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h +++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "NeuralNetwork.h" @@ -35,24 +34,27 @@ enum TaskType { class ParallelNeuralNetwork : public NeuralNetwork { public: ParallelNeuralNetwork(std::string subModelName = "", - NeuralNetwork* rootNetwork = nullptr) - : NeuralNetwork(subModelName, rootNetwork) {} + NeuralNetwork *rootNetwork = nullptr) + : NeuralNetwork(subModelName, rootNetwork) {} virtual void init( - const ModelConfig &config, ParamInitCallback callback = nullptr, - const std::vector & - parameterTypes = std::vector{PARAMETER_VALUE, - PARAMETER_GRADIENT, - PARAMETER_MOMENTUM}, + const ModelConfig &config, + ParamInitCallback callback = nullptr, + const std::vector + ¶meterTypes = std::vector{PARAMETER_VALUE, + PARAMETER_GRADIENT, + PARAMETER_MOMENTUM}, bool useGpu = FLAGS_use_gpu); virtual void forward(const std::vector &inArgs, - std::vector *outArgs, PassType passType); + std::vector *outArgs, + PassType passType); virtual void backward(const UpdateCallback &callback = nullptr); void forwardBackward(const std::vector &inArgs, - std::vector *outArgs, PassType passType, + std::vector *outArgs, + PassType passType, const UpdateCallback &callback = NULL); virtual void start(const TrainerConfig &config, DataProviderPtr dataProvider); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 340cd1b9f8e927ded5d06ab0c7ab15ec75bc8469..516b61757698923eb0fde1f3b1d28074cac10044 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -53,8 +53,8 @@ typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes); * path. * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!! */ -typedef real (*DiyCalcProbCallback)(int handler, size_t nNodes, int* nodes, - real curProb, bool atEos); +typedef real (*DiyCalcProbCallback)( + int handler, size_t nNodes, int* nodes, real curProb, bool atEos); /** * Finish Custom Calculation of Probability callback type. 
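As an aside, a user-supplied hook matching the DiyCalcProbCallback typedef above could look like the following minimal sketch. The diyCalcProb name and the kBannedToken constant are assumptions for illustration, not part of PaddlePaddle; the contract it follows is the one documented above, i.e. returning -INFINITY drops the candidate path immediately.

#include <cmath>
#include <cstddef>
#include <limits>

typedef float real;  // stands in for PaddlePaddle's `real` typedef

static const int kBannedToken = 7;  // hypothetical application-level constant

// Rescore a partially generated path during beam search.
real diyCalcProb(int handler, size_t nNodes, int* nodes, real curProb,
                 bool atEos) {
  (void)handler;  // opaque handle from the start callback, unused here
  for (size_t i = 0; i < nNodes; ++i) {
    if (nodes[i] == kBannedToken) {
      // Returning -INFINITY removes this path from the beam immediately.
      return -std::numeric_limits<real>::infinity();
    }
  }
  // Apply a mild length normalization only when the path reaches EOS;
  // raw accumulated log-probabilities otherwise favor shorter paths.
  return atEos ? curProb / std::pow(static_cast<real>(nNodes), real(0.6))
               : curProb;
}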
@@ -190,13 +190,16 @@ public: }; void RecurrentGradientMachine::init( - const ModelConfig& config, ParamInitCallback callback, - const std::vector& parameterTypes, bool useGpu) { + const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu) { NeuralNetwork::init(config, callback, parameterTypes, useGpu); useGpu_ = useGpu; auto subModelConfig = - std::find_if(config.sub_models().begin(), config.sub_models().end(), + std::find_if(config.sub_models().begin(), + config.sub_models().end(), [this](const SubModelConfig& sub_model) { return sub_model.name() == this->subModelName_; }); @@ -224,7 +227,8 @@ void RecurrentGradientMachine::init( memoryFrameLines_[i].layerName = memoryConfig.layer_name(); memoryFrameLines_[i].linkName = memoryConfig.link_name(); auto agentConfig = - std::find_if(config.layers().begin(), config.layers().end(), + std::find_if(config.layers().begin(), + config.layers().end(), [&memoryConfig](const LayerConfig& layerConfig) { return layerConfig.name() == memoryConfig.link_name(); }); @@ -413,7 +417,8 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, // sample is one sentence if (shareInlinkInfo) { CHECK_EQ(input1.getBatchSize(), batchSize); - CHECK(std::equal(starts, starts + numSequences + 1, + CHECK(std::equal(starts, + starts + numSequences + 1, input1.sequenceStartPositions->getData(false))); } } @@ -428,7 +433,8 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); CHECK_EQ((size_t)input1.getNumSubSequences(), numSubSequences); if (shareInlinkInfo) { - CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, + CHECK(std::equal(subStarts, + subStarts + numSubSequences + 1, input1.subSequenceStartPositions->getData(false))); } } @@ -460,8 +466,10 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, // inFrameLine select rows in real layer one time for (size_t i = 0; i < inFrameLines_.size(); i++) { int curInlinkId = shareInlinkInfo ? 
0 : i;
-    selectRowsOneTime(inFrameLines_[i].inLayer, info_[curInlinkId].allIds,
-                      &(inFrameLines_[i].outArg), passType);
+    selectRowsOneTime(inFrameLines_[i].inLayer,
+                      info_[curInlinkId].allIds,
+                      &(inFrameLines_[i].outArg),
+                      passType);
   }
 }
 resizeOrCreateFrames(maxSequenceLength_);
@@ -472,15 +480,17 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
     auto scatterAgent =
         dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
     createMemoryFrameInfo(&memoryFrameLine, passType);
-    scatterAgent->setRealLayerAndOutput(
-        memoryFrameLine.rootLayer, memoryFrameLine.outArg,
-        memoryFrameLine.allIds,
-        /* idIndex */ 0, memoryFrameLine.allIds->getSize());
+    scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer,
+                                        memoryFrameLine.outArg,
+                                        memoryFrameLine.allIds,
+                                        /* idIndex */ 0,
+                                        memoryFrameLine.allIds->getSize());
     if (memoryFrameLine.is_sequence) {  // memoryConfig is sequence
       int size = memoryFrameLine.sequenceStartPositions->getSize();
       scatterAgent->setSequenceStartPositions(
           memoryFrameLine.sequenceStartPositions,
-          /* seqStartPosIndex */ 0, size);
+          /* seqStartPosIndex */ 0,
+          size);
     }
   }
 }
@@ -489,7 +499,8 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
     auto gatherAgent =
         dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
     CHECK_NOTNULL(gatherAgent);
-    gatherAgent->copyIdAndSequenceInfo(input, info_[targetInfoInlinkId_].allIds,
+    gatherAgent->copyIdAndSequenceInfo(input,
+                                       info_[targetInfoInlinkId_].allIds,
                                        info_[targetInfoInlinkId_].idIndex);
   }
@@ -504,15 +515,15 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
       auto scatterAgent =
          dynamic_cast<ScatterAgentLayer*>(inFrameLine.agents[i].get());
       scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer,
-                                          inFrameLine.outArg, info.allIds,
-                                          info.idIndex[i], idSize);
+                                          inFrameLine.outArg,
+                                          info.allIds,
+                                          info.idIndex[i],
+                                          idSize);
       if (hasSubseq) {
         // size: the length of subsequence
-        int size =
-            info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i];
-        scatterAgent->setSequenceStartPositions(info.sequenceStartPositions,
-                                                info.seqStartPosIndex[i],
-                                                size);
+        int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i];
+        scatterAgent->setSequenceStartPositions(
+            info.sequenceStartPositions, info.seqStartPosIndex[i], size);
       }
     }
@@ -547,7 +558,7 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
     if (hasSubseq) {
       for (auto& outFrameLine : outFrameLines_) {
         CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions)
-          << "In hierachical RNN, all out links should be from sequences.";
+            << "In hierarchical RNN, all out links should be from sequences.";
       }
     }
   }
@@ -573,8 +584,10 @@ void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
 }

 void RecurrentGradientMachine::forwardBackward(
-    const std::vector<Argument>& inArgs, std::vector<Argument>* outArgs,
-    PassType passType, const UpdateCallback& callback) {
+    const std::vector<Argument>& inArgs,
+    std::vector<Argument>* outArgs,
+    PassType passType,
+    const UpdateCallback& callback) {
   LOG(FATAL) << "should not use this function";
 }
@@ -729,12 +742,15 @@ void RecurrentGradientMachine::createMemoryFrameInfo(
   // copy and check scatterId
   copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize());
   // memoryFrameLine select rows in real layer one time
-  selectRowsOneTime((*memoryFrameLine).rootLayer, (*memoryFrameLine).allIds,
-                    &(*memoryFrameLine).outArg, passType);
+  selectRowsOneTime((*memoryFrameLine).rootLayer,
+                    (*memoryFrameLine).allIds,
+                    &(*memoryFrameLine).outArg,
+                    passType);
 }

 void RecurrentGradientMachine::copyScattedId(std::vector<int>& srcIds, -
IVectorPtr* dstIds, int size) { + IVectorPtr* dstIds, + int size) { int idSize = srcIds.size(); CHECK_EQ(idSize, size); IVector::resizeOrCreate(*dstIds, idSize, useGpu_); @@ -756,12 +772,12 @@ void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer, int height = realV->getHeight(); int width = realV->getWidth(); Matrix::resizeOrCreate( - arg->value, height, width, /* trans */ false, useGpu_); + arg->value, height, width, /* trans */ false, useGpu_); arg->value->zeroMem(); arg->value->selectRows(*realV, *allIds); if (passType != PASS_TEST) { - Matrix::resizeOrCreate(arg->grad, height, width, /* trans */ false, - useGpu_); + Matrix::resizeOrCreate( + arg->grad, height, width, /* trans */ false, useGpu_); arg->grad->zeroMem(); } } @@ -833,8 +849,8 @@ void RecurrentGradientMachine::generateSequence() { << "boot layer must be a sequence when is_sequence = true"; } } - NeuralNetwork::connect(memoryFrameLine.agents[0], memoryFrameLine.bootLayer, - ids.size()); + NeuralNetwork::connect( + memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size()); } // boot layer forward @@ -847,14 +863,19 @@ void RecurrentGradientMachine::generateSequence() { size_t resultNum = generator_.config.num_results_per_sample(); IVector::resizeOrCreate( generator_.outArg.ids, - generator_.config.max_num_frames() * numSequences * resultNum, false); + generator_.config.max_num_frames() * numSequences * resultNum, + false); if (resultNum > 1) { CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); - Matrix::resizeOrCreate(generator_.outArg.in, /* height */ numSequences, - /* width */ resultNum, false, /* useGpu */ false); + Matrix::resizeOrCreate(generator_.outArg.in, + /* height */ numSequences, + /* width */ resultNum, + false, + /* useGpu */ false); } ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, - numSequences + 1, /* useGpu */ false); + numSequences + 1, + /* useGpu */ false); if (getBeamSize() > 1) { beamSearch(numSequences); } else { @@ -906,7 +927,8 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { auto scatterAgent = dynamic_cast( memoryFrameLine.scatterAgents[machineCur].get()); scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], - scatterIds, memoryFrameLine.is_sequence); + scatterIds, + memoryFrameLine.is_sequence); scatterAgent->forward(PASS_TEST); NeuralNetwork::connect(memoryFrameLine.agents[machineCur], memoryFrameLine.scatterAgents[machineCur]); @@ -948,7 +970,8 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { starts[0] = 0; generator_.ids.clear(); for (size_t i = 0; i < batchSize; ++i) { - generator_.ids.insert(generator_.ids.end(), finalPaths[i].ids.begin(), + generator_.ids.insert(generator_.ids.end(), + finalPaths[i].ids.begin(), finalPaths[i].ids.end()); starts[i + 1] = generator_.ids.size(); batchMachineIdVec_.insert(batchMachineIdVec_.end(), @@ -999,8 +1022,11 @@ void RecurrentGradientMachine::forwardFrame(int machineCur) { if (useGpu_) { IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */); cpuId_->copyFrom(*ids); - Matrix::resizeOrCreate(cpuProb_, in->getHeight(), in->getWidth(), - false /* trans */, false /* useGpu */); + Matrix::resizeOrCreate(cpuProb_, + in->getHeight(), + in->getWidth(), + false /* trans */, + false /* useGpu */); cpuProb_->copyFrom(*in); IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */); cpuEos_->copyFrom(*eos); @@ -1011,7 +1037,8 @@ void RecurrentGradientMachine::forwardFrame(int machineCur) { } } -void 
RecurrentGradientMachine::singlePathExpand(Path& curPath, size_t curPathId, +void RecurrentGradientMachine::singlePathExpand(Path& curPath, + size_t curPathId, std::vector& newPaths, size_t expandWidth) { int calc_id = @@ -1037,8 +1064,8 @@ void RecurrentGradientMachine::singlePathExpand(Path& curPath, size_t curPathId, if (id == -1) break; real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob; - Path newPath(curPath, id, newLogProb, curPathId /*machineId*/, - k /*topIndex*/); + Path newPath( + curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/); if (this->beamSearchCtrlCallbacks_) { if (beamSearchCtrlCallbacks_->stopDetermineCandidates( newPath.seqId, newPath.ids, newPath.probHistory)) @@ -1104,7 +1131,8 @@ size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, } std::nth_element(newPaths.begin() + totalExpandCount, newPaths.begin() + totalExpandCount + minNewPathSize, - newPaths.end(), Path::greaterPath); + newPaths.end(), + Path::greaterPath); newPaths.resize(totalExpandCount + minNewPathSize); real minPathLogProb = @@ -1116,7 +1144,8 @@ size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, // Remove the already formed paths that are relatively short finalPaths_[seqId].erase( - std::remove_if(finalPaths_[seqId].begin(), finalPaths_[seqId].end(), + std::remove_if(finalPaths_[seqId].begin(), + finalPaths_[seqId].end(), [&](Path& p) { return p.logProb < minPathLogProb; }), finalPaths_[seqId].end()); for (auto p : finalPaths_[seqId]) { @@ -1139,7 +1168,8 @@ void RecurrentGradientMachine::fillGenOutputs() { size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size()); std::partial_sort(finalPaths_[i].begin(), finalPaths_[i].begin() + minFinalPathsSize, - finalPaths_[i].end(), Path::greaterPath); + finalPaths_[i].end(), + Path::greaterPath); finalPaths_[i].resize(minFinalPathsSize); } @@ -1154,8 +1184,8 @@ void RecurrentGradientMachine::fillGenOutputs() { for (size_t j = 0; j < finalPaths_[i].size(); ++j) { Path& path = finalPaths_[i][j]; generator_.ids.push_back(path.ids.size()); // sequence size - generator_.ids.insert(generator_.ids.end(), path.ids.begin(), - path.ids.end()); + generator_.ids.insert( + generator_.ids.end(), path.ids.begin(), path.ids.end()); generator_.ids.push_back(-1); // end of sequence probs[i * numResults + j] = path.logProb; @@ -1198,8 +1228,12 @@ void RecurrentGradientMachine::createDataOutlink( } for (size_t i = 0; i < dataArgsSize_; i++) { - dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, starts, useGpu_, - HPPL_STREAM_1, PASS_TEST); + dataArgs_[i].concat(dataArgsFrame_[i], + machineIdVec, + starts, + useGpu_, + HPPL_STREAM_1, + PASS_TEST); auto dataAgent = dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); @@ -1235,7 +1269,8 @@ void RecurrentGradientMachine::beamSearch(size_t batchSize) { auto ptr = new ScopedCallbacks(beamSearchStatistics_->onEachStepStarted, - beamSearchStatistics_->onEachStepStoped, i); + beamSearchStatistics_->onEachStepStoped, + i); statisticsBlock.reset(ptr); } if (stopBeamSearch_) break; @@ -1246,7 +1281,9 @@ void RecurrentGradientMachine::beamSearch(size_t batchSize) { std::vector*> prefixes; prefixes.resize(paths.size()); std::transform( - paths.begin(), paths.end(), prefixes.begin(), + paths.begin(), + paths.end(), + prefixes.begin(), [](const Path& p) { return const_cast*>(&p.ids); }); beamSearchCtrlCallbacks_->beamSearchCandidateAdjust( prefixes, frames_[machineCur].get(), i); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h 
b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index 6328213793ed6ca39214ec00124570ecb1ce273b..cb74a67e52f5f48d106b9fe93b1230a1675d3341 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -44,19 +44,22 @@ public: this->removeBeamSearchControlCallbacks(); } - virtual void init(const ModelConfig& config, ParamInitCallback callback, + virtual void init(const ModelConfig& config, + ParamInitCallback callback, const std::vector& parameterTypes, bool useGpu); virtual void prefetch(const std::vector& inArgs); virtual void forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType); + std::vector* outArgs, + PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); void forwardBackward(const std::vector& inArgs, - std::vector* outArgs, PassType passType, + std::vector* outArgs, + PassType passType, const UpdateCallback& callback); virtual void resetState() {} @@ -81,8 +84,8 @@ public: * beam search, so that user can customize different operations in different * beam search iterations. */ - typedef std::function*>&, - NeuralNetwork*, const int)> + typedef std::function*>&, NeuralNetwork*, const int)> BeamSearchCandidatesAdjustCallback; /** @@ -99,8 +102,9 @@ public: * * Return true if this prefix or candidate is expected to be dropped. */ - typedef std::function&, - const std::vector&)> DropCallback; + typedef std::function&, const std::vector&)> + DropCallback; /** * @brief NormOrDropNodeCallback @@ -115,8 +119,9 @@ public: * * The fourth parameter is the probability of the whole path. */ - typedef std::function&, - std::vector&, real*)> NormOrDropNodeCallback; + typedef std::function&, std::vector&, real*)> + NormOrDropNodeCallback; /** * @brief Register beam search control callbacks. Used for prediction. @@ -346,7 +351,8 @@ protected: * If hasSubseq, will also create scattered sequenceStartPositions infomation * for all realLayer of inFrameLines one time. */ - void createInFrameInfo(int inlinks_id, const Argument& input, + void createInFrameInfo(int inlinks_id, + const Argument& input, PassType passType); void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine, @@ -354,8 +360,10 @@ protected: void copyScattedId(std::vector& srcIds, IVectorPtr* dstIds, int size); - void selectRowsOneTime(LayerPtr layer, const IVectorPtr& allIds, - Argument* arg, PassType passType); + void selectRowsOneTime(LayerPtr layer, + const IVectorPtr& allIds, + Argument* arg, + PassType passType); void createSeqPos(const std::vector& sequenceStartPosition, ICpuGpuVectorPtr* sequenceStartPositions); @@ -459,7 +467,8 @@ private: * @param totalExpandCount : number of already shrinked paths in newPaths * @return size of retained paths at the end of a beam search iteration */ - size_t beamShrink(std::vector& newPaths, size_t seqId, + size_t beamShrink(std::vector& newPaths, + size_t seqId, size_t totalExpandCount); /* @@ -469,8 +478,10 @@ private: * @param curPathId : index of curPath in member newPaths * @param expandWidth : number of paths to be expanded */ - void singlePathExpand(Path& curPath, size_t curPathId, - std::vector& newPaths, size_t expandWidth); + void singlePathExpand(Path& curPath, + size_t curPathId, + std::vector& newPaths, + size_t expandWidth); /* * @brief A new beam search iteration. 
Each half-generated paths in previous
diff --git a/paddle/gserver/layers/AddtoLayer.cpp b/paddle/gserver/layers/AddtoLayer.cpp
index 083b1957f3a724370f1de0824a6ac79d74224a03..8a9aecfa19b815814a985183ee28344a6f4f9712 100644
--- a/paddle/gserver/layers/AddtoLayer.cpp
+++ b/paddle/gserver/layers/AddtoLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-
 #include "AddtoLayer.h"

 #include "paddle/utils/Logging.h"
diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h
index 0f2ca0bf19ee7dea57230042dbb13e422e8821e4..883d186f3e63f3a60789c0a4f0e05db1202f3ec8 100644
--- a/paddle/gserver/layers/AddtoLayer.h
+++ b/paddle/gserver/layers/AddtoLayer.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-
 #pragma once

 #include "Layer.h"
@@ -21,15 +20,16 @@ limitations under the License. */

 namespace paddle {

-/**
- * This layer just simply add all input layers together, then activate
- * the sum inputs. Each input of this layer should be the same size,
+/**
+ * This layer simply adds all input layers together, then activates
+ * the sum. Each input of this layer should be the same size,
  * which is also the output size of this layer.
  * \f[
  *   y=f(\sum_{i}x_i + b)
  * \f]
- * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is activation function.
- *
+ * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is
+ * activation function.
+ *
  * The config file api is addto_layer.
  */
 class AddtoLayer : public Layer {
@@ -41,20 +41,20 @@ public:

   ~AddtoLayer() {}

-  /**
-   * Intialization of AddtoLayer.
+  /**
+   * Initialization of AddtoLayer.
    */
   bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);

-  /**
+  /**
    * Forward propagation.
-   * @note There is no weight matrix for each input,
+   * @note There is no weight matrix for each input,
    *       because it is just a simple add operation.
    */
   void forward(PassType passType);

-  /**
-   * Backward propagation.
+  /**
+   * Backward propagation.
*/ void backward(const UpdateCallback& callback = nullptr); }; diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 5e07446c71ff626684894cd99305ea8dc938d00d..eb89281cb1c75cb9b0679bd40ed4cfd4e2224188 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -44,8 +44,8 @@ void AgentLayer::forward(PassType passType) { if (realOutput.ids) { output_.ids->subVecFrom(*realOutput.ids, 0, numSamples_); } else { - output_.subArgFrom(realOutput, /* offset */ 0, numSamples_, getSize(), - useGpu_); + output_.subArgFrom( + realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_); } } else { output_ = realOutput; @@ -64,9 +64,15 @@ void SequenceAgentLayer::forward(PassType passType) { int numRows = realOutput.sequenceStartPositions->getData(false)[numSamples_]; CHECK(!realOutput.ids) << "Not supported"; - output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_, - /* trans */ false, /* seqFlag */ true, - /* seqStart */ 0, /* seqSize */ numSamples_ + 1); + output_.subArgFrom(realOutput, + /* offset */ 0, + numRows, + getSize(), + useGpu_, + /* trans */ false, + /* seqFlag */ true, + /* seqStart */ 0, + /* seqSize */ numSamples_ + 1); } else { output_ = realOutput; } @@ -107,7 +113,8 @@ void GatherAgentLayer::forward(PassType passType) { for (size_t i = 0; i < realLayers_.size(); ++i) { const MatrixPtr& realV = realLayers_[i]->getOutputValue(); idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i], - /* size */ realV->getHeight(), useGpu_); + /* size */ realV->getHeight(), + useGpu_); realV->addToRows(*outV, *idsVec_[i]); } } @@ -140,8 +147,8 @@ void ScatterAgentLayer::forward(PassType passType) { int width = this->getSize(); if (realOutArg_.value || realOutArg_.ids) { - output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width, - useGpu_); + output_.subArgFrom( + realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); } else { // used in generation if (realLayer_->getOutput().ids) { IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); @@ -223,8 +230,13 @@ void SequenceScatterAgentLayer::forward(PassType passType) { if (realOutArg_.value || realOutArg_.ids) { CHECK(realOutArg_.sequenceStartPositions); - output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width, - useGpu_, /* trans */ false, /* seqFlag */ true, + output_.subArgFrom(realOutArg_, + /* offset */ idIndex_, + idSize_, + width, + useGpu_, + /* trans */ false, + /* seqFlag */ true, /* seqStart */ seqStartPosIndex_, /* seqSize */ numSequences_); } else { @@ -247,8 +259,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) { CHECK_NE(input.sequenceStartPositions.get(), output_.sequenceStartPositions.get()); - ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions, - numSequences + 1, false); + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences + 1, false); int* outStarts = output_.sequenceStartPositions->getMutableData(false); ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false); diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h index 3d7bf5583407078da4d66264e62581a59d5013ae..0186653c0f26cd2b53fc6d96d0dfad09dab6fa5b 100644 --- a/paddle/gserver/layers/AgentLayer.h +++ b/paddle/gserver/layers/AgentLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "Layer.h" @@ -82,7 +81,8 @@ public: bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); // call before addRealLayer - void copyIdAndSequenceInfo(const Argument& input, const IVectorPtr& allIds, + void copyIdAndSequenceInfo(const Argument& input, + const IVectorPtr& allIds, const std::vector& idIndex); // add one real layer, can call many times @@ -140,11 +140,12 @@ public: * * @param layer[input] realLayer * @param ids[input] row id in real layer - * @param copyId[input] whether to copy a cpu version of ids, - * false(default) in ScatterAgentLayer, and + * @param copyId[input] whether to copy a cpu version of ids, + * false(default) in ScatterAgentLayer, and * true in SequenceScatterAgentLayer. */ - void setRealLayer(LayerPtr layer, const std::vector& ids, + void setRealLayer(LayerPtr layer, + const std::vector& ids, bool copyId = false) { realLayer_ = layer; IVector::resizeOrCreate(ids_, ids.size(), useGpu_); @@ -161,8 +162,11 @@ public: // set real layer and output, [idIndex, idIndex + idSize) of *ids* // are selected row for realOutArg in realLayer - void setRealLayerAndOutput(LayerPtr layer, const Argument& outArg, - const IVectorPtr& ids, int idIndex, int idSize) { + void setRealLayerAndOutput(LayerPtr layer, + const Argument& outArg, + const IVectorPtr& ids, + int idIndex, + int idSize) { realLayer_ = layer; realOutArg_ = outArg; ids_ = ids; @@ -170,9 +174,9 @@ public: idSize_ = idSize; } - void setSequenceStartPositions( - const ICpuGpuVectorPtr& sequenceStartPositions, - int seqStartPosIndex, int numSequences) { + void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions, + int seqStartPosIndex, + int numSequences) { realOutArg_.sequenceStartPositions = sequenceStartPositions; seqStartPosIndex_ = seqStartPosIndex; numSequences_ = numSequences; diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp index 6e52217de4e637c6188bec9c48005622bb983a16..af64e15fe3ba68c62f164c45400f55fcaa937068 100644 --- a/paddle/gserver/layers/AverageLayer.cpp +++ b/paddle/gserver/layers/AverageLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "AverageLayer.h" #include "paddle/utils/Logging.h" @@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer); bool AverageLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); + SequencePoolLayer::init(layerMap, parameterMap); - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_); outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_); // average strategy @@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap, } else { LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy(); } - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); return true; } void AverageLayer::forward(PassType passType) { - Layer::forward(passType); - - // average layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); + SequencePoolLayer::forward(passType); - size_t dim = getSize(); - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - if (type_) { - CHECK(input.subSequenceStartPositions) - << "when trans_type = seq, input must hasSubseq"; - } - int64_t newBatchSize = - type_ ? input.getNumSubSequences() : input.getNumSequences(); - ICpuGpuVectorPtr startPositions = - type_ ? input.subSequenceStartPositions - : input.sequenceStartPositions; - const int* starts = startPositions->getData(false); - size_t numSequences = startPositions->getSize() - 1; - - // check - CHECK_EQ(numSequences, (size_t)newBatchSize); - CHECK_EQ(starts[numSequences], input.getBatchSize()); - CHECK_EQ(dim, input.value->getWidth()); - - resetOutput(newBatchSize, dim); - auto startsPos = startPositions->getVector(useGpu_); MatrixPtr inputValue = getInputValue(0); - getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_); - - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no sequenceStartPositions. - * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new sequenceStartPositions. - */ - if (type_) { - output_.degradeSequence(input, useGpu_); - } + getOutputValue()->sequenceAvgForward( + *inputValue, *startPositions_->getVector(useGpu_), mode_); /* add the bias-vector AFTER average operation */ if (biases_.get() != NULL) { @@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) { } void AverageLayer::backward(const UpdateCallback& callback) { - const Argument& input = getInput(0); - ICpuGpuVectorPtr startPositions = - type_ ? 
input.subSequenceStartPositions
-                                 : input.sequenceStartPositions;
-  const int* starts = startPositions->getData(false);
-
   /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
+  SequencePoolLayer::backward(callback);
+  const int* starts = startPositions_->getData(false);

   MatrixPtr grad = getInputGrad(0);
+
   if (grad) {
     size_t dim = getSize();
     real* gradientData = getInputGrad(0)->getData();
     real* gradient = getOutputGrad()->getData();
-    size_t numSequences = startPositions->getSize() - 1;
+    size_t numSequences = startPositions_->getSize() - 1;
     for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
       // TODO(Dangqingqing) optimization for GPU
       int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
@@ -133,8 +75,8 @@ void AverageLayer::backward(const UpdateCallback& callback) {
         // empty sequence
         continue;
       }
-      dataMtx_->setData(gradientData + starts[sequenceId] * dim, sequenceLength,
-                        dim);
+      dataMtx_->setData(
+          gradientData + starts[sequenceId] * dim, sequenceLength, dim);
       outMtx_->setData(gradient + sequenceId * dim);
       switch (mode_) {
         case kAverage: {
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index ae910ddefad13714a915b1b22eb28d16e2bc2b09..1edc2ace492c5b96da3255c7e93e257830789985 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-
 #pragma once

-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"

 namespace paddle {
@@ -23,20 +22,21 @@ namespace paddle {
 /**
  * A layer for "internal average" for sequence input.
  * Input: one or more sequences. Each sequence contains some instances.
- * If AverageLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
  *   Output: output size is the number of input sequences (NOT input instances)
  *   output[i] = average_{for each instance in this sequence}{input[i]}
- * If AverageLevel = kSeq:
+ * If SequenceLevel = kSeq:
  *   Check that the input sequence has sub-sequences.
  *   Output: output size is the number of input sub-sequences
  *   output[i] = average_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
  */
-
-class AverageLayer : public Layer {
+class AverageLayer : public SequencePoolLayer {
 public:
   enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
-  enum AverageLevel { kNonSeq = 0, kSeq = 1 };
-  explicit AverageLayer(const LayerConfig& config) : Layer(config) {}
+  explicit AverageLayer(const LayerConfig& config)
+      : SequencePoolLayer(config) {}

   ~AverageLayer() {}

@@ -46,11 +46,8 @@ public:
   void backward(const UpdateCallback& callback = nullptr);

 protected:
-  std::unique_ptr<Weight> biases_;
   MatrixPtr outMtx_;
   MatrixPtr dataMtx_;
   int mode_;
-  int type_;
 };
-
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index 8052b35ec69c500b9005d4ffef882ceafa3bdab8..2d5bcff29fd5ad33c8eba85fc803bbf89803782e 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "Layer.h" #include "BatchNormBaseLayer.h" diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h index 2302d1a8e0b17f4b67835e65a3453f8f6e20f721..d65882d39df2bb93920dad37ebc78342e31aef85 100644 --- a/paddle/gserver/layers/BatchNormBaseLayer.h +++ b/paddle/gserver/layers/BatchNormBaseLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Stat.h" @@ -21,14 +20,15 @@ limitations under the License. */ namespace paddle { /** - * @brief Batch normalization layer use to normalizes the input to across the batch. + * @brief Batch normalization layer used to normalize the input across the + * batch. * * By default, global mean and variance statistics are calculated via a running * average in the training period. Then the pre-calculated global mean and * variance are used for testing. * * Moving mean and variance are located in Parameter object when constructing - * and the calculation will change them. Now we only save global mean and + * and the calculation will change them. Now we only save global mean and * variance of one thread in first node for GPU. * But the calculation in CPU is different, because parameters are shared by * multiple threads. Here we use ShareCpuMatrix with a lock to calculate. We @@ -41,8 +41,7 @@ namespace paddle { class BatchNormBaseLayer : public Layer { public: - explicit BatchNormBaseLayer(const LayerConfig& config) - : Layer(config) {} + explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {} ~BatchNormBaseLayer() {} @@ -55,8 +54,8 @@ public: virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - /** - * @brief Calculate feature map size. Some input uses frameHeight and + /** + * @brief Calculate feature map size. Some input uses frameHeight and * frameWidth to store feature size */ void calFeatureMapSize(); diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp index b2921e6d40d3d5d3777fbb26fa9314aaa73f82da..e431c033117c5d405324e7440b84d0e79018b52a 100644 --- a/paddle/gserver/layers/BatchNormalizationLayer.cpp +++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #ifndef PADDLE_ONLY_CPU #include "hl_batch_transpose.h" @@ -44,8 +43,8 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) { tmpMat_->square(); savedInvVar_->zeroMem(); savedInvVar_->accumulateColSum(*tmpMat_); - savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2] - savedInvVar_->addSquare(*savedMean_, -1.0); // E[x^2] - E^2[x] + savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2] + savedInvVar_->addSquare(*savedMean_, -1.0); // E[x^2] - E^2[x] // Variance may be small negative value // because of the subtraction operation.
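Editor's note: the hunk above keeps the per-column shortcut var(x) = E[x^2] - E[x]^2, which is why the surrounding comment warns that the variance can come out slightly negative. A minimal standalone sketch of the same computation follows; it assumes row-major data and uses illustrative names (it is not part of the patch):

```cpp
#include <cmath>
#include <vector>

// Hedged sketch: per-column batch statistics the way calMeanAndStd() computes
// them, via var(x) = E[x^2] - E[x]^2. `data` has numSamples rows and dim
// columns, stored row-major.
void batchMeanAndInvStd(const std::vector<float>& data, size_t numSamples,
                        size_t dim, float epsilon,
                        std::vector<float>& mean, std::vector<float>& invStd) {
  mean.assign(dim, 0.f);
  invStd.assign(dim, 0.f);
  for (size_t i = 0; i < numSamples; ++i) {
    for (size_t j = 0; j < dim; ++j) {
      float x = data[i * dim + j];
      mean[j] += x;        // accumulates numSamples * E[x]
      invStd[j] += x * x;  // accumulates numSamples * E[x^2]
    }
  }
  for (size_t j = 0; j < dim; ++j) {
    mean[j] /= numSamples;
    float var = invStd[j] / numSamples - mean[j] * mean[j];  // E[x^2] - E^2[x]
    if (var < 0.f) var = 0.f;  // guard the "small negative value" case above
    invStd[j] = 1.f / std::sqrt(var + epsilon);
  }
}
```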
@@ -104,17 +103,23 @@ void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) { #ifdef PADDLE_ONLY_CPU LOG(FATAL) << "paddle is compiled only for cpu"; #else - batchTranspose(in->getData(), out->getData(), imgPixels_, - channels_, batchSize); + batchTranspose( + in->getData(), out->getData(), imgPixels_, channels_, batchSize); #endif } else { for (size_t i = 0; i < batchSize; i++) { const MatrixPtr inTmp = - Matrix::create(in->getData() + i * imgPixels_ * channels_, channels_, - imgPixels_, false, useGpu_); + Matrix::create(in->getData() + i * imgPixels_ * channels_, + channels_, + imgPixels_, + false, + useGpu_); MatrixPtr outTmp = Matrix::create(out->getData() + i * imgPixels_ * channels_, - imgPixels_, channels_, false, useGpu_); + imgPixels_, + channels_, + false, + useGpu_); inTmp->transpose(outTmp, false); } } @@ -135,23 +140,27 @@ void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) { #ifdef PADDLE_ONLY_CPU LOG(FATAL) << "paddle is compiled only for cpu"; #else - batchTranspose(in->getData(), out->getData(), channels_, - imgPixels_, batchSize); + batchTranspose( + in->getData(), out->getData(), channels_, imgPixels_, batchSize); #endif } else { for (size_t i = 0; i < batchSize; i++) { const MatrixPtr inTmp = - Matrix::create(in->getData() + i * channels_ * imgPixels_, imgPixels_, - channels_, false, useGpu_); + Matrix::create(in->getData() + i * channels_ * imgPixels_, + imgPixels_, + channels_, + false, + useGpu_); MatrixPtr outTmp = - Matrix::create(out->getData() + i * imgPixels_ * channels_, channels_, - imgPixels_, useGpu_); + Matrix::create(out->getData() + i * imgPixels_ * channels_, + channels_, + imgPixels_, + useGpu_); inTmp->transpose(outTmp, false); } } } - void BatchNormalizationLayer::forward(PassType passType) { Layer::forward(passType); @@ -165,12 +174,12 @@ void BatchNormalizationLayer::forward(PassType passType) { useGlobalStats_ = config_.use_global_stats(); } - Matrix::resizeOrCreate(expandedIn_, batchSize * imgPixels_, channels_, false, - useGpu_); - Matrix::resizeOrCreate(normIn_, batchSize * imgPixels_, channels_, false, - useGpu_); - Matrix::resizeOrCreate(expandedOut_, batchSize * imgPixels_, channels_, false, - useGpu_); + Matrix::resizeOrCreate( + expandedIn_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + normIn_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + expandedOut_, batchSize * imgPixels_, channels_, false, useGpu_); expandMat(getInputValue(0), expandedIn_); if (useGlobalStats_) { @@ -184,7 +193,7 @@ void BatchNormalizationLayer::forward(PassType passType) { } normIn_->assign(*expandedIn_); - normIn_->addBias(*savedMean_, -1); // subtract mean. + normIn_->addBias(*savedMean_, -1); // subtract mean. normIn_->divRowVector(*savedInvVar_); // divide std.
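// [Editorial aside, not part of the patch] The two statements above whiten the
// expanded input column by column: x_hat = (x - mean) / std. A hedged scalar
// sketch of the full batch-norm transform, where gamma/beta are illustrative
// names for the layer's weight and bias applied after this hunk:
//   real xHat = (x - mean[j]) * invStd[j];  // normalize
//   real y = gamma[j] * xHat + beta[j];     // scale and shift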
expandedOut_->assign(*normIn_); @@ -211,18 +220,18 @@ void BatchNormalizationLayer::backward(const UpdateCallback& callback) { Matrix::resizeOrCreate(meanGrad_, 1, channels_, false, useGpu_); Matrix::resizeOrCreate(stdGrad_, 1, channels_, false, useGpu_); - Matrix::resizeOrCreate(expandedInGrad_, batchSize * imgPixels_, channels_, - false, useGpu_); - Matrix::resizeOrCreate(inGrad_, batchSize, imgPixels_ * channels_, false, - useGpu_); - Matrix::resizeOrCreate(normInGrad_, batchSize * imgPixels_, channels_, false, - useGpu_); - Matrix::resizeOrCreate(expandedOutGrad_, batchSize * imgPixels_, channels_, - false, useGpu_); - Matrix::resizeOrCreate(tmpMat_, batchSize * imgPixels_, channels_, false, - useGpu_); - Matrix::resizeOrCreate(tmpGrad_, batchSize * imgPixels_, channels_, false, - useGpu_); + Matrix::resizeOrCreate( + expandedInGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + inGrad_, batchSize, imgPixels_ * channels_, false, useGpu_); + Matrix::resizeOrCreate( + normInGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + expandedOutGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + tmpMat_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + tmpGrad_, batchSize * imgPixels_, channels_, false, useGpu_); expandMat(getOutputGrad(), expandedOutGrad_); diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h index 175b9a80e63f796d272af3940705def7b9857df7..36925a5ed2d56e4a5c58525cc238164f72bef40c 100644 --- a/paddle/gserver/layers/BatchNormalizationLayer.h +++ b/paddle/gserver/layers/BatchNormalizationLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/BilinearInterpLayer.cpp b/paddle/gserver/layers/BilinearInterpLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c30e26dc031378ce792534c5eec6c24fc0d20ef9 --- /dev/null +++ b/paddle/gserver/layers/BilinearInterpLayer.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "BilinearInterpLayer.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(bilinear_interp, BilinearInterpLayer); + +size_t BilinearInterpLayer::getSize() { + inImgH_ = inputLayers_[0]->getOutput().getFrameHeight(); + inImgW_ = inputLayers_[0]->getOutput().getFrameWidth(); + + const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf(); + if (inImgH_ == 0) { + inImgH_ = conf.img_size_y(); + } + if (inImgW_ == 0) { + inImgW_ = conf.img_size_x(); + } + + outImgH_ = conf.out_size_y(); + outImgW_ = conf.out_size_x(); + numChannels_ = conf.num_channels(); + + CHECK(outImgH_ > 0 && outImgW_ > 0); + CHECK(inImgH_ > 0 && inImgW_ > 0); + CHECK(numChannels_); + + ratioH_ = + (outImgH_ > 1) ? static_cast(inImgH_ - 1) / (outImgH_ - 1) : 0.f; + ratioW_ = + (outImgW_ > 1) ? static_cast(inImgW_ - 1) / (outImgW_ - 1) : 0.f; + + getOutput().setFrameHeight(outImgH_); + getOutput().setFrameWidth(outImgW_); + return outImgH_ * outImgW_ * numChannels_; +} + +bool BilinearInterpLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(1, config_.inputs_size()); + + return true; +} + +void BilinearInterpLayer::forward(PassType passType) { + Layer::forward(passType); + + size_t batchSize = getInput(0).getBatchSize(); + size_t size = getSize(); + { + REGISTER_TIMER_INFO("FwResetTimer", getName().c_str()); + resetOutput(batchSize, size); + } + + MatrixPtr inV = getInputValue(0); + MatrixPtr outV = getOutputValue(); + { + REGISTER_TIMER_INFO("FwBilinearInterpTimer", getName().c_str()); + outV->bilinearForward(*inV, + inImgH_, + inImgW_, + outImgH_, + outImgW_, + numChannels_, + ratioH_, + ratioW_); + } +} + +void BilinearInterpLayer::backward(const UpdateCallback& callback) { + (void)callback; + + MatrixPtr inputG = getInputGrad(0); + MatrixPtr outG = getOutputGrad(); + { + REGISTER_TIMER_INFO("BwBilinearInterpTimer", getName().c_str()); + if (inputG) { + inputG->bilinearBackward(*outG, + outImgH_, + outImgW_, + inImgH_, + inImgW_, + numChannels_, + ratioH_, + ratioW_); + } + } +} +} // namespace paddle diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..eba3c054fa8e7521e83d7c8dd1d87079a52b3967 --- /dev/null +++ b/paddle/gserver/layers/BilinearInterpLayer.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief A layer for bilinear interpolation which is + * used on conv layer output. + * + * @note The config file api is bilinear_interp_layer. 
+ */ +class BilinearInterpLayer : public Layer { +protected: + size_t outImgH_, outImgW_; + size_t inImgH_, inImgW_; + real ratioH_, ratioW_; + size_t numChannels_; + +public: + explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {} + + virtual ~BilinearInterpLayer() {} + + size_t getSize(); + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index 8da159def82b0cb91bc8ffbd8f29891319fa6f35..17d77879b27be332a49eae4e476b776ec2f5c8e2 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "BlockExpandLayer.h" #include "paddle/utils/Logging.h" @@ -52,7 +51,7 @@ size_t BlockExpandLayer::getBlockNum() { if (imgSizeW_ == 0) { imgSizeW_ = blockConf.img_size_x(); } - size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_; + size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_; outputH_ = (int)tmpH < 0 ? 1 : 1 + (tmpH + strideH_ - 1) / strideH_; size_t tmpW = 2 * paddingW_ + imgSizeW_ - blockW_; outputW_ = (int)tmpW < 0 ? 1 : 1 + (tmpW + strideW_ - 1) / strideW_; @@ -73,8 +72,8 @@ void BlockExpandLayer::forward(PassType passType) { MatrixPtr input = getPrev(0)->getOutputValue(); Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_); - ICpuGpuVector::resizeOrCreate(out.sequenceStartPositions, - batchSize + 1, false); + ICpuGpuVector::resizeOrCreate( + out.sequenceStartPositions, batchSize + 1, false); IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false); int* start = out.sequenceStartPositions->getMutableData(false); int* dims = out.cpuSequenceDims->getData(); @@ -82,14 +81,29 @@ void BlockExpandLayer::forward(PassType passType) { outVTrans_->zeroMem(); /* expand each block as one row */ MatrixPtr inputTmp = - Matrix::create(input->getData() + i * input->getWidth(), 1, - input->getWidth(), false, useGpu_); - outVTrans_->convExpand(*inputTmp, imgSizeH_, imgSizeW_, channels_, blockH_, - blockW_, strideH_, strideW_, paddingH_, paddingW_, - outputH_, outputW_); + Matrix::create(input->getData() + i * input->getWidth(), + 1, + input->getWidth(), + false, + useGpu_); + outVTrans_->convExpand(*inputTmp, + imgSizeH_, + imgSizeW_, + channels_, + blockH_, + blockW_, + strideH_, + strideW_, + paddingH_, + paddingW_, + outputH_, + outputW_); MatrixPtr outVTmp = - Matrix::create(outV->getData() + i * blockNum * blockSize, blockNum, - blockSize, false, useGpu_); + Matrix::create(outV->getData() + i * blockNum * blockSize, + blockNum, + blockSize, + false, + useGpu_); outVTrans_->transpose(outVTmp, false); start[i] = i * blockNum; dims[2 * i] = outputH_; @@ -115,15 +129,32 @@ void BlockExpandLayer::backward(const UpdateCallback& callback) { for (size_t i = 0; i < batchSize; i++) { MatrixPtr gradTmp = - Matrix::create(grad->getData() + i * blockNum * blockSize, blockNum, - blockSize, false, useGpu_); + Matrix::create(grad->getData() + i * blockNum * blockSize, + blockNum, + blockSize, + false, + useGpu_); gradTmp->transpose(gradTrans, false); MatrixPtr preGradTmp = - Matrix::create(preGrad->getData() + i * preGrad->getWidth(), 1, - preGrad->getWidth(), false, useGpu_); - 
preGradTmp->convShrink(*gradTrans, imgSizeH_, imgSizeW_, channels_, blockH_, - blockW_, strideH_, strideW_, paddingH_, paddingW_, - outputH_, outputW_, 1.0, 1.0); + Matrix::create(preGrad->getData() + i * preGrad->getWidth(), + 1, + preGrad->getWidth(), + false, + useGpu_); + preGradTmp->convShrink(*gradTrans, + imgSizeH_, + imgSizeW_, + channels_, + blockH_, + blockW_, + strideH_, + strideW_, + paddingH_, + paddingW_, + outputH_, + outputW_, + 1.0, + 1.0); } } diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h index f8f81721278c6c70a2bbea5f10ab9a1b9e501b35..1496fb681acd7ca7190e43cce38c7eb347932d29 100644 --- a/paddle/gserver/layers/BlockExpandLayer.h +++ b/paddle/gserver/layers/BlockExpandLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp index d3dfbb7c80f68b8134edc15625abf58504f27017..8986741dc307ba765707d6e5817a2e376b27828e 100644 --- a/paddle/gserver/layers/CRFDecodingLayer.cpp +++ b/paddle/gserver/layers/CRFDecodingLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "CRFDecodingLayer.h" namespace paddle { @@ -46,7 +45,8 @@ void CRFDecodingLayer::forward(PassType passType) { for (size_t i = 0; i < numSequences; ++i) { crf_->decode(output.value->getData() + numClasses_ * starts[i], - output_.ids->getData() + starts[i], starts[i + 1] - starts[i]); + output_.ids->getData() + starts[i], + starts[i + 1] - starts[i]); } if (inputLayers_.size() == 2) { diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h index 005bffff6b6b803dba4c72fcbdd61cf09838f014..1914062011d3bceba2f8765fb3cfd2d29ca6d6e9 100644 --- a/paddle/gserver/layers/CRFDecodingLayer.h +++ b/paddle/gserver/layers/CRFDecodingLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp index c1dcad2b5f2a840ba06e8ef9833eee7a6e5e20cb..ed4f864ba9167129db1a3f56403940d9d7807a15 100644 --- a/paddle/gserver/layers/CRFLayer.cpp +++ b/paddle/gserver/layers/CRFLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "CRFLayer.h" namespace paddle { @@ -73,12 +72,13 @@ void CRFLayer::forward(PassType passType) { crfs_.emplace_back(numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), parameter_->getBuf(PARAMETER_GRADIENT) - ? parameter_->getBuf(PARAMETER_GRADIENT)->getData() - : nullptr); + ? 
parameter_->getBuf(PARAMETER_GRADIENT)->getData() + : nullptr); } - output_.value->getData()[i] = crfs_[i].forward( - output.value->getData() + numClasses_ * starts[i], - label.ids->getData() + starts[i], starts[i + 1] - starts[i]); + output_.value->getData()[i] = + crfs_[i].forward(output.value->getData() + numClasses_ * starts[i], + label.ids->getData() + starts[i], + starts[i + 1] - starts[i]); } if (weightLayer_) { @@ -87,7 +87,7 @@ } } -void CRFLayer::backward(const UpdateCallback &callback) { +void CRFLayer::backward(const UpdateCallback& callback) { const Argument& output = getInput(0); const Argument& label = getInput(1); const int* starts = label.sequenceStartPositions->getData(false); @@ -100,7 +100,7 @@ void CRFLayer::backward(const UpdateCallback &callback) { starts[i + 1] - starts[i]); if (weightLayer_) { real weight = getInputValue(*weightLayer_)->getElement(i, 0); - MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i+1]); + MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]); grad->mulScalar(weight); } } diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h index 58902a0d3b7e4cad67dac94be10c35ebbf83b001..21c7fc61e168cea438339db4e7abce59082fc58d 100644 --- a/paddle/gserver/layers/CRFLayer.h +++ b/paddle/gserver/layers/CRFLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -39,7 +38,7 @@ protected: ParameterPtr parameter_; std::vector<LinearChainCRF> crfs_; LayerPtr weightLayer_; // weight for each sequence - real coeff_; // weight for the layer + real coeff_; // weight for the layer }; } // namespace paddle diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp index 6b9ffc5c749fb45be567881b8e625b48e28f69b4..be5d2c8c75d6eb2381a2c1758088de0eff462200 100644 --- a/paddle/gserver/layers/CTCLayer.cpp +++ b/paddle/gserver/layers/CTCLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "CTCLayer.h" /* Please reference the Chapter7 in @@ -71,8 +70,7 @@ void CTCLayer::forwardImp(const Argument& softmaxSeqs, resizeOutput(numSequences, 1); std::vector out(numSequences); - const int* labelSeqsStarts = - labelSeqs.sequenceStartPositions->getData(false); + const int* labelSeqsStarts = labelSeqs.sequenceStartPositions->getData(false); const int* softmaxSeqsStarts = softmaxSeqs.sequenceStartPositions->getData(false); @@ -81,22 +79,22 @@ void CTCLayer::forwardImp(const Argument& softmaxSeqs, ctcs_.emplace_back(numClasses_, normByTimes_); } out[i] = ctcs_[i].forward( - softmaxSeqs.value->getData() + numClasses_ * softmaxSeqsStarts[i], - softmaxSeqsStarts[i + 1] - softmaxSeqsStarts[i], - labelSeqs.ids->getData() + labelSeqsStarts[i], - labelSeqsStarts[i + 1] - labelSeqsStarts[i]); + softmaxSeqs.value->getData() + numClasses_ * softmaxSeqsStarts[i], + softmaxSeqsStarts[i + 1] - softmaxSeqsStarts[i], + labelSeqs.ids->getData() + labelSeqsStarts[i], + labelSeqsStarts[i + 1] - labelSeqsStarts[i]); } output_.value->copyFrom(out.data(), numSequences); } -void CTCLayer::backward(const UpdateCallback &callback) { +void CTCLayer::backward(const UpdateCallback& callback) { (void)callback; if (useGpu_) { backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]); - const_cast(getInput(0)). - resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT); - const_cast(getInput(1)). - resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT); + const_cast(getInput(0)) + .resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT); + const_cast(getInput(1)) + .resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT); } else { backwardImp(callback, getInput(0), getInput(1)); } @@ -107,8 +105,7 @@ void CTCLayer::backwardImp(const UpdateCallback& callback, const Argument& labelSeqs) { size_t numSequences = labelSeqs.sequenceStartPositions->getSize() - 1; - const int* labelSeqsStarts = - labelSeqs.sequenceStartPositions->getData(false); + const int* labelSeqsStarts = labelSeqs.sequenceStartPositions->getData(false); const int* softmaxSeqsStarts = softmaxSeqs.sequenceStartPositions->getData(false); diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/gserver/layers/CTCLayer.h index 49a059e43e6af4194bf50fbab14f545b81f65795..18ba12583b5a22849f1ee849a3cce7249730fdaf 100644 --- a/paddle/gserver/layers/CTCLayer.h +++ b/paddle/gserver/layers/CTCLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "Layer.h" @@ -28,7 +27,8 @@ public: void forwardImp(const Argument& softmaxSeqs, const Argument& labelSeqs); virtual void backward(const UpdateCallback& callback); void backwardImp(const UpdateCallback& callback, - const Argument& softmaxSeqs, const Argument& labelSeqs); + const Argument& softmaxSeqs, + const Argument& labelSeqs); protected: size_t numClasses_; diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp index 52a7cb6f777c3a380d51c6c48e994075ff1ef5eb..910eec8bbc10ef10f5dd4e4688eef5e87c21f506 100644 --- a/paddle/gserver/layers/ConcatenateLayer.cpp +++ b/paddle/gserver/layers/ConcatenateLayer.cpp @@ -110,6 +110,8 @@ protected: std::vector<std::unique_ptr<Projection>> projections_; std::vector<Argument> projOutput_; std::vector<std::pair<size_t, size_t>> projCol_; + bool sharedBias_; + std::unique_ptr<Weight> biases_; }; REGISTER_LAYER(concat2, ConcatenateLayer2); @@ -119,7 +121,6 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap, /* Initialize the basic parent class */ if (!Layer::init(layerMap, parameterMap)) return false; - CHECK(!biasParameter_); CHECK_EQ(inputLayers_.size(), parameters_.size()); projections_.reserve(inputLayers_.size()); projCol_.reserve(inputLayers_.size()); @@ -128,8 +129,8 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap, size_t startCol = 0; size_t endCol = 0; for (size_t i = 0; i < inputLayers_.size(); i++) { - projections_.emplace_back(Projection::create(config_.inputs(i).proj_conf(), - parameters_[i], useGpu_)); + projections_.emplace_back(Projection::create( + config_.inputs(i).proj_conf(), parameters_[i], useGpu_)); endCol += projections_[i]->getOutputSize(); projCol_.push_back(std::make_pair(startCol, endCol)); @@ -137,6 +138,13 @@ } CHECK_EQ(getSize(), endCol); + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + sharedBias_ = config_.shared_biases(); + size_t psize = config_.bias_size(); + biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_)); + } + return true; } @@ -151,11 +159,22 @@ void ConcatenateLayer2::forward(PassType passType) { size_t startCol = projCol_[i].first; size_t endCol = projCol_[i].second; projOutput_[i].value = output_.value->subColMatrix(startCol, endCol); - projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol); + if (output_.grad) { + projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol); + } } - for (size_t i = 0; i != inputLayers_.size(); ++i) { - projections_[i]->forward(&getInput(i), &projOutput_[i], passType); + { + AsyncGpuBlock block; + for (size_t i = 0; i != inputLayers_.size(); ++i) { + projections_[i]->forward(&getInput(i), &projOutput_[i], passType); + } + } + + /* add the bias-vector */ + if (biases_) { + REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); + output_.value->addBias(*(biases_->getW()), 1, sharedBias_); } /* activation */ { @@ -170,6 +189,13 @@ void ConcatenateLayer2::backward(const UpdateCallback& callback) { backwardActivation(); } + AsyncGpuBlock block; + if (biases_ && biases_->getWGrad()) { + REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str()); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); + biases_->getParameterPtr()->incUpdate(callback); + } + for (size_t i = 0; i != inputLayers_.size(); ++i) { if (projections_[i]) { projections_[i]->backward(callback); diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp index
3b1498f7e986737e01115c44f964b4a7ee924095..30dbf168fb6e439048e0168af572d1f20a303e79 100644 --- a/paddle/gserver/layers/ContextProjection.cpp +++ b/paddle/gserver/layers/ContextProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "ContextProjection.h" @@ -21,7 +20,8 @@ namespace paddle { REGISTER_PROJECTION(context, ContextProjection); ContextProjection::ContextProjection(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu) + ParameterPtr parameter, + bool useGpu) : Projection(config, parameter, useGpu) { CHECK(config.has_context_start()); CHECK(config.has_context_length()); @@ -44,10 +44,13 @@ void ContextProjection::resetState() { CHECK_LE(config_.context_start() + config_.context_length(), 1) << "state is not allowed for future context"; if (config_.context_start() >= 0) return; - Matrix::resizeOrCreate(state_, -config_.context_start(), config_.input_size(), + Matrix::resizeOrCreate(state_, + -config_.context_start(), + config_.input_size(), false, // trans useGpu_); - Matrix::resizeOrCreate(state2_, -config_.context_start(), + Matrix::resizeOrCreate(state2_, + -config_.context_start(), config_.input_size(), false, // trans useGpu_); @@ -78,8 +81,7 @@ void ContextProjection::forward() { CHECK(in_->value); CHECK(in_->sequenceStartPositions); - auto startPositions = - in_->sequenceStartPositions->getVector(useGpu_); + auto startPositions = in_->sequenceStartPositions->getVector(useGpu_); int64_t inputDim = in_->value->getWidth(); int64_t dim = out_->value->getWidth(); @@ -88,9 +90,13 @@ void ContextProjection::forward() { REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str()); bool isPadding = config_.trainable_padding(); out_->value->contextProjectionForward( - in_->value, state_ ? state_ : isPadding ? weight_->getW() : nullptr, - *startPositions, config_.context_length(), config_.context_start(), - beginPad_, state_ ? true : isPadding); + in_->value, + state_ ? state_ : isPadding ? weight_->getW() : nullptr, + *startPositions, + config_.context_length(), + config_.context_start(), + beginPad_, + state_ ? true : isPadding); if (state_ && config_.context_start() < 0) { CHECK_EQ(1, in_->getNumSequences()); @@ -116,27 +122,35 @@ void ContextProjection::backward(const UpdateCallback& callback) { int64_t inputDim = in_->value->getWidth(); int64_t dim = out_->value->getWidth(); CHECK_EQ(dim, inputDim * config_.context_length()); - auto startPositions = - in_->sequenceStartPositions->getVector(useGpu_); + auto startPositions = in_->sequenceStartPositions->getVector(useGpu_); REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str()); bool isPadding = config_.trainable_padding(); if (!out_->grad->useGpu()) { out_->grad->contextProjectionBackward( - in_->grad, isPadding ? weight_->getWGrad() : nullptr, *startPositions, - config_.context_length(), config_.context_start(), beginPad_, + in_->grad, + isPadding ? 
weight_->getWGrad() : nullptr, + *startPositions, + config_.context_length(), + config_.context_start(), + beginPad_, isPadding); } else { if (in_->grad) { - out_->grad->contextProjectionBackwardData(in_->grad, *startPositions, + out_->grad->contextProjectionBackwardData(in_->grad, + *startPositions, config_.context_length(), config_.context_start()); } if (isPadding && weight_->getWGrad()) { out_->grad->contextProjectionBackwardWeight( - weight_->getWGrad(), *startPositions, config_.context_length(), - config_.context_start(), weight_->getWGrad()->getHeight(), beginPad_); + weight_->getWGrad(), + *startPositions, + config_.context_length(), + config_.context_start(), + weight_->getWGrad()->getHeight(), + beginPad_); } } diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h index 0786ee28f2eed9c73659eb2ca0d691da8d1e3e29..188dec0fb31bf468c76b9b922e0972c86e819a2d 100644 --- a/paddle/gserver/layers/ContextProjection.h +++ b/paddle/gserver/layers/ContextProjection.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Projection.h" @@ -50,7 +49,8 @@ public: * and if it is set, constructor will set learned weight, which is used to * pad output. */ - ContextProjection(const ProjectionConfig& config, ParameterPtr parameter, + ContextProjection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp index 9ed9572139dc8c2097857ed902a9f25a0af7ac7e..7637e245a38959220f0d1d52e1f705d86a7c7303 100644 --- a/paddle/gserver/layers/ConvBaseLayer.cpp +++ b/paddle/gserver/layers/ConvBaseLayer.cpp @@ -12,15 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "ConvBaseLayer.h" +#include "paddle/math/MathUtils.h" namespace paddle { bool ConvBaseLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { /* Initialize the basic parent class */ Layer::init(layerMap, parameterMap); + isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv") + ? false + : true; /* Initialize the convolutional layer parameter */ numFilters_ = config_.num_filters(); @@ -35,20 +38,19 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, filterSizeY_.push_back(conf.filter_size_y()); filterPixels_.push_back(filterSize_.back() * filterSizeY_.back()); channels_.push_back(conf.channels()); - imgSize_.push_back(conf.img_size()); - imgPixels_.push_back(imgSize_.back() * imgSize_.back()); + imgSizeH_.push_back(conf.img_size()); + imgSizeW_.push_back(conf.img_size()); groups_.push_back(conf.groups()); filterChannels_.push_back(conf.filter_channels()); - outputX_.push_back(conf.output_x()); - outputs_.push_back(outputX_.back() * outputX_.back()); + outputH_.push_back(conf.output_x()); + outputW_.push_back(conf.output_x()); } - /* initialize the weightList */ CHECK(inputLayers_.size() == parameters_.size()); for (size_t i = 0; i < inputLayers_.size(); i++) { size_t height, width; height = filterPixels_[i] * filterChannels_[i]; - width = numFilters_; + width = (!isDeconv_) ? 
numFilters_ : channels_[i]; // create a new weight CHECK_EQ(parameters_[i]->getSize(), width * height); @@ -57,7 +59,7 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, } /* initialize the biases_ */ - if (biasParameter_.get() != NULL) { + if (biasParameter_.get()) { if (sharedBiases_) { CHECK_EQ((size_t)numFilters_, biasParameter_->getSize()); biases_ = @@ -74,4 +76,51 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, return true; } +size_t ConvBaseLayer::calOutputSize() { + auto clearAndReserve = [this](IntV* vec) { + vec->clear(); + vec->reserve(this->inputLayers_.size()); + }; + clearAndReserve(&imgSizeH_); + clearAndReserve(&imgSizeW_); + clearAndReserve(&outputH_); + clearAndReserve(&outputW_); + size_t layerSize = 0; + + auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) { + for (size_t i = 0; i < inputLayers_.size(); i++) { + inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); + inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); + if (isDeconv_) { + if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().output_x(); + if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().output_x(); + outH.push_back(imageSize( + inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); + outW.push_back(imageSize( + inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_)); + } else { + if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().img_size(); + if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().img_size(); + outH.push_back(outputSize( + inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); + outW.push_back(outputSize( + inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_)); + } + CHECK_EQ(outH[i], outH[0]); + CHECK_EQ(outW[i], outW[0]); + } + getOutput().setFrameHeight(outH[0]); + getOutput().setFrameWidth(outW[0]); + layerSize = outH[0] * outW[0] * size_t(numFilters_); + }; + + if (isDeconv_) { + setLayerSize(outputH_, outputW_, imgSizeH_, imgSizeW_); + } else { + setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_); + } + + return layerSize; +} + } // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h index eaeaebf43be252a3a90d7fd45f41de09c3ef5c81..85f57dbe0b7c9683ba0941ea0edc611f683cf1b4 100644 --- a/paddle/gserver/layers/ConvBaseLayer.h +++ b/paddle/gserver/layers/ConvBaseLayer.h @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" +#include "paddle/math/MathUtils.h" namespace paddle { /** @@ -27,6 +27,9 @@ class ConvBaseLayer : public Layer { protected: typedef std::vector IntV; + /// True if it's deconv layer, false if it's convolution layer + bool isDeconv_; + /// The number of filters. int numFilters_; /// The x dimension of the padding. @@ -43,19 +46,18 @@ protected: IntV filterSizeY_; /// The spatial dimensions of the convolution input. IntV channels_; - /// The spatial dimensions of input feature map. - IntV imgSize_; - /// The total pixel size of input feature map. - /// imgPixels_ = imgSizeX_ * imgSizeY_. - IntV imgPixels_; + /// The spatial dimensions of input feature map height. + IntV imgSizeH_; + /// The spatial dimensions of input feature map width. + IntV imgSizeW_; /// filterPixels_ = filterSizeX_ * filterSizeY_. IntV filterPixels_; /// filterChannels_ = channels_/groups_. IntV filterChannels_; - /// The spatial dimensions of output feature map. 
- IntV outputX_; - /// The spatial dimensions of output feature map. - IntV outputs_; + /// The spatial dimensions of output feature map height. + IntV outputH_; + /// The spatial dimensions of output feature map width. + IntV outputW_; /// Group size, refer to grouped convolution in /// Alex Krizhevsky's paper: when group=2, the first half of the /// filters are only connected to the first half of the input channels, @@ -80,32 +82,14 @@ public: virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - Weight& getWeight(int idx) { return *weights_[idx]; } - /** - * Calculate output size based on caffeMode_. - * - input(+padding): 0123456789 - * - imageSize(+padding) = 10; - * - filterSize = 3; - * - stride = 2; - * - caffeMode_ is true: - - output: (012), (234), (456), (678) - - outputSize = 4; - * - caffeMode_ is false: - * - output: (012), (234), (456), (678), (9) - * - outputSize = 5; + * imgSizeH_ and imgSizeW_ will be set according to the previous input layers + * in this function. Then it will calculate outputH_ and outputW_ and set them + * into output argument. */ - int outputSize(int imageSize, int filterSize, int padding, int stride) { - int outputSize; - if (!caffeMode_) { - outputSize = - (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1; - } else { - outputSize = (imageSize - filterSize + 2 * padding) / stride + 1; - } - CHECK_GE(outputSize, 1); - return outputSize; - } + virtual size_t calOutputSize(); + + Weight& getWeight(int idx) { return *weights_[idx]; } }; } // namespace paddle diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp index 8c72c1778451dfddbaa740921cd08cf73fe56785..9b8e18b1ba2a4502bcdcecade94ec3e29730595c 100644 --- a/paddle/gserver/layers/ConvOperator.cpp +++ b/paddle/gserver/layers/ConvOperator.cpp @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/math/Matrix.h" +#include "paddle/math/MathUtils.h" #include "Operator.h" namespace paddle { @@ -35,8 +35,8 @@ public: */ virtual ~ConvOperator() { if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpace_); - workSpaceInBytes_ = 0; + hl_free_mem_device(workSpace_); + workSpaceInBytes_ = 0; } hl_destroy_tensor_descriptor(inputDesc_); @@ -83,33 +83,6 @@ private: filterSize_ * filterSizeY_ * channels_ * numFilters_); } - /** - * Calculate output size. - */ - int outputSize(int imageSize, int filterSize, int padding, int stride) { - int outputSize; - if (!caffeMode_) { - /* input(+padding): 0123456789 - * imageSize(+padding) = 10; - * filterSize = 3; - * stride = 2; - * output: (012), (234), (456), (678), (9) - * outputSize = 5; - */ - outputSize = - (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1; - } else { - /* input(+padding): 0123456789 - * imageSize(+padding) = 10; - * filterSize = 3; - * stride = 2; - * output: (012), (234), (456), (678) - * outputSize = 4; - */ - outputSize = (imageSize - filterSize + 2 * padding) / stride + 1; - } - return outputSize; - } /// Most of member variables are same with CudnnConvLayer. /// There is no explanation here. 
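// [Editorial aside, not part of the patch] The outputSize() helper deleted
// above is now shared via paddle/math/MathUtils.h. A hedged restatement of
// the formula it implements:
//   caffeMode:  out = (imageSize - filterSize + 2 * padding) / stride + 1
//   otherwise:  out = (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1
// For imageSize = 10, filterSize = 3, padding = 0, stride = 2 this yields
// out = 4 in caffeMode and out = 5 otherwise, matching the worked example in
// the removed comment.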
int imageH_, imageW_, outputH_, outputW_; @@ -129,7 +102,7 @@ private: int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_; size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_; size_t workSpaceInBytes_; - void* workSpace_; + void *workSpace_; bool isSelectAlgo_; }; @@ -160,7 +133,7 @@ ConvOperator::ConvOperator(const OperatorConfig &config, bool useGpu) void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) { if (maxWorkSpace > workSpaceInBytes_) { if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpace_); + hl_free_mem_device(workSpace_); } // total amount of storage needed workSpace_ = hl_malloc_device(maxWorkSpace); @@ -168,14 +141,13 @@ void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) { } } - void ConvOperator::reshape(int batchSize) { imageH_ = ins_[0]->getFrameHeight(); imageW_ = ins_[0]->getFrameWidth(); if (imageH_ == 0) imageH_ = imgSize_; if (imageW_ == 0) imageW_ = imgSize_; - outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_); - outputW_ = outputSize(imageW_, filterSize_, padding_, stride_); + outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); + outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); out_->setFrameHeight(outputH_); out_->setFrameWidth(outputW_); @@ -183,10 +155,16 @@ void ConvOperator::reshape(int batchSize) { reshapeImageDescriptors(); if (!isSelectAlgo_) { - hl_conv_workspace(inputDesc_, outputDesc_, filterDesc_, - convDesc_, &fwdAlgo_, &fwdLimitBytes_, - &bwdDataAlgo_, &bwdDataLimitBytes_, - &bwdFilterAlgo_, &bwdFilterLimitBytes_); + hl_conv_workspace(inputDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); size_t maxWorkSpace = 0; maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); @@ -199,25 +177,48 @@ void ConvOperator::reshape(int batchSize) { } void ConvOperator::computeConvSizes() { - hl_create_filter_descriptor(&filterDesc_, channels_, numFilters_, - filterSizeY_, filterSize_); + hl_create_filter_descriptor( + &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); hl_create_tensor_descriptor(&inputDesc_); - int outputX = outputSize(imgSize_, filterSize_, padding_, stride_); + int outputX = + outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_); CHECK_EQ(outputX, outputX_); hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_, - paddingY_, padding_, strideY_, stride_); + hl_create_convolution_descriptor(&convDesc_, + inputDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); } void ConvOperator::reshapeImageDescriptors() { - hl_tensor_reshape(inputDesc_, 1, channels_, imageH_, imageW_, - channels_ * imageH_ * imageW_, imageH_ * imageW_, - imageW_, 1); - hl_tensor_reshape(outputDesc_, 1, numFilters_, outputH_, outputW_, - numFilters_ * outputH_ * outputW_, outputH_ * outputW_, - outputW_, 1); - hl_reset_convolution_descriptor(convDesc_, inputDesc_, filterDesc_, - paddingY_, padding_, strideY_, stride_); + hl_tensor_reshape(inputDesc_, + 1, + channels_, + imageH_, + imageW_, + channels_ * imageH_ * imageW_, + imageH_ * imageW_, + imageW_, + 1); + hl_tensor_reshape(outputDesc_, + 1, + numFilters_, + outputH_, + outputW_, + numFilters_ * outputH_ * outputW_, + outputH_ * outputW_, + outputW_, + 1); + hl_reset_convolution_descriptor(convDesc_, + inputDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); inputOffset_ = channels_ * 
imageH_ * imageW_; outputOffset_ = numFilters_ * outputH_ * outputW_; weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSize_; @@ -247,17 +248,27 @@ void ConvOperator::forward() { reshape(batchSize); CHECK_EQ(ins_[1]->value->getHeight(), batchSize); checkFilterSize(ins_[1]->value); - Matrix::resizeOrCreate(out_->value, batchSize, - outputH_ * outputW_ * numFilters_, false, useGpu_); + Matrix::resizeOrCreate(out_->value, + batchSize, + outputH_ * outputW_ * numFilters_, + false, + useGpu_); { AsyncGpuBlock block; for (size_t batchId = 0; batchId < batchSize; ++batchId) { real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; real *outData = out_->value->getData() + outputOffset_ * batchId; - hl_convolution_forward(inputDesc_, inputData, outputDesc_, outData, - filterDesc_, wgtData, convDesc_, workSpace_, - workSpaceInBytes_, fwdAlgo_); + hl_convolution_forward(inputDesc_, + inputData, + outputDesc_, + outData, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + fwdAlgo_); } } } @@ -271,20 +282,32 @@ void ConvOperator::backward() { if (ins_[1]->grad) { real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; - hl_convolution_backward_filter(inputDesc_, inputData, outputDesc_, - outGrad, filterDesc_, weightGrad, - convDesc_, workSpace_, - workSpaceInBytes_, bwdFilterAlgo_); + hl_convolution_backward_filter(inputDesc_, + inputData, + outputDesc_, + outGrad, + filterDesc_, + weightGrad, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdFilterAlgo_); } MatrixPtr preGrad = ins_[0]->grad; if (NULL != preGrad) { real *inputGrad = preGrad->getData() + inputOffset_ * batchId; real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - hl_convolution_backward_data(inputDesc_, inputGrad, outputDesc_, - outGrad, filterDesc_, wgtData, - convDesc_, workSpace_, - workSpaceInBytes_, bwdDataAlgo_); + hl_convolution_backward_data(inputDesc_, + inputGrad, + outputDesc_, + outGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdDataAlgo_); } } } diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4ab0a1dc84164114df080bc1ae06905b15a3ff86 --- /dev/null +++ b/paddle/gserver/layers/ConvProjection.cpp @@ -0,0 +1,254 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/utils/Stat.h" +#include "ConvProjection.h" + +namespace paddle { + +REGISTER_PROJECTION(conv, ConvProjection); + +ThreadLocalD> ConvProjection::convMem_; + +ConvProjection::ConvProjection(const ProjectionConfig &config, + ParameterPtr parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK(useGpu); // only support GPU + getConvParams(); + initCudnn(); + + size_t height = filterH_ * filterW_ * channels_ / groups_; + size_t width = numFilters_; + weight_.reset(new Weight(height, width, parameter)); + weightOffset_ = height * width / groups_; +} + +void ConvProjection::getConvParams() { + const ConvConfig &conf = config_.conv_conf(); + paddingH_ = conf.padding_y(); + paddingW_ = conf.padding(); + + strideH_ = conf.stride_y(); + strideW_ = conf.stride(); + + filterH_ = conf.filter_size_y(); + filterW_ = conf.filter_size(); + + configImgH_ = conf.img_size(); + configImgW_ = conf.img_size(); + + channels_ = conf.channels(); + numFilters_ = config_.num_filters(); + + groups_ = conf.groups(); + CHECK_EQ(channels_ % groups_, 0); + CHECK_EQ(numFilters_ % groups_, 0); +} + +void ConvProjection::initCudnn() { + hl_create_filter_descriptor( + &filterDesc_, channels_, numFilters_, filterH_, filterW_); + hl_create_tensor_descriptor(&inputDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, + inputDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; + + batchNum_ = 0; + isSelectAlgo_ = false; +} + +void ConvProjection::reshapeTensorDesc(int batchSize) { + hl_tensor_reshape(inputDesc_, + batchSize, + channels_, + imageH_, + imageW_, + channels_ * imageH_ * imageW_, + imageH_ * imageW_, + imageW_, + 1); + hl_reset_convolution_descriptor(convDesc_, + inputDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_); + + // The stride between two consecutive images in ConvProjection may not be 1, + // for example, in the case of layer ConcatenateLayer2 with two + // ConvProjection, the stride is the output_size of layer ConcatenateLayer2. + // So the calculation of nStride is different from CudnnConvLayer. + // In fact, only "nStride = out_->value->getStride()" is ok. 
+ size_t nStride = numFilters_ * outputH_ * outputW_; + if (out_->value->isContiguous()) { + CHECK_EQ(nStride, out_->value->getWidth()); + } else { + nStride = out_->value->getStride(); + } + + hl_tensor_reshape(outputDesc_, + batchSize, + numFilters_, + outputH_, + outputW_, + nStride, + outputH_ * outputW_, + outputW_, + 1); +} + +void ConvProjection::reshape(int batchSize) { + size_t width = calOutputSize(); + CHECK_EQ(width, out_->value->getWidth()); + + isSelectAlgo_ = (batchSize == batchNum_); + batchNum_ = batchSize; + + if (!isSelectAlgo_) { + reshapeTensorDesc(batchSize); + hl_conv_workspace(inputDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + workSpaceInBytes_ = maxWorkSpace; + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ + << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; + } + + isSelectAlgo_ = true; +} + +void ConvProjection::forward() { + int batchSize = in_->value->getHeight(); + reshape(batchSize); + + void *workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str()); + + real *inputData = in_->value->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + real *outData = out_->value->getData() + g * outputOffset_; + hl_convolution_forward(inputDesc_, + inputData, + outputDesc_, + outData, + filterDesc_, + wgtData, + convDesc_, + workSpace, + fwdLimitBytes_, + fwdAlgo_); + } +} + +void ConvProjection::backward(const UpdateCallback &callback) { + REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); + + void *workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + real *outGrad = out_->grad->getData() + g * outputOffset_; + if (weight_->getWGrad()) { + real *inputData = in_->value->getData() + g * inputOffset_; + real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; + hl_convolution_backward_filter(inputDesc_, + inputData, + outputDesc_, + outGrad, + filterDesc_, + weightGrad, + convDesc_, + workSpace, + bwdFilterLimitBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = in_->grad; + if (NULL != preGrad) { + real *inputGrad = preGrad->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + hl_convolution_backward_data(inputDesc_, + inputGrad, + outputDesc_, + outGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace, + bwdDataLimitBytes_, + bwdDataAlgo_); + } + } + + weight_->getParameterPtr()->incUpdate(callback); +} + +void *ConvProjection::getSpaceBytes(size_t size) { + std::vector<MemoryHandle*> &convMem = *convMem_; + if (convMem.empty()) { + int numDevices = hl_get_device_count(); + convMem.resize(numDevices); + } + + int devId = hl_get_device(); + MemoryHandle **localMem = &(convMem[devId]); + if (NULL == *localMem || size > (*localMem)->getAllocSize()) { + *localMem = new GpuMemoryHandle(size); + } + return (*localMem)->getBuf(); +} + +ConvProjection::~ConvProjection() { + hl_destroy_tensor_descriptor(inputDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + 
hl_destroy_convolution_descriptor(convDesc_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..779fe1455ade10ba55e32f4d9478d446b01b8a19 --- /dev/null +++ b/paddle/gserver/layers/ConvProjection.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Projection.h" +#include "paddle/math/MathUtils.h" + +namespace paddle { + +/** + * @brief Convolution projection does the same calculation as CudnnConvLayer. + */ +class ConvProjection : public Projection { +public: + /** + * Constructor. + */ + ConvProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + ~ConvProjection(); + + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + +protected: + void getConvParams(); + void initCudnn(); + + void reshapeTensorDesc(int batchSize); + void reshape(int batchSize); + + size_t calOutputSize() { + imageH_ = in_->getFrameHeight(); + imageW_ = in_->getFrameWidth(); + if (imageH_ == 0) imageH_ = configImgH_; + if (imageW_ == 0) imageW_ = configImgW_; + outputH_ = outputSize(imageH_, + filterH_, + paddingH_, + strideH_, + /* caffeMode */ true); + outputW_ = outputSize(imageW_, + filterW_, + paddingW_, + strideW_, + /* caffeMode */ true); + + const_cast<Argument*>(out_)->setFrameHeight(outputH_); + const_cast<Argument*>(out_)->setFrameWidth(outputW_); + + inputOffset_ = (channels_ / groups_) * imageH_ * imageW_; + outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_; + return outputH_ * outputW_ * numFilters_; + } + + static void* getSpaceBytes(size_t size); + + /// imageH_ and imageW_ is calculated from the input layer. + int imageH_, imageW_; + /// configImgH_ and configImgW_ is obtained from config. + int configImgH_, configImgW_; + int outputH_, outputW_; + int channels_, numFilters_; + int paddingH_, paddingW_; + int strideH_, strideW_; + int filterH_, filterW_; + /// One group offset of input data. + int inputOffset_; + /// One group offset of output data. + int outputOffset_; + /// One group offset of weight. + int weightOffset_; + int groups_; + + /// Cudnn tensor descriptor for input. + hl_tensor_descriptor inputDesc_; + /// Cudnn tensor descriptor for output. + hl_tensor_descriptor outputDesc_; + /// Cudnn tensor descriptor for filter. + hl_filter_descriptor filterDesc_; + /// Cudnn tensor descriptor for a convolution operation. + hl_convolution_descriptor convDesc_; + + /// Record the algorithm for forward convolution, which is obtained by cudnn + /// api to search the best suited algorithm. + int fwdAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// filter coefficients. + int bwdFilterAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// the output.
+ int bwdDataAlgo_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// forward convolution with the specified algo. + size_t fwdLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardData with the specified algo. + size_t bwdDataLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardFilter with the specified algo. + size_t bwdFilterLimitBytes_; + /// Size of total work space. + size_t workSpaceInBytes_; + + /// Whether to call cuDNN api to choose conv algorithm. + bool isSelectAlgo_; + /// batchNum is used to record batch size. If the batch size is changed, + /// the selection algorithm will be called. + int batchNum_; + bool bias_; + + std::unique_ptr<Weight> weight_; + static ThreadLocalD<std::vector<MemoryHandle*>> convMem_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp index 6b3881e3cc80396bfa0b801ba296cb1118fabc74..6e77c1f14e6a6896f6ef7c4042954b25bd58266a 100644 --- a/paddle/gserver/layers/ConvShiftLayer.cpp +++ b/paddle/gserver/layers/ConvShiftLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp index a81cf939af671f3fb34fb52ae33035a7bb524aed..7e1fef8bc600329ac62002dab7b91238b83b8023 100644 --- a/paddle/gserver/layers/ConvexCombinationLayer.cpp +++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -70,12 +69,21 @@ bool ConvexCombinationLayer::init(const LayerMap& layerMap, CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize()) << "Dimension mismatch"; - tmpRow0 = Matrix::create(nullptr, /* height= */ 1, weightDim, - /* trans= */ false, useGpu_); - tmpRow1 = Matrix::create(nullptr, /* height= */ 1, dataDim, - /* trans= */ false, useGpu_); - tmpMtx0 = Matrix::create(nullptr, /* height= */ weightDim, dataDim, - /* trans= */ false, useGpu_); + tmpRow0 = Matrix::create(nullptr, + /* height= */ 1, + weightDim, + /* trans= */ false, + useGpu_); + tmpRow1 = Matrix::create(nullptr, + /* height= */ 1, + dataDim, + /* trans= */ false, + useGpu_); + tmpMtx0 = Matrix::create(nullptr, + /* height= */ weightDim, + dataDim, + /* trans= */ false, + useGpu_); return true; } diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp index 05a70aeff5e8ff3789bca966d351bffc8efb1cb3..894cb5b0d8226cc3b4b60bac38801bf0a7ec6b6a 100644 --- a/paddle/gserver/layers/CosSimLayer.cpp +++ b/paddle/gserver/layers/CosSimLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "CosSimLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -57,9 +56,12 @@ void CosSimLayer::backward(const UpdateCallback& callback) { REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str()); MatrixPtr outG = this->getOutputGrad(); - outG->cosSimDerivative(*this->getOutputValue(), *getInputValue(0), - *getInputValue(1), *getInputGrad(0), - *getInputGrad(1), config_.cos_scale()); + outG->cosSimDerivative(*this->getOutputValue(), + *getInputValue(0), + *getInputValue(1), + *getInputGrad(0), + *getInputGrad(1), + config_.cos_scale()); } } diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h index 65eb807ab2e6f16aab5ef2a9b08d697868c743a3..bc47998c11f267a1737ff82e8aa2958f6859bf86 100644 --- a/paddle/gserver/layers/CosSimLayer.h +++ b/paddle/gserver/layers/CosSimLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -35,8 +34,7 @@ namespace paddle { */ class CosSimLayer : public Layer { public: - explicit CosSimLayer(const LayerConfig& config) - : Layer(config) {} + explicit CosSimLayer(const LayerConfig& config) : Layer(config) {} ~CosSimLayer() {} diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp index 7d251ace6fdfde2506e4890b276db5b0d08d51f5..56d177da6458a590299fee5b24b8a9c935510916 100644 --- a/paddle/gserver/layers/CosSimVecMatLayer.cpp +++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
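The CosSimLayer hunks above only reflow arguments; for reference, the quantity that Matrix::cosSim produces and cosSimDerivative backpropagates through is the scaled cosine similarity below. A self-contained sketch, where scale corresponds to config_.cos_scale() and the epsilon guard is an illustrative assumption:

#include <cassert>
#include <cmath>
#include <vector>

// cos_sim(a, b) = scale * (a . b) / (|a| * |b|)
double cosSim(const std::vector<double>& a,
              const std::vector<double>& b,
              double scale) {
  assert(a.size() == b.size());
  double dot = 0.0, normA = 0.0, normB = 0.0;
  for (size_t i = 0; i < a.size(); ++i) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  // small epsilon keeps the division stable when either vector is all-zero
  return scale * dot / (std::sqrt(normA) * std::sqrt(normB) + 1e-10);
}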
*/ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -67,19 +66,37 @@ bool CosSimVecMatLayer::init(const LayerMap& layerMap, CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch"; - tmpRow0 = Matrix::create(nullptr, /* height= */ 1, dataDim, - /* trans= */ false, useGpu_); - tmpRow1 = Matrix::create(nullptr, /* height= */ 1, dataDim, - /* trans= */ false, useGpu_); - tmpRow2 = Matrix::create(nullptr, /* height= */ numKeys, 1, - /* trans= */ false, useGpu_); - tmpRow3 = Matrix::create(nullptr, /* height= */ numKeys, 1, - /* trans= */ false, useGpu_); - - tmpMtx0 = Matrix::create(nullptr, /* height= */ numKeys, dataDim, - /* trans= */ false, useGpu_); - tmpMtx1 = Matrix::create(nullptr, /* height= */ numKeys, dataDim, - /* trans= */ false, useGpu_); + tmpRow0 = Matrix::create(nullptr, + /* height= */ 1, + dataDim, + /* trans= */ false, + useGpu_); + tmpRow1 = Matrix::create(nullptr, + /* height= */ 1, + dataDim, + /* trans= */ false, + useGpu_); + tmpRow2 = Matrix::create(nullptr, + /* height= */ numKeys, + 1, + /* trans= */ false, + useGpu_); + tmpRow3 = Matrix::create(nullptr, + /* height= */ numKeys, + 1, + /* trans= */ false, + useGpu_); + + tmpMtx0 = Matrix::create(nullptr, + /* height= */ numKeys, + dataDim, + /* trans= */ false, + useGpu_); + tmpMtx1 = Matrix::create(nullptr, + /* height= */ numKeys, + dataDim, + /* trans= */ false, + useGpu_); return true; } @@ -131,8 +148,12 @@ void CosSimVecMatLayer::backward(const UpdateCallback& callback) { tmpRow2->setData(outV->rowBuf(i)); tmpRow3->setData(outG->rowBuf(i)); - tmpRow3->cosSimDerivative(*(tmpRow2), *(tmpMtx0), *(tmpRow0), *(tmpMtx1), - *(tmpRow1), config_.cos_scale()); + tmpRow3->cosSimDerivative(*(tmpRow2), + *(tmpMtx0), + *(tmpRow0), + *(tmpMtx1), + *(tmpRow1), + config_.cos_scale()); } } else { CHECK(!inG0 || !inG1) << "Not supported"; diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index 14ff8510f7b19dc24b7b1ba603485488ddd4979d..094c36ceb1f72ff9ee2cc9fa54de0b06312948fe 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
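The CostLayer.cpp hunks that follow are mostly argument reflow, but for orientation: MultiClassCrossEntropy::forwardImp delegates to Matrix::oneHotCrossEntropy, the per-sample negative log likelihood of the labeled class. A minimal dense-CPU sketch of that computation and its gradient, assuming row-major probabilities; this is an illustration, not the actual Matrix implementation:

#include <cmath>
#include <vector>

// cost[i] = -log(output[i][label[i]])
void oneHotCrossEntropy(const std::vector<std::vector<double>>& output,
                        const std::vector<int>& labels,
                        std::vector<double>* cost) {
  cost->resize(labels.size());
  for (size_t i = 0; i < labels.size(); ++i) {
    (*cost)[i] = -std::log(output[i][labels[i]]);
  }
}

// Accumulates d cost[i] / d output[i][j]: nonzero only at j == label[i].
void oneHotCrossEntropyBp(const std::vector<std::vector<double>>& output,
                          const std::vector<int>& labels,
                          std::vector<std::vector<double>>* outputGrad) {
  for (size_t i = 0; i < labels.size(); ++i) {
    (*outputGrad)[i][labels[i]] -= 1.0 / output[i][labels[i]];
  }
}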
*/ - #include #include #include "paddle/utils/Logging.h" @@ -88,13 +87,15 @@ bool MultiClassCrossEntropy::init(const LayerMap& layerMap, return CostLayer::init(layerMap, parameterMap); } -void MultiClassCrossEntropy::forwardImp(Matrix& output, Argument& label, +void MultiClassCrossEntropy::forwardImp(Matrix& output, + Argument& label, Matrix& target) { target.oneHotCrossEntropy(output, *label.ids); } -void MultiClassCrossEntropy::backwardImp( - Matrix& output, Argument& label, Matrix& outputG) { +void MultiClassCrossEntropy::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { outputG.oneHotCrossEntropyBp(output, *label.ids); } @@ -152,17 +153,19 @@ bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap, return CostLayer::init(layerMap, parameterMap); } -void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output, Argument& label, +void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output, + Argument& label, Matrix& target) { - Matrix::resizeOrCreate(targetPerDim_, output.getHeight(), output.getWidth(), - false, useGpu_); + Matrix::resizeOrCreate( + targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); targetPerDim_->softCrossEntropy(output, *label.value); targetPerDim_->rowSum(target); } -void SoftBinaryClassCrossEntropy::backwardImp( - Matrix& output, Argument& label, Matrix& outputG) { +void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { outputG.softCrossEntropyBp(output, *label.value); } @@ -177,13 +180,15 @@ bool SumOfSquaresCostLayer::init(const LayerMap& layerMap, return CostLayer::init(layerMap, parameterMap); } -void SumOfSquaresCostLayer::forwardImp(Matrix& output, Argument& label, +void SumOfSquaresCostLayer::forwardImp(Matrix& output, + Argument& label, Matrix& target) { target.sumOfSquares(output, *label.value); } -void SumOfSquaresCostLayer::backwardImp( - Matrix& output, Argument& label, Matrix& outputG) { +void SumOfSquaresCostLayer::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { outputG.sumOfSquaresBp(output, *label.value); } @@ -219,8 +224,8 @@ void RankingCost::forward(PassType passType) { IVectorPtr idLabel = getInput(*getLabelLayer()).ids; CHECK(idLabel) << "label layer has neither value nor ids"; CHECK_EQ((size_t)batchSize, idLabel->getSize()); - Matrix::resizeOrCreate(labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, - useGpu_); + Matrix::resizeOrCreate( + labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_); labelBuf_->copyFrom(*idLabel); label = labelBuf_; } @@ -261,8 +266,8 @@ void RankingCost::backward(const UpdateCallback& callback) { label = labelBuf_; } - Matrix::resizeOrCreate(marginGrad_, label->getHeight(), 1, /* trans= */ false, - useGpu_); + Matrix::resizeOrCreate( + marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_); marginGrad_->zeroMem(); marginGrad_->logisticRegressionLossBp(*margin_, *label); if (weightLayer_) { @@ -317,15 +322,14 @@ void LambdaCost::forward(PassType passType) { real* outputData = output->getData(); real* targetData = target->getData(); - auto startPos = - getInput(*getOutputLayer()).sequenceStartPositions; + auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; const int* startPosData = startPos->getData(false); size_t batchNum = startPos->getSize() - 1; for (size_t i = 0; i < batchNum; ++i) { int beginPos = startPosData[i]; int endPos = startPosData[i + 1]; - real NDCG = calcNDCG(outputData + beginPos, scoreData + beginPos, - endPos - beginPos); + real NDCG = calcNDCG( + outputData 
+ beginPos, scoreData + beginPos, endPos - beginPos); for (int j = beginPos; j < endPos; ++j) { targetData[j] = NDCG; } @@ -336,23 +340,27 @@ void LambdaCost::backward(const UpdateCallback& callback) { (void)callback; MatrixPtr score = getInputValue(*getScoreLayer()); MatrixPtr output = getInputValue(*getOutputLayer()); - Matrix::resizeOrCreate(marginGrad_, score->getHeight(), 1, - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(marginGrad_, + score->getHeight(), + 1, + /* trans= */ false, + useGpu_); marginGrad_->zeroMem(); real* gradData = marginGrad_->getData(); real* scoreData = score->getData(); real* outputData = output->getData(); - auto startPos = - getInput(*getOutputLayer()).sequenceStartPositions; + auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; const int* startPosData = startPos->getData(false); size_t batchNum = startPos->getSize() - 1; for (size_t i = 0; i < batchNum; ++i) { int beginPos = startPosData[i]; int endPos = startPosData[i + 1]; - calcGrad(outputData + beginPos, scoreData + beginPos, gradData + beginPos, + calcGrad(outputData + beginPos, + scoreData + beginPos, + gradData + beginPos, endPos - beginPos); } @@ -361,8 +369,10 @@ void LambdaCost::backward(const UpdateCallback& callback) { void LambdaCost::onPassEnd() {} -void LambdaCost::calcGrad(const real* outputScore, const real* score, - real* gradData, int size) { +void LambdaCost::calcGrad(const real* outputScore, + const real* score, + real* gradData, + int size) { CHECK_GE(size, truncationSize_) << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; int sortSize = maxSortSize_ == -1 ? size : std::min(maxSortSize_, size); @@ -372,13 +382,16 @@ void LambdaCost::calcGrad(const real* outputScore, const real* score, scorePair_.push_back(std::make_pair(score[i], i)); } if (size <= sortSize) { - std::sort(scorePair_.begin(), scorePair_.end(), + std::sort(scorePair_.begin(), + scorePair_.end(), [](const std::pair<real, int>& a, const std::pair<real, int>& b) { return a.first > b.first; }); } else { std::partial_sort( - scorePair_.begin(), scorePair_.begin() + sortSize, scorePair_.end(), + scorePair_.begin(), + scorePair_.begin() + sortSize, + scorePair_.end(), [](const std::pair<real, int>& a, const std::pair<real, int>& b) { return a.first > b.first; }); @@ -414,7 +427,8 @@ void LambdaCost::calcGrad(const real* outputScore, const real* score, } } -real LambdaCost::calcNDCG(const real* outputScore, const real* score, +real LambdaCost::calcNDCG(const real* outputScore, + const real* score, int size) { CHECK_GE(size, truncationSize_) << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; @@ -424,7 +438,8 @@ real LambdaCost::calcNDCG(const real* outputScore, const real* score, outputScorePair_.push_back(std::make_pair(outputScore[i], i)); } std::partial_sort( - outputScorePair_.begin(), outputScorePair_.begin() + truncationSize_, + outputScorePair_.begin(), + outputScorePair_.begin() + truncationSize_, outputScorePair_.end(), [](const std::pair<real, int>& a, const std::pair<real, int>& b) { return a.first > b.first; @@ -439,8 +454,10 @@ real LambdaCost::calcNDCG(const real* outputScore, const real* score, scoreVec_.resize(size); std::copy(score, score + size, scoreVec_.begin()); real maxDCG = 0; - std::partial_sort(scoreVec_.begin(), scoreVec_.begin() + truncationSize_, - scoreVec_.end(), std::greater<real>()); + std::partial_sort(scoreVec_.begin(), + scoreVec_.begin() + truncationSize_, + scoreVec_.end(), + std::greater<real>()); for (int i = 0; i < truncationSize_; ++i) { maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2); } @@
-460,27 +477,47 @@ bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap, return CostLayer::init(layerMap, parameterMap); } -void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output, Argument& label, +void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output, + Argument& label, Matrix& target) { - if (dynamic_cast<CpuSparseMatrix*>(label.value.get()) || - dynamic_cast<GpuSparseMatrix*>(label.value.get())) { - target.multiBinaryLabelCrossEntropy(output, *label.value); + MatrixPtr value = nullptr; + if (label.ids) { + CHECK(!label.value); + value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_); } else { - Matrix::resizeOrCreate(targetPerDim_, output.getHeight(), output.getWidth(), - false, useGpu_); + CHECK(label.value); + value = label.value; + } - targetPerDim_->binaryLabelCrossEntropy(output, *label.value); + if (dynamic_cast<CpuSparseMatrix*>(value.get()) || + dynamic_cast<GpuSparseMatrix*>(value.get())) { + target.multiBinaryLabelCrossEntropy(output, *value); + } else { + Matrix::resizeOrCreate( + targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); + + targetPerDim_->binaryLabelCrossEntropy(output, *value); targetPerDim_->rowSum(target); } } -void MultiBinaryLabelCrossEntropy::backwardImp( - Matrix& output, Argument& label, Matrix& outputG) { - if (dynamic_cast<CpuSparseMatrix*>(label.value.get()) || - dynamic_cast<GpuSparseMatrix*>(label.value.get())) { - outputG.multiBinaryLabelCrossEntropyBp(output, *label.value); +void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + MatrixPtr value = nullptr; + if (label.ids) { + CHECK(!label.value); + value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_); + } else { + CHECK(label.value); + value = label.value; + } + + if (dynamic_cast<CpuSparseMatrix*>(value.get()) || + dynamic_cast<GpuSparseMatrix*>(value.get())) { + outputG.multiBinaryLabelCrossEntropyBp(output, *value); } else { - outputG.binaryLabelCrossEntropyBp(output, *label.value); + outputG.binaryLabelCrossEntropyBp(output, *value); } } @@ -501,8 +538,7 @@ bool HuberTwoClass::init(const LayerMap& layerMap, return true; } -void HuberTwoClass::forwardImp(Matrix &output, Argument &label, - Matrix &cost) { +void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) { if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { tmpCpuInput_[i].resizeAndCopyFrom( @@ -513,7 +549,8 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label, forwardImpIn(output, label, cost); } -void HuberTwoClass::forwardImpIn(Matrix& output, Argument& label, +void HuberTwoClass::forwardImpIn(Matrix& output, + Argument& label, Matrix& target) { size_t numSamples = target.getHeight(); CHECK_EQ((*label.ids).getSize(), numSamples); @@ -521,7 +558,7 @@ void HuberTwoClass::forwardImpIn(Matrix& output, Argument& label, CHECK_EQ(output.getWidth(), (size_t)1); CHECK_EQ(target.getWidth(), (size_t)1); - real* out = useGpu_ ? tmpCpuInput_[0].value->getData(): output.getData(); + real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); int* lbl = useGpu_ ?
tmpCpuInput_[1].ids->getData() : (*label.ids).getData(); std::vector<real> cost(numSamples); for (size_t i = 0; i < numSamples; ++i) { @@ -536,19 +573,21 @@ void HuberTwoClass::forwardImpIn(Matrix& output, Argument& label, target.copyFrom(cost.data(), numSamples); } -void HuberTwoClass::backwardImp(Matrix &outputValue, - Argument &label, Matrix &outputGrad) { +void HuberTwoClass::backwardImp(Matrix& outputValue, + Argument& label, + Matrix& outputGrad) { if (useGpu_) { - backwardImpIn(*tmpCpuInput_[0].value, tmpCpuInput_[1], - *tmpCpuInput_[0].grad); + backwardImpIn( + *tmpCpuInput_[0].value, tmpCpuInput_[1], *tmpCpuInput_[0].grad); outputGrad.copyFrom(*tmpCpuInput_[0].grad); } else { backwardImpIn(outputValue, label, outputGrad); } } -void HuberTwoClass::backwardImpIn( - Matrix& output, Argument& label, Matrix& outputG) { +void HuberTwoClass::backwardImpIn(Matrix& output, + Argument& label, + Matrix& outputG) { size_t numSamples = output.getHeight(); real* out = output.getData(); real* grad = outputG.getData(); @@ -562,4 +601,39 @@ void HuberTwoClass::backwardImpIn( } } +/** + * This cost layer computes the sum of its input as the loss. + * \f[ + * o(i) = \sum_{j=1}^D y_{ij} + * \f] + */ +class SumCostLayer : public Layer { +public: + explicit SumCostLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) { + bool ret = Layer::init(layerMap, parameterMap); + if (!ret) return ret; + CHECK_EQ(inputLayers_.size(), 1UL); + return true; + } + + virtual void forward(PassType passType) { + Layer::forward(passType); + const MatrixPtr& input = getInputValue(0); + + /* malloc memory for the output_ if necessary */ + int batchSize = input->getHeight(); + int size = 1; + resizeOutput(batchSize, size); + output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0); + } + + virtual void backward(const UpdateCallback& callback = nullptr) { + getInputGrad(0)->add((real)1); + } +}; + +REGISTER_LAYER(sum_cost, SumCostLayer); + } // namespace paddle diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h index b464e16737ae561dce6e7d4f16a4dd61f73204e0..120ff9bd2d1b402e8ef2d074a84b76b0183dcab0 100644 --- a/paddle/gserver/layers/CostLayer.h +++ b/paddle/gserver/layers/CostLayer.h @@ -42,10 +42,12 @@ public: virtual void backward(const UpdateCallback& callback = nullptr); - virtual void forwardImp(Matrix& outputValue, Argument& label, + virtual void forwardImp(Matrix& outputValue, + Argument& label, Matrix& cost) = 0; - virtual void backwardImp(Matrix& outputValue, Argument& label, + virtual void backwardImp(Matrix& outputValue, + Argument& label, Matrix& outputGrad) = 0; protected: @@ -129,7 +131,7 @@ protected: * This cost layer computes Euclidean (L2) loss for real-valued regression * tasks.
* \f[ - * L = \frac{1}{2N} \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2} + * L = \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2} * \f] */ class SumOfSquaresCostLayer : public CostLayer { @@ -225,7 +227,9 @@ public: void onPassEnd(); real calcNDCG(const real* outputScore, const real* score, int size); - void calcGrad(const real* outputScore, const real* score, real* gradData, + void calcGrad(const real* outputScore, + const real* score, + real* gradData, int size); private: @@ -274,6 +278,7 @@ public: */ class HuberTwoClass : public CostLayer { std::vector<Argument> tmpCpuInput_; + public: explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {} diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp index 3c6d13b0bf92ea98eb5c3331a1fdff6b177529b6..6be62b1a25407a5340bb5cdd99745db5d33ec3da 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "Layer.h" #include "CudnnBatchNormLayer.h" @@ -65,16 +64,31 @@ void CudnnBatchNormLayer::forward(PassType passType) { REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str()); real* savedMean = savedMean_->getData(); real* savedInvVar = savedInvVar_->getData(); - hl_batch_norm_forward_training(ioDesc_, input, ioDesc_, output, + hl_batch_norm_forward_training(ioDesc_, + input, + ioDesc_, + output, bnParamDesc_, - gamma, beta, 1.0 - movingAvgFraction_, - movingMean, movingVar, - EPS, savedMean, savedInvVar); + gamma, + beta, + 1.0 - movingAvgFraction_, + movingMean, + movingVar, + EPS, + savedMean, + savedInvVar); } else { // used movingMean and movingVar in testing - hl_batch_norm_forward_inference(ioDesc_, input, ioDesc_, output, - bnParamDesc_, gamma, beta, - movingMean, movingVar, EPS); + hl_batch_norm_forward_inference(ioDesc_, + input, + ioDesc_, + output, + bnParamDesc_, + gamma, + beta, + movingMean, + movingVar, + EPS); } /* activation */ { @@ -115,10 +129,19 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { create(tmpBiasGrad_, 1, channels_, &betaGrad); } - hl_batch_norm_backward(ioDesc_, input, ioDesc_, outGrad, - ioDesc_, inGrad, bnParamDesc_, - gamma, gammaGrad, betaGrad, - EPS, savedMean, savedInvVar); + hl_batch_norm_backward(ioDesc_, + input, + ioDesc_, + outGrad, + ioDesc_, + inGrad, + bnParamDesc_, + gamma, + gammaGrad, + betaGrad, + EPS, + savedMean, + savedInvVar); { REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h index 03f4f591c3bfa0139c6b10f180fbdeaa19a231b8..6220e77ceb5e248e5678c9170e85aff1cb40e1cd 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.h +++ b/paddle/gserver/layers/CudnnBatchNormLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Stat.h" @@ -23,7 +22,8 @@ namespace paddle { /** * @brief Cudnn batch normalization layer, implemented with the cuDNN library. - * @note Cudnn version must >= v4.0, and better to use the latest version (v5.1). + * @note The cuDNN version must be >= v4.0, and it is best to use the latest + * version (v5.1). * * The config file api is batch_norm_layer.
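For reference, what the hl_batch_norm_forward_training call above asks cuDNN to do per channel, sketched on the CPU. The moving-average factor matches the 1.0 - movingAvgFraction_ argument passed to it; this is an illustration, not the cuDNN implementation:

#include <cmath>
#include <vector>

struct BNChannel {
  double gamma = 1.0, beta = 0.0;        // learned scale and shift
  double movingMean = 0.0, movingVar = 1.0;
};

// y = gamma * (x - mean) / sqrt(var + eps) + beta, with the batch statistics
// folded into the moving averages that inference later reads.
void bnForwardTraining(std::vector<double>* x, BNChannel* c,
                       double movingAvgFraction, double eps) {
  const size_t n = x->size();
  double mean = 0.0, var = 0.0;
  for (double v : *x) mean += v;
  mean /= n;
  for (double v : *x) var += (v - mean) * (v - mean);
  var /= n;
  const double f = 1.0 - movingAvgFraction;  // factor handed to cuDNN above
  c->movingMean = c->movingMean * (1.0 - f) + mean * f;
  c->movingVar = c->movingVar * (1.0 - f) + var * f;
  const double invStd = 1.0 / std::sqrt(var + eps);
  for (double& v : *x) v = c->gamma * (v - mean) * invStd + c->beta;
}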
*/ diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvLayer.cpp index 0f932f960f6bacb5fc80273e5dfedf86bfb9d152..93c5565d2f401549959d6b067b05289592433a3a 100644 --- a/paddle/gserver/layers/CudnnConvLayer.cpp +++ b/paddle/gserver/layers/CudnnConvLayer.cpp @@ -22,215 +22,70 @@ REGISTER_LAYER(cudnn_conv, CudnnConvLayer); bool CudnnConvLayer::init(const LayerMap &layerMap, const ParameterMap ¶meterMap) { - ConvBaseLayer::init(layerMap, parameterMap); + if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; CHECK(useGpu_) << "CudnnConvLayer only support gpu"; - maxGroups_ = 0; - for (size_t i = 0; i < inputLayers_.size(); i++) { - CHECK_EQ(channels_[i] % groups_[i], 0); - CHECK_EQ(numFilters_ % groups_[i], 0); - - hl_filter_descriptor filter; - hl_create_filter_descriptor(&filter, channels_[i] / groups_[i], - numFilters_ / groups_[i], filterSizeY_[i], - filterSize_[i]); - filterDesc_.push_back(filter); - - hl_tensor_descriptor input; - hl_create_tensor_descriptor(&input); - inputDesc_.push_back(input); - - hl_tensor_descriptor output; - int outputX = - outputSize(imgSize_[i], filterSize_[i], padding_[i], stride_[i]); - CHECK_EQ(outputX, outputX_[i]); - hl_create_tensor_descriptor(&output); - outputDesc_.push_back(output); + CHECK_EQ(inputLayers_.size(), parameters_.size()); + projections_.reserve(inputLayers_.size()); + projConf_.reserve(inputLayers_.size()); - hl_convolution_descriptor conv; - hl_create_convolution_descriptor(&conv, input, filter, paddingY_[i], - padding_[i], strideY_[i], stride_[i]); - convDesc_.push_back(conv); - - weightOffset_.push_back((numFilters_ / groups_[i]) * - (channels_[i] / groups_[i]) * filterPixels_[i]); - inputOffset_.push_back((channels_[i] / groups_[i]) * imgSize_[i] * - imgSize_[i]); - outputOffset_.push_back((numFilters_ / groups_[i]) * outputX_[i] * - outputX_[i]); - - // initialize all to default algorithms - fwdAlgo_.push_back(0); - bwdFilterAlgo_.push_back(0); - bwdDataAlgo_.push_back(0); - fwdLimitBytes_.push_back(0); - bwdFilterLimitBytes_.push_back(0); - bwdDataLimitBytes_.push_back(0); - - // cudnn streams per group equal to 1 - if (groups_[i] > maxGroups_) { - maxGroups_ = groups_[i]; - } - } - - workSpaceInBytes_ = 0; - workSpaceData_ = NULL; - for (int i = 0; i < maxGroups_; ++i) { - workSpace_.push_back(NULL); + numFilters_ = config_.num_filters(); + CHECK(config_.shared_biases()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + ProjectionConfig *conf = new ProjectionConfig(); + conf->set_type("conv"); + conf->set_num_filters(numFilters_); + ConvConfig *convConf = conf->mutable_conv_conf(); + *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf()); + conf->set_input_size(getPrev(i)->getSize()); + conf->set_output_size(getSize()); + projConf_.emplace_back(conf); + projections_.emplace_back( + Projection::create(*projConf_[i], parameters_[i], useGpu_)); } if (biases_.get() && sharedBiases_) { hl_create_tensor_descriptor(&biasDesc_); + hl_create_tensor_descriptor(&outputDesc_); hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1); biasOffset_ = numFilters_ / groups_[0]; } - batchNum_ = 0; - isSelectAlgo_ = false; return true; } -void CudnnConvLayer::allocConvWorkSpace(size_t maxWorkSpace) { - size_t totalWorkSpace = maxWorkSpace * maxGroups_; - - if (totalWorkSpace > workSpaceInBytes_) { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpaceData_); - } - // total amount of storage needed over all groups - workSpaceData_ = hl_malloc_device(totalWorkSpace); - - // update work 
space address for each group - for (int i = 0; i < maxGroups_; ++i) { - workSpace_[i] = reinterpret_cast<char *>(workSpaceData_) - + i * maxWorkSpace; - } - workSpaceInBytes_ = totalWorkSpace; - } -} - -void CudnnConvLayer::reshape(int batchSize) { - CHECK_NE(inputLayers_.size(), 0UL); - imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); - if (imageH_ == 0) imageH_ = imgSize_[0]; - if (imageW_ == 0) imageW_ = imgSize_[0]; - - for (size_t i = 1; i < inputLayers_.size(); i++) { - int imageH = inputLayers_[i]->getOutput().getFrameHeight(); - int imageW = inputLayers_[i]->getOutput().getFrameWidth(); - if (imageH) { - CHECK_EQ(imageH_, imageH) << "Inputs must have same height."; - } - if (imageW) { - CHECK_EQ(imageW_, imageW) << "Inputs must have same width."; - } - } - - outputH_ = outputSize(imageH_, filterSizeY_[0], paddingY_[0], strideY_[0]); - outputW_ = outputSize(imageW_, filterSize_[0], padding_[0], stride_[0]); - // check outputH & outputW - getOutput().setFrameHeight(outputH_); - getOutput().setFrameWidth(outputW_); - - // if the batchSize remains the same, set isSelectAlgo_ true. - // Otherwise, set isSelectAlgo_ false and select algo again. - isSelectAlgo_ = (batchSize == batchNum_); - batchNum_ = batchSize; - - size_t maxWorkSpace = 0; - for (size_t i = 0; i < inputLayers_.size(); i++) { - CHECK_EQ(inputLayers_[i]->getOutput().value->getWidth(), - (size_t)(channels_[i] * imageH_ * imageW_)); - - hl_tensor_reshape(inputDesc_[i], batchSize, channels_[i] / groups_[i], - imageH_, imageW_, channels_[i] * imageH_ * imageW_, - imageH_ * imageW_, imageW_, 1); - - hl_tensor_reshape(outputDesc_[i], batchSize, numFilters_ / groups_[i], - outputH_, outputW_, numFilters_ * outputH_ * outputW_, - outputH_ * outputW_, outputW_, 1); - - hl_reset_convolution_descriptor(convDesc_[i], inputDesc_[i], - filterDesc_[i], paddingY_[i], - padding_[i], strideY_[i], stride_[i]); - - inputOffset_[i] = (channels_[i] / groups_[i]) * imageH_ * imageW_; - outputOffset_[i] = (numFilters_ / groups_[i]) * outputH_ * outputW_; - - if (!isSelectAlgo_) { - hl_conv_workspace(inputDesc_[i], outputDesc_[i], filterDesc_[i], - convDesc_[i], &fwdAlgo_[i], &fwdLimitBytes_[i], - &bwdDataAlgo_[i], &bwdDataLimitBytes_[i], - &bwdFilterAlgo_[i], &bwdFilterLimitBytes_[i]); - - maxWorkSpace = std::max(fwdLimitBytes_[i], bwdDataLimitBytes_[i]); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_[i]); - - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_[i] - << " / " << bwdDataAlgo_[i] - << " / " << bwdFilterAlgo_[i]; - } - } - - if (!isSelectAlgo_) { - allocConvWorkSpace(maxWorkSpace); - } - - isSelectAlgo_ = true; -} - void CudnnConvLayer::forward(PassType passType) { Layer::forward(passType); - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - reshape(batchSize); - resetOutput(batchSize, outputH_ * outputW_ * numFilters_); + + int batchSize = getInput(0).getBatchSize(); + resetOutput(batchSize, calOutputSize()); for (size_t i = 0; i != inputLayers_.size(); ++i) { - REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str()); - for (int g = 0; g < groups_[i]; ++g) { - real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g; - real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g; - real *outData = getOutputValue()->getData() + outputOffset_[i] * g; - hl_convolution_forward(inputDesc_[i], inputData, outputDesc_[i], - outData, filterDesc_[i], wgtData, - convDesc_[i], workSpace_[g], - fwdLimitBytes_[i],
fwdAlgo_[i]); - } + projections_[i]->forward(&getInput(i), &getOutput(), passType); } if (biases_) { REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str()); - addBiases(); - } - - forwardActivation(); -} - -void CudnnConvLayer::addBiases() { - if (sharedBiases_) { + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + hl_tensor_reshape(outputDesc_, + batchSize, + numFilters_ / groups_[0], + outputH_[0], + outputW_[0], + numFilters_ * outputH_[0] * outputW_[0], + outputH_[0] * outputW_[0], + outputW_[0], + 1); + outputOffset_ = getOutputValue()->getWidth() / groups_[0]; for (int g = 0; g < groups_[0]; ++g) { real *biasData = biases_->getW()->getData() + biasOffset_ * g; - real *outData = getOutputValue()->getData() + outputOffset_[0] * g; - hl_convolution_forward_add_bias(biasDesc_, biasData, - outputDesc_[0], outData); + real *outData = getOutputValue()->getData() + outputOffset_ * g; + hl_convolution_forward_add_bias( + biasDesc_, biasData, outputDesc_, outData); } - } else { - LOG(FATAL) << "Not supported"; } -} -void CudnnConvLayer::bpropBiases() { - if (sharedBiases_) { - for (int g = 0; g < groups_[0]; ++g) { - real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g; - real *outGrad = getOutputGrad()->getData() + outputOffset_[0] * g; - hl_convolution_backward_bias(biasDesc_, biasGrad, - outputDesc_[0], outGrad); - } - } else { - LOG(FATAL) << "Not supported"; - } + forwardActivation(); } void CudnnConvLayer::backward(const UpdateCallback &callback) { @@ -238,52 +93,23 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) { if (biases_ && biases_->getWGrad()) { REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str()); - bpropBiases(); + for (int g = 0; g < groups_[0]; ++g) { + real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g; + real *outGrad = getOutputGrad()->getData() + outputOffset_ * g; + hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad); + } biases_->getParameterPtr()->incUpdate(callback); } for (size_t i = 0; i != inputLayers_.size(); ++i) { - REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); - for (int g = 0; g < groups_[i]; ++g) { - real *outGrad = getOutputGrad()->getData() + outputOffset_[i] * g; - if (weights_[i]->getWGrad()) { - real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g; - real *weightGrad = - weights_[i]->getWGrad()->getData() + weightOffset_[i] * g; - hl_convolution_backward_filter( - inputDesc_[i], inputData, outputDesc_[i], outGrad, filterDesc_[i], - weightGrad, convDesc_[i], workSpace_[g], bwdFilterLimitBytes_[i], - bwdFilterAlgo_[i]); - } - - MatrixPtr preGrad = getInputGrad(i); - if (NULL != preGrad) { - real *inputGrad = preGrad->getData() + inputOffset_[i] * g; - real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g; - hl_convolution_backward_data( - inputDesc_[i], inputGrad, outputDesc_[i], outGrad, filterDesc_[i], - wgtData, convDesc_[i], workSpace_[g], bwdDataLimitBytes_[i], - bwdDataAlgo_[i]); - } - } - weights_[i]->getParameterPtr()->incUpdate(callback); + projections_[i]->backward(callback); } } CudnnConvLayer::~CudnnConvLayer() { - if (biasDesc_) { + if (biases_) { hl_destroy_tensor_descriptor(biasDesc_); - } - - for (size_t i = 0; i < inputDesc_.size(); i++) { - hl_destroy_tensor_descriptor(inputDesc_[i]); - hl_destroy_tensor_descriptor(outputDesc_[i]); - hl_destroy_filter_descriptor(filterDesc_[i]); - hl_destroy_convolution_descriptor(convDesc_[i]); - } - if (workSpaceInBytes_ != 0) { - 
hl_free_mem_device(workSpaceData_); - workSpaceInBytes_ = 0; + hl_destroy_tensor_descriptor(outputDesc_); } } diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h index a6dadba10daa49d03e4a52a9c028a87400ca23ea..6cfbadfb53839d847b8b2bcf768da0f473ac05e5 100644 --- a/paddle/gserver/layers/CudnnConvLayer.h +++ b/paddle/gserver/layers/CudnnConvLayer.h @@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "ConvBaseLayer.h" #include "paddle/math/Matrix.h" +#include "Projection.h" #include namespace paddle { /** - * @brief A subclass of ConvBaseLayer by cuDNN implementation. It only + * @brief A 2-dimension conv layer implemented by cuDNN. It only * supports GPU mode. We automatically select CudnnConvLayer for GPU * mode and ExpandConvLayer for CPU mode if you set type of "conv". * Users can also specify type of "exconv" or "cudnn_conv" for @@ -31,81 +31,21 @@ namespace paddle { * The config file api is img_conv_layer. */ class CudnnConvLayer : public ConvBaseLayer { -private: - /// resize Cudnn workspace size - void allocConvWorkSpace(size_t maxWorkSpace); - protected: - int imageH_, imageW_, outputH_, outputW_; - /// Cudnn tensor descriptor for bias. + std::vector<std::unique_ptr<ProjectionConfig>> projConf_; + std::vector<std::unique_ptr<Projection>> projections_; + hl_tensor_descriptor biasDesc_; - /// Cudnn tensor descriptor for input. - std::vector<hl_tensor_descriptor> inputDesc_; - /// Cudnn tensor descriptor for output. - std::vector<hl_tensor_descriptor> outputDesc_; - /// Cudnn tensor descriptor for filter. - std::vector<hl_filter_descriptor> filterDesc_; - /// Cudnn tensor descriptor for a convolution operation. - std::vector<hl_convolution_descriptor> convDesc_; - /// One sample offset of input data. - IntV inputOffset_; - /// One sample offset of output data. - IntV outputOffset_; - /// One group offset of weight. - IntV weightOffset_; - /// One group offset of bias. + hl_tensor_descriptor outputDesc_; int biasOffset_; - - /// Save the algorithm for forward convolution, which is obtained by cudnn - /// api to search the best suited algorithm. - std::vector<int> fwdAlgo_; - /// Save the algorithm for computing convolution gradient with respect to - /// filter coefficients. - std::vector<int> bwdFilterAlgo_; - /// Save the algorithm for computing convolution gradient with respect to - /// the output. - std::vector<int> bwdDataAlgo_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// forward convolution with the specified algo. - std::vector<size_t> fwdLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardFilter with the specified algo. - std::vector<size_t> bwdFilterLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardData with the specified algo. - std::vector<size_t> bwdDataLimitBytes_; - - /// Device work space address for each group. - std::vector<void*> workSpace_; - /// Max number of groups. - int maxGroups_; - /// Total work space address in device for all groups. - void* workSpaceData_; - /// Size of total work space. - size_t workSpaceInBytes_; - - /// Is or not select conv algorihtm. - bool isSelectAlgo_; - - /// batchNum is used to record batch size. If the batch size is changed, - /// the selection algorithm will be called. - int batchNum_; + int outputOffset_; public: explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {} ~CudnnConvLayer(); - /** - * Intialization. Initialize member variables and create tenor descriptor.
- */ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - /** - * Reshape is done each forward. Reshape tensor decriptor - * inputDesc_, outputDesc_, convDesc_. And search the faster algo - * or the fastest algo within a given memeory limit. - */ - void reshape(int batchSize); void forward(PassType passType); void backward(const UpdateCallback& callback); void addBiases(); diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp index 4c733591b3779f2502c308a965cb731466b464f0..21d8e2579f77c98da1e30a205952fa53e02fb853 100644 --- a/paddle/gserver/layers/CudnnPoolLayer.cpp +++ b/paddle/gserver/layers/CudnnPoolLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" #include "paddle/math/Matrix.h" @@ -62,9 +61,14 @@ bool CudnnPoolLayer::init(const LayerMap &layerMap, strideHeight = strideY_; strideWidth = stride_; - hl_create_pooling_descriptor(&poolingDesc_, mode_, windowHeight, - windowWidth, heightPadding, widthPadding, - strideHeight, strideWidth); + hl_create_pooling_descriptor(&poolingDesc_, + mode_, + windowHeight, + windowWidth, + heightPadding, + widthPadding, + strideHeight, + strideWidth); return true; } @@ -80,8 +84,13 @@ void CudnnPoolLayer::reshape(int batchSize) { } CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(), channels_ * imageH_ * imageW_); - outputH_ = outputSize(imageH_, sizeY_, confPaddingY_, strideY_); - outputW_ = outputSize(imageW_, sizeX_, confPadding_, stride_); + outputH_ = outputSize(imageH_, + sizeY_, + confPaddingY_, + strideY_, + /* caffeMode */ false); + outputW_ = + outputSize(imageW_, sizeX_, confPadding_, stride_, /* caffeMode */ false); getOutput().setFrameHeight(outputH_); getOutput().setFrameWidth(outputW_); @@ -99,8 +108,7 @@ void CudnnPoolLayer::forward(PassType passType) { real *inputData = getInputValue(0)->getData(); real *outData = getOutputValue()->getData(); - hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, - poolingDesc_); + hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, poolingDesc_); } void CudnnPoolLayer::backward(const UpdateCallback &callback) { @@ -113,8 +121,13 @@ void CudnnPoolLayer::backward(const UpdateCallback &callback) { real *inputGrad = getInputGrad(0)->getData(); real *outData = getOutputValue()->getData(); real *outGrad = getOutputGrad()->getData(); - hl_pooling_backward(inputDesc_, inputData, inputGrad, outputDesc_, - outData, outGrad, poolingDesc_); + hl_pooling_backward(inputDesc_, + inputData, + inputGrad, + outputDesc_, + outData, + outGrad, + poolingDesc_); } CudnnPoolLayer::~CudnnPoolLayer() { diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h index 2ef94720d2b9f13597cb0fb546726a2c2a67cb36..6a6b28db961553506bcf5db206a65e1e9d90fe94 100644 --- a/paddle/gserver/layers/CudnnPoolLayer.h +++ b/paddle/gserver/layers/CudnnPoolLayer.h @@ -12,19 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "PoolLayer.h" namespace paddle { - /** - * @brief CudnnPoolLayer is subclass of PoolLayer, which is implemented by - * cudnn api and only supports GPU. - * - * The config file api is img_pool_layer. 
- */ +/** + * @brief CudnnPoolLayer is a subclass of PoolLayer, which is implemented by + * the cudnn api and only supports GPU. + * + * The config file api is img_pool_layer. + */ class CudnnPoolLayer : public PoolLayer { protected: diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/gserver/layers/DataLayer.cpp index 79b9181e694f008d99bda170c562a524212b2c73..9a4b2e9d3e256119f3ff24cfcb80d68c81f67c65 100644 --- a/paddle/gserver/layers/DataLayer.cpp +++ b/paddle/gserver/layers/DataLayer.cpp @@ -32,19 +32,20 @@ void DataLayer::copyDataToOutput(Argument& output) { data_.value->getWidth(), useGpu(output.deviceId)); } else { - output.value->resize(data_.value->getHeight(), - data_.value->getWidth()); + output.value->resize(data_.value->getHeight(), data_.value->getWidth()); } output.value->copyFrom(*data_.value); } if (data_.grad) { - Matrix::resizeOrCreate(output.grad, data_.grad->getHeight(), + Matrix::resizeOrCreate(output.grad, + data_.grad->getHeight(), data_.grad->getWidth(), - /* trans= */ false, useGpu(output.deviceId)); + /* trans= */ false, + useGpu(output.deviceId)); } if (data_.ids) { - IVector::resizeOrCreate(output.ids, data_.ids->getSize(), - useGpu(output.deviceId)); + IVector::resizeOrCreate( + output.ids, data_.ids->getSize(), useGpu(output.deviceId)); output.ids->copyFrom(*data_.ids); } } diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/gserver/layers/DataLayer.h index 3abec1b0653a812dcb0a8d5e0a24d8ead55c1d0b..da74702201bd3af3cd73ad51ef2579da97674bc6 100644 --- a/paddle/gserver/layers/DataLayer.h +++ b/paddle/gserver/layers/DataLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -20,7 +19,7 @@ limitations under the License. */ #include "Layer.h" namespace paddle { -/** +/** * This layer just copies data to the output, and has no backward propagation. * * The config file api is data_layer. @@ -34,12 +33,10 @@ public: /** * Prefetch sparse matrix/ids only. */ - void prefetch() { - output_ = data_; - } + void prefetch() { output_ = data_; } - /** - * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims, + /** + * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims, * sequenceStartPositions, subSequenceStartPositions, strs) to output_. */ virtual void forward(PassType passType) { diff --git a/paddle/gserver/layers/DataNormLayer.cpp b/paddle/gserver/layers/DataNormLayer.cpp index 150977ce1a589cc7cc2b00a495314218ecaa772c..b398f3dbedc44eb422124a725aa745f684e821e3 100644 --- a/paddle/gserver/layers/DataNormLayer.cpp +++ b/paddle/gserver/layers/DataNormLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
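The CudnnPoolLayer::reshape hunk above now passes an explicit caffeMode flag to outputSize (false for pooling, while ConvProjection earlier in this patch passes true). A sketch of the convention behind that flag, as far as this patch shows it (hypothetical name, matching the helper declared in paddle/math/MathUtils.h): caffe mode floors the division, the legacy mode effectively takes the ceiling, so the two can differ by one output position.

// outputSize sketch: caffeMode floors, the non-caffe (legacy) mode ceils.
int outputSizeSketch(int imageSize, int filterSize, int padding, int stride,
                     bool caffeMode) {
  if (caffeMode) {
    return (imageSize - filterSize + 2 * padding) / stride + 1;
  }
  return (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
}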
*/ - #include "DataNormLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -37,16 +36,28 @@ bool DataNormLayer::init(const LayerMap& layerMap, << "The parameter of DataNormLayer must be static"; weight_ = std::unique_ptr<Weight>(new Weight(5, getSize(), parameters_[0])); - min_ = Matrix::create(nullptr, /* height= */ 1, getSize(), /* trans= */ false, - useGpu_); - rangeReciprocal_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - mean_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - stdReciprocal_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - decimalReciprocal_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + min_ = Matrix::create( + nullptr, /* height= */ 1, getSize(), /* trans= */ false, useGpu_); + rangeReciprocal_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + mean_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + stdReciprocal_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + decimalReciprocal_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); min_->setData(weight_->getW()->getData()); rangeReciprocal_->setData(weight_->getW()->getData() + getSize()); diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h index 232c73f0346a12d59fa0dc316ef510be75e6b2b1..1179d94fbbd4032c9275f0586de5b526eb21c095 100644 --- a/paddle/gserver/layers/DataNormLayer.h +++ b/paddle/gserver/layers/DataNormLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/DotMulOperator.cpp b/paddle/gserver/layers/DotMulOperator.cpp index e6d2375b474d811ce8d485ca838428dc2860b608..9409493fdaaf0e84ab2e650e2c5e3db0c1fb1fbc 100644 --- a/paddle/gserver/layers/DotMulOperator.cpp +++ b/paddle/gserver/layers/DotMulOperator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Operator.h" namespace paddle { @@ -42,8 +41,8 @@ DotMulOperator::DotMulOperator(const OperatorConfig& config, bool useGpu) } void DotMulOperator::forward() { - out_->value->addDotMul(*ins_[0]->value, *ins_[1]->value, 1, - config_.dotmul_scale()); + out_->value->addDotMul( + *ins_[0]->value, *ins_[1]->value, 1, config_.dotmul_scale()); } void DotMulOperator::backward() { diff --git a/paddle/gserver/layers/DotMulProjection.cpp b/paddle/gserver/layers/DotMulProjection.cpp index f6f14c4429e2637ae722105c164a776758e1ca11..862eeb6f01db04451afb8a91ecb2c04e0f796952 100644 --- a/paddle/gserver/layers/DotMulProjection.cpp +++ b/paddle/gserver/layers/DotMulProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
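DataNormLayer above packs five per-feature statistics into a single static 5 x size parameter and aliases them with 1 x size views rather than copying (the hunk shows min_ at row 0 and rangeReciprocal_ at row 1; the remaining rows presumably follow declaration order). A sketch of that layout, with the row names beyond the first two being an assumption:

#include <cstddef>

// Row layout of DataNormLayer's 5 x size static parameter; each statistic
// row is aliased by a 1 x size Matrix view rather than copied.
enum DataNormRow {
  kMin = 0,               // min_
  kRangeReciprocal = 1,   // rangeReciprocal_ = 1 / (max - min)
  kMean = 2,              // mean_ (assumed: declaration order)
  kStdReciprocal = 3,     // stdReciprocal_ = 1 / std (assumed)
  kDecimalReciprocal = 4  // decimalReciprocal_ (assumed)
};

inline const float* dataNormRow(const float* weightData, size_t size,
                                DataNormRow row) {
  return weightData + static_cast<size_t>(row) * size;  // row-major rows
}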
*/ - #include "Projection.h" namespace paddle { @@ -29,7 +28,8 @@ namespace paddle { class DotMulProjection : public Projection { public: DotMulProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu); + const ParameterPtr& parameter, + bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); @@ -41,7 +41,8 @@ protected: REGISTER_PROJECTION(dot_mul, DotMulProjection); DotMulProjection::DotMulProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu) + const ParameterPtr& parameter, + bool useGpu) : Projection(config, parameter, useGpu) { weight_.reset(new Weight(1LU, config.output_size(), parameter)); } diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp index 2d0778a451aae5997a4b39a7c106d96887a79a51..3a43705d263898bd407248b3d553185f7e40f798 100644 --- a/paddle/gserver/layers/EosIdCheckLayer.cpp +++ b/paddle/gserver/layers/EosIdCheckLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" @@ -20,7 +19,7 @@ namespace paddle { /** * A layer for checking EOS for each sample: * - output_id = (input_id == conf.eos_id) - * + * * The result is stored in output_.ids. * It is used by recurrent layer group. */ diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..71a69bd0d01f4f6fcd579a408008ad4e00b5fd4d --- /dev/null +++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp @@ -0,0 +1,292 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ExpandConvBaseLayer.h" + +#include "paddle/utils/Logging.h" +namespace paddle { + +bool ExpandConvBaseLayer::init(const LayerMap &layerMap, + const ParameterMap &parameterMap) { + /* Initialize the basic convolutional parent class */ + ConvBaseLayer::init(layerMap, parameterMap); + + /* The class fields channels_ and numFilters_ are the same as in the config, + * i.e., channels_ is for the input and numFilters_ is for the output. + * + * But in order for the variables in convTrans to have the same semantic + * meaning as in conv, we need to swap channels_ and numFilters_ here for + * convTrans, and in other functions too. + * */ + int channel; + int numFilters; + /* Initialize the projection */ + for (auto &inputConfig : config_.inputs()) { + const ConvConfig &conf = inputConfig.conv_conf(); + numFilters = isDeconv_ ? conf.channels() : numFilters_; + subM_.push_back(numFilters / conf.groups()); + subN_.push_back(conf.output_x() * conf.output_x()); + channel = isDeconv_ ?
numFilters_ : conf.channels(); + subK_.push_back(channel * conf.filter_size() * conf.filter_size() / + conf.groups()); + /* Consistent caffe mode for multiple input */ + caffeMode_ = conf.caffe_mode(); + } + + getOutputSize(); + + return true; +} + +size_t ExpandConvBaseLayer::getOutputSize() { + CHECK_NE(inputLayers_.size(), 0UL); + size_t layerSize = ConvBaseLayer::calOutputSize(); + subN_.clear(); + for (size_t i = 0; i < inputLayers_.size(); i++) { + subN_.push_back(outputH_[i] * outputW_[i]); + } + return layerSize; +} + +void ExpandConvBaseLayer::resetExpandInput(size_t height, size_t width) { + Matrix::resizeOrCreate(expandInput_, height, width, false, useGpu_); +} + +void ExpandConvBaseLayer::addSharedBias() { + size_t mapW = getOutputSize() / numFilters_; + size_t mapH = getOutputValue()->getElementCnt() / mapW; + MatrixPtr out = + Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_); + + Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_); + + out->transpose(transOutValue_, false); // false means no memory allocation + transOutValue_->reshape(transOutValue_->getElementCnt() / numFilters_, + numFilters_); + + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); + transOutValue_->addBias(*bias, 1.0f); + + transOutValue_->reshape(mapW, mapH); + transOutValue_->transpose(out, false); // false means no memory allocation + + out->clear(); + bias->clear(); +} + +void ExpandConvBaseLayer::addUnsharedBias() { + MatrixPtr outValue = getOutputValue(); + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); + outValue->addBias(*bias, 1.0f); +} + +void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, + size_t startIdx, + int inIdx) { + int channel = isDeconv_ ? numFilters_ : channels_[inIdx]; + + resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]); + real *imgData = image->getData() + startIdx * image->getWidth(); + MatrixPtr imageTmp = + Matrix::create(imgData, + 1, + imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel, + false, + useGpu_); + expandInput_->convExpand(*imageTmp, + imgSizeH_[inIdx], + imgSizeW_[inIdx], + channel, + filterSize_[inIdx], + filterSize_[inIdx], + stride_[inIdx], + stride_[inIdx], + padding_[inIdx], + padding_[inIdx], + outputH_[inIdx], + outputW_[inIdx]); + imageTmp->clear(); +} + +void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image, + MatrixPtr out, + int inIdx, + int startIdx) { + int subM = subM_[inIdx]; + int subN = subN_[inIdx]; + int subK = subK_[inIdx]; + + expandOneFrame(image, startIdx, inIdx); + + int numFilters = isDeconv_ ? channels_[inIdx] : numFilters_; + + real *outData = out->getData() + startIdx * subN * numFilters; + + real *wgtData = weights_[inIdx]->getW()->getData(); + real *expInData = expandInput_->getData(); + for (int g = 0; g < groups_[inIdx]; ++g) { + MatrixPtr A = + Matrix::create(wgtData, subK, subM, true, useGpu_); // mark transpose + MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_); + MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_); + C->mul(A, B, 1, 1); + + A->clear(); + B->clear(); + C->clear(); + wgtData += subK * subM; + expInData += subK * subN; + outData += subM * subN; + } +} + +void ExpandConvBaseLayer::bpropActs(MatrixPtr out, + MatrixPtr image, + int inpIdx) { + int channel = isDeconv_ ? 
numFilters_ : channels_[inpIdx]; + + int subM = subM_[inpIdx]; + int subN = subN_[inpIdx]; + int subK = subK_[inpIdx]; + size_t batchSize = image->getHeight(); + + /* reset the expand-grad memory */ + resetExpandInput(subK * groups_[inpIdx], subN); + + real *localGradData = out->getData(); + real *tgtGradData = image->getData(); + for (size_t n = 0; n < batchSize; n++) { + real *wgtData = weights_[inpIdx]->getW()->getData(); + real *expandInData = expandInput_->getData(); + + for (int g = 0; g < groups_[inpIdx]; g++) { + // create temporary matrix + MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_); + MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_); + MatrixPtr A = Matrix::create(wgtData, subK, subM, false, useGpu_); + C->mul(A, B); // mul + + // clear the temporary matrix + A->clear(); + B->clear(); + C->clear(); + + expandInData += subK * subN; + localGradData += subM * subN; + wgtData += subK * subM; + } + + // shrink one frame outGrad + MatrixPtr oneGradTmp = Matrix::create( + expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_); + MatrixPtr vTmp = + Matrix::create(tgtGradData, + 1, + imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel, + false, + useGpu_); + vTmp->convShrink(*oneGradTmp, + imgSizeH_[inpIdx], + imgSizeW_[inpIdx], + channel, + filterSize_[inpIdx], + filterSize_[inpIdx], + stride_[inpIdx], + stride_[inpIdx], + padding_[inpIdx], + padding_[inpIdx], + outputH_[inpIdx], + outputW_[inpIdx], + 1.0f, + 1.0f); + vTmp->clear(); + oneGradTmp->clear(); + + // move the data-pointer + tgtGradData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel; + } +} + +void ExpandConvBaseLayer::bpropWeights(MatrixPtr image, + MatrixPtr out, + int inpIdx) { + MatrixPtr weightGrad = weights_[inpIdx]->getWGrad(); + + int subM = subM_[inpIdx]; + int subN = subN_[inpIdx]; + int subK = subK_[inpIdx]; + size_t batchSize = image->getHeight(); + resetExpandInput(subK * groups_[inpIdx], subN); + + real *gradData = out->getData(); + + for (size_t n = 0; n < batchSize; n++) { // frame by frame + // expand + expandOneFrame(image, n, inpIdx); + real *wGradData = weightGrad->getData(); + real *expandInData = expandInput_->getData(); + + // expand-mul one-group by one + for (int g = 0; g < groups_[inpIdx]; g++) { + MatrixPtr A = Matrix::create(expandInData, subK, subN, false, useGpu_); + MatrixPtr B = Matrix::create(gradData, subM, subN, true, useGpu_); + MatrixPtr C = Matrix::create(wGradData, subK, subM, false, useGpu_); + C->mul(A, B, 1, 1); + + A->clear(); + B->clear(); + C->clear(); + gradData += subM * subN; + wGradData += subK * subM; + expandInData += subK * subN; + } + } +} + +void ExpandConvBaseLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) { + size_t mapW = getOutputSize() / numFilters_; + size_t mapH = v->getElementCnt() / mapW; + MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_); + + Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_); + + vTmp->transpose(transOutValue_, false); // false means no memory allocation + transOutValue_->reshape(transOutValue_->getElementCnt() / numFilters_, + numFilters_); + biases->collectBias(*transOutValue_, 1.0f); +} + +void ExpandConvBaseLayer::bpropBiases(MatrixPtr v) { + MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), + 1, + biases_->getWGrad()->getElementCnt(), + false, + useGpu_); + if (sharedBiases_) { + bpropSharedBias(biases, v); + } else { + biases->collectBias(*v, 1.0f); + } + biases->clear(); +} + +} // namespace paddle diff --git 
a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..5939d27e2a873308d710c1670a3aec843c3573ad --- /dev/null +++ b/paddle/gserver/layers/ExpandConvBaseLayer.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ConvBaseLayer.h" +#include "paddle/math/Matrix.h" +#include + +namespace paddle { + +/** + * @brief A subclass of ConvBaseLayer that is a superclass of both + * ExpandConvLayer and ExpandConvTransLayer + */ +class ExpandConvBaseLayer : public ConvBaseLayer { +protected: + /// For expand convolution. + /// subM_ = numFilters_ / groups_. + IntV subM_; + /// subN_ = outputH_ * outputW_. + IntV subN_; + /// subK_ = channels_ * filterPixels_ / groups_. + IntV subK_; + + /*The expandInput_ and transOutValue_ are used for the CPU expand conv + * calculation. Expand one sample at a time. shape: + * (numChannels * filterPixels_, outputSizeH * outputSizeW) + * */ + MatrixPtr expandInput_; + /// The transpose of output, which is an auxiliary matrix. + MatrixPtr transOutValue_; + +public: + explicit ExpandConvBaseLayer(const LayerConfig& config) + : ConvBaseLayer(config) {} + + ~ExpandConvBaseLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + size_t getOutputSize(); + /** + * Create or resize expandInput_. + */ + void resetExpandInput(size_t height, size_t width); + + /** + * Add shared bias. + */ + void addSharedBias(); + + /** + * Add unshared bias. + */ + void addUnsharedBias(); + /** + * Expand one input sample. + */ + void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx); + + /** + * Expand one input sample and perform matrix multiplication. + */ + void expandFwdOnce(MatrixPtr image, MatrixPtr out, int inIdx, int startIdx); + + void bpropSharedBias(MatrixPtr biases, MatrixPtr v); + void bpropBiases(MatrixPtr v); + void bpropWeights(MatrixPtr image, MatrixPtr out, int inpIdx); + void bpropActs(MatrixPtr out, MatrixPtr image, int inpIdx); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index df79c3e3037cfce063c1e392bd4c30d1a800b402..0649289c1c671ae5952dd8db9d19f576da67409c 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
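The subM_/subN_/subK_ shapes above drive the per-group GEMM inside expandFwdOnce: the expanded (im2col) input B (subK x subN) is multiplied by the weight slice A (subK x subM, read transposed) to produce one group's output C (subM x subN). A plain-loop sketch of that product, standing in for C->mul(A, B, 1, 1); this is illustrative only, not Paddle's Matrix implementation:

typedef float real;  // Paddle's real is float or double depending on build

// C (subM x subN) += A^T (subM x subK) * B (subK x subN), all row-major.
void groupGemmAtB(const real* A, const real* B, real* C,
                  int subM, int subN, int subK) {
  for (int m = 0; m < subM; ++m) {
    for (int n = 0; n < subN; ++n) {
      real sum = 0;
      for (int k = 0; k < subK; ++k) {
        sum += A[k * subM + m] * B[k * subN + n];  // A accessed transposed
      }
      C[m * subN + n] += sum;  // accumulates, like C->mul(A, B, 1, 1)
    }
  }
}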
*/ - #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" #include "ExpandConvLayer.h" @@ -24,153 +23,29 @@ REGISTER_LAYER(exconv, ExpandConvLayer); bool ExpandConvLayer::init(const LayerMap &layerMap, const ParameterMap &parameterMap) { /* Initialize the basic convolutional parent class */ - ConvBaseLayer::init(layerMap, parameterMap); - - /* Initialize the projection */ - for (auto &inputConfig : config_.inputs()) { - const ConvConfig &conf = inputConfig.conv_conf(); - subM_.push_back(numFilters_ / conf.groups()); - subN_.push_back(conf.output_x() * conf.output_x()); - subK_.push_back(conf.channels() * conf.filter_size() * conf.filter_size() / - conf.groups()); - /* Consistent caffe mode for multiple input */ - caffeMode_ = conf.caffe_mode(); - } - + ExpandConvBaseLayer::init(layerMap, parameterMap); return true; } -size_t ExpandConvLayer::getSize() { - CHECK_NE(inputLayers_.size(), 0UL); - imgSizeH_.clear(); - imgSizeW_.clear(); - outputH_.clear(); - outputW_.clear(); - subN_.clear(); - size_t layerSize = 0; - for (size_t i = 0; i < inputLayers_.size(); i++) { - imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight()); - imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth()); - if (imgSizeH_[i] == 0) imgSizeH_[i] = imgSize_[i]; - if (imgSizeW_[i] == 0) imgSizeW_[i] = imgSize_[i]; - outputH_.push_back( - outputSize(imgSizeH_[i], filterSize_[i], padding_[i], stride_[i])); - outputW_.push_back( - outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i])); - subN_.push_back(outputH_[i] * outputW_[i]); - CHECK(layerSize == 0 || subN_[i] * size_t(numFilters_) == layerSize); - layerSize = subN_[i] * numFilters_; - } - getOutput().setFrameHeight(outputH_[0]); - getOutput().setFrameWidth(outputW_[0]); - return layerSize; -} - -void ExpandConvLayer::resetExpandInput(size_t height, size_t width) { - Matrix::resizeOrCreate(expandInput_, height, width, false, useGpu_); -} - -void ExpandConvLayer::resetConvOutput(size_t batchSize, int inIdx) { - Matrix::resizeOrCreate(transOutValue_, batchSize * numFilters_, subN_[inIdx], - false, useGpu_); -} - -void ExpandConvLayer::expandOneFrame(MatrixPtr image, size_t startIdx, - int inIdx) { - resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]); - real *imgData = image->getData() + startIdx * image->getWidth(); - MatrixPtr imageTmp = Matrix::create( - imgData, 1, imgSizeH_[inIdx] * imgSizeW_[inIdx] * channels_[inIdx], false, - useGpu_); - expandInput_->convExpand(*imageTmp, imgSizeH_[inIdx], imgSizeW_[inIdx], - channels_[inIdx], filterSize_[inIdx], - filterSize_[inIdx], stride_[inIdx], stride_[inIdx], - padding_[inIdx], padding_[inIdx], - outputH_[inIdx], outputW_[inIdx]); - imageTmp->clear(); -} - -void ExpandConvLayer::expandFwdOnce(MatrixPtr image, int inIdx, int startIdx) { - int subM = subM_[inIdx]; - int subN = subN_[inIdx]; - int subK = subK_[inIdx]; - - expandOneFrame(image, startIdx, inIdx); - - real *outData = - getOutputValue()->getData() + startIdx * subN * numFilters_; - - real *wgtData = weights_[inIdx]->getW()->getData(); - real *expInData = expandInput_->getData(); - for (int g = 0; g < groups_[inIdx]; ++g) { - MatrixPtr A = - Matrix::create(wgtData, subK, subM, true, useGpu_); // mark transpose - MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_); - MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_); - C->mul(A, B, 1, 1); - - A->clear(); - B->clear(); - C->clear(); - wgtData += subK * subM; - expInData += subK * subN; - outData += subM * subN; - } -} - -void 
ExpandConvLayer::addSharedBias() { - size_t mapW = getSize() / numFilters_; - size_t mapH = getOutputValue()->getElementCnt() / mapW; - MatrixPtr out = - Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_); - - Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_); - - out->transpose(transOutValue_, false); // false means no memory allocation - transOutValue_->reshape(transOutValue_->getElementCnt() / numFilters_, - numFilters_); - - MatrixPtr bias = - Matrix::create(biases_->getW()->getData(), 1, - biases_->getW()->getElementCnt(), false, useGpu_); - transOutValue_->addBias(*bias, 1.0f); - - transOutValue_->reshape(mapW, mapH); - transOutValue_->transpose(out, false); // false means no memory allocation - - out->clear(); - bias->clear(); -} - -void ExpandConvLayer::addUnsharedBias() { - MatrixPtr outValue = getOutputValue(); - MatrixPtr bias = - Matrix::create(biases_->getW()->getData(), 1, - biases_->getW()->getElementCnt(), false, useGpu_); - outValue->addBias(*bias, 1.0f); -} - void ExpandConvLayer::forward(PassType passType) { Layer::forward(passType); /* malloc memory for the output_ if necessary */ - /* note: one sample correspond to one colum, and the - * transOutValue correspond sample to one row */ - int batchSize = inputLayers_[0]->getOutputValue()->getWidth(); - batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - resetOutput(batchSize, getSize()); + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + resetOutput(batchSize, getOutputSize()); MatrixPtr image = nullptr; - for (size_t i = 0; i != inputLayers_.size(); ++i) { + MatrixPtr outV = getOutputValue(); + for (size_t i = 0; i < inputLayers_.size(); ++i) { LayerPtr prevLayer = getPrev(i); image = prevLayer->getOutputValue(); for (size_t off = 0; off < image->getHeight(); off++) { REGISTER_TIMER_INFO("expandFwdOnce", getName().c_str()); - expandFwdOnce(image, i, off); + expandFwdOnce(image, outV, i, off); } } /* add the bias-vector */ - if (biases_.get() != NULL) { + if (biases_.get()) { if (sharedBiases_) { addSharedBias(); } else { @@ -182,30 +57,6 @@ void ExpandConvLayer::forward(PassType passType) { forwardActivation(); } -void ExpandConvLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) { - size_t mapW = getSize() / numFilters_; - size_t mapH = v->getElementCnt() / mapW; - MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_); - - Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_); - - vTmp->transpose(transOutValue_, false); // false means no memory allocation - vTmp->reshape(transOutValue_->getElementCnt() / numFilters_, numFilters_); - biases->collectBias(*vTmp, 1.0f); -} - -void ExpandConvLayer::bpropBiases(MatrixPtr v) { - MatrixPtr biases = - Matrix::create(biases_->getWGrad()->getData(), 1, - biases_->getWGrad()->getElementCnt(), false, useGpu_); - if (sharedBiases_) { - bpropSharedBias(biases, v); - } else { - biases->collectBias(*v, 1.0f); - } - biases->clear(); -} - void ExpandConvLayer::backward(const UpdateCallback &callback) { backwardActivation(); @@ -216,111 +67,18 @@ void ExpandConvLayer::backward(const UpdateCallback &callback) { biases_->getParameterPtr()->incUpdate(callback); } - for (size_t i = 0; i != inputLayers_.size(); ++i) { + for (size_t i = 0; i < inputLayers_.size(); ++i) { /* First, calculate the input layers error */ - bpropActs(outGrad, i); + if (getPrev(i)->getOutputGrad()) { + bpropActs(outGrad, getPrev(i)->getOutputGrad(), i); + } if (weights_[i]->getWGrad()) { /* Then, calculate the W-gradient 
for the current layer */ - bpropWeights(outGrad, i); + bpropWeights(getPrev(i)->getOutputValue(), outGrad, i); /* Increasing the number of gradient */ weights_[i]->getParameterPtr()->incUpdate(callback); } } } -void ExpandConvLayer::bpropWeights(MatrixPtr v, int inpIdx) { - MatrixPtr weightGrad = weights_[inpIdx]->getWGrad(); - MatrixPtr inputV = getPrev(inpIdx)->getOutputValue(); - - int subM = subM_[inpIdx]; - int subN = subN_[inpIdx]; - int subK = subK_[inpIdx]; - size_t batchSize = inputV->getHeight(); - resetExpandInput(subK * groups_[inpIdx], subN); - resetConvOutput(batchSize, inpIdx); - - real *gradData = v->getData(); - - for (size_t n = 0; n < batchSize; n++) { // frame by frame - // expand - expandOneFrame(inputV, n, inpIdx); - real *wGradData = weightGrad->getData(); - real *expandInData = expandInput_->getData(); - - // expand-mul one-group by one - for (int g = 0; g < groups_[inpIdx]; g++) { - MatrixPtr A = Matrix::create(expandInData, subK, subN, false, useGpu_); - MatrixPtr B = Matrix::create(gradData, subM, subN, true, useGpu_); - MatrixPtr C = Matrix::create(wGradData, subK, subM, false, useGpu_); - C->mul(A, B, 1, 1); - - A->clear(); - B->clear(); - C->clear(); - gradData += subM * subN; - wGradData += subK * subM; - expandInData += subK * subN; - } - } -} - -void ExpandConvLayer::bpropActs(MatrixPtr v, int inpIdx) { - LayerPtr prevLayer = getPrev(inpIdx); - if (NULL == prevLayer->getOutputGrad()) { - return; - } - - int subM = subM_[inpIdx]; - int subN = subN_[inpIdx]; - int subK = subK_[inpIdx]; - size_t batchSize = v->getHeight(); - MatrixPtr tgtGrad = prevLayer->getOutputGrad(); - - /* reset the expand-grad memory */ - resetExpandInput(subK * groups_[inpIdx], subN); - resetConvOutput(batchSize, inpIdx); - - real *localGradData = v->getData(); - real *tgtGradData = tgtGrad->getData(); - for (size_t n = 0; n < batchSize; n++) { - real *wgtData = weights_[inpIdx]->getW()->getData(); - real *expandInData = expandInput_->getData(); - - for (int g = 0; g < groups_[inpIdx]; g++) { - // create temporary matrix - MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_); - MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_); - MatrixPtr A = Matrix::create(wgtData, subK, subM, false, useGpu_); - C->mul(A, B); // mul - - // clear the temporary matrix - A->clear(); - B->clear(); - C->clear(); - - expandInData += subK * subN; - localGradData += subM * subN; - wgtData += subK * subM; - } - - // shrink one frame outGrad - MatrixPtr oneGradTmp = Matrix::create( - expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_); - MatrixPtr vTmp = Matrix::create( - tgtGradData, 1, - imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channels_[inpIdx], false, - useGpu_); - vTmp->convShrink(*oneGradTmp, imgSizeH_[inpIdx], imgSizeW_[inpIdx], - channels_[inpIdx], filterSize_[inpIdx], - filterSize_[inpIdx], stride_[inpIdx], stride_[inpIdx], - padding_[inpIdx], padding_[inpIdx], - outputH_[inpIdx], outputW_[inpIdx], 1.0f, 1.0f); - vTmp->clear(); - oneGradTmp->clear(); - - // move the data-pointer - tgtGradData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channels_[inpIdx]; - } -} - } // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h index fc3d69b1b7d14c64c95ab66dbe7725857ec38261..82a9e88a4208ea98a97bd56ef2f9f38de4f0031e 100644 --- a/paddle/gserver/layers/ExpandConvLayer.h +++ b/paddle/gserver/layers/ExpandConvLayer.h @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#include "ConvBaseLayer.h" #include "paddle/math/Matrix.h" #include <vector> +#include "ExpandConvBaseLayer.h" namespace paddle { @@ -28,73 +27,18 @@ namespace paddle { * * The config file api is img_conv_layer. */ -class ExpandConvLayer : public ConvBaseLayer { -protected: - /// For expand convolution. - /// subM_ = numFilters_ / groups_. - IntV subM_; - /// subN_ = outputH_ * outputW_. - IntV subN_; - /// subK_ = channels_ * filterPixels_ * groups_. - IntV subK_; - /// The spatial dimensions of height of input feature map. - IntV imgSizeH_; - /// The spatial dimensions of width of input feature map. - IntV imgSizeW_; - /// The spatial dimensions of height of output feature map. - IntV outputH_; - /// The spatial dimensions of width of output feature map. - IntV outputW_; - /// Expand one sample at a time. shape: - /// (numChannels * filterPixels_, outputSizeH * outputSizeW) - MatrixPtr expandInput_; - /// The transpose of output, which is an auxiliary matrix. - MatrixPtr transOutValue_; +class ExpandConvLayer : public ExpandConvBaseLayer { public: - explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {} + explicit ExpandConvLayer(const LayerConfig& config) + : ExpandConvBaseLayer(config) {} ~ExpandConvLayer() {} bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - size_t getSize(); - - /** - * Create or resize expandInput_. - */ - void resetExpandInput(size_t height, size_t width); - - /** - * Create or resize transOutValue_. - */ - void resetConvOutput(size_t batchSize, int inIdx); - - /** - * Expand one input sample. - */ - void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx); - - /** - * Expand one input sample and perform matrix multiplication. - */ - void expandFwdOnce(MatrixPtr image, int inIdx, int startIdx); - - /** - * Add shared bias. - */ - void addSharedBias(); - - /** - * Add unshared bias. - */ - void addUnsharedBias(); void forward(PassType passType); - void bpropSharedBias(MatrixPtr biases, MatrixPtr v); - void bpropBiases(MatrixPtr v); void backward(const UpdateCallback& callback); - void bpropWeights(MatrixPtr v, int inpIdx); - void bpropActs(MatrixPtr v, int inpIdx); }; } // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvTransLayer.cpp b/paddle/gserver/layers/ExpandConvTransLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1132ab4f92000c96b22a295b360143d2f356ec5a --- /dev/null +++ b/paddle/gserver/layers/ExpandConvTransLayer.cpp @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" +#include "ExpandConvTransLayer.h" + +/* The implementation of the convTransLayer is basically a swap of forward and + * backward of the original convLayer. + * The variable naming follows the convention of the convLayer.
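As that comment says, the transposed layer swaps the two data paths: its forward pass reuses the convolution's backward-data step (convShrink, a col2im scatter), and its backward pass reuses the forward expansion. A standalone sketch of the scatter step, under the same illustrative naming as the im2col sketch earlier (not Paddle's API):

#include <vector>

// Inverse of the expansion: scatter a (channels*k*k) x (outH*outW) column
// matrix back into a (channels x H x W) image, summing overlapping windows.
void col2imAccumulate(const std::vector<float>& col, std::vector<float>& img,
                      int channels, int H, int W, int k, int stride, int pad,
                      int outH, int outW) {
  int idx = 0;  // walks the column matrix in the same order im2col wrote it
  for (int c = 0; c < channels; ++c)
    for (int ky = 0; ky < k; ++ky)
      for (int kx = 0; kx < k; ++kx)
        for (int oy = 0; oy < outH; ++oy)
          for (int ox = 0; ox < outW; ++ox, ++idx) {
            int iy = oy * stride + ky - pad;
            int ix = ox * stride + kx - pad;
            if (iy >= 0 && iy < H && ix >= 0 && ix < W)
              img[(c * H + iy) * W + ix] += col[idx];  // overlaps accumulate
          }
}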
+ * */ + +namespace paddle { + +REGISTER_LAYER(exconvt, ExpandConvTransLayer); + +bool ExpandConvTransLayer::init(const LayerMap &layerMap, + const ParameterMap &parameterMap) { + /* Initialize the basic convolutional parent class */ + ExpandConvBaseLayer::init(layerMap, parameterMap); + + return true; +} + +void ExpandConvTransLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + resetOutput(batchSize, getOutputSize()); + + MatrixPtr output = nullptr; + for (size_t i = 0; i < inputLayers_.size(); ++i) { + LayerPtr prevLayer = getPrev(i); + output = prevLayer->getOutputValue(); + REGISTER_TIMER_INFO("shrinkFwd", getName().c_str()); + bpropActs(output, getOutputValue(), i); + } + + /* add the bias-vector */ + if (biases_.get()) { + if (sharedBiases_) { + addSharedBias(); + } else { + addUnsharedBias(); + } + } + + /* activation */ + forwardActivation(); +} + +void ExpandConvTransLayer::backward(const UpdateCallback &callback) { + backwardActivation(); + + MatrixPtr imageGrad = getOutputGrad(); + if (biases_ && biases_->getWGrad()) { + bpropBiases(imageGrad); + /* Increasing the number of gradient */ + biases_->getParameterPtr()->incUpdate(callback); + } + + for (size_t i = 0; i < inputLayers_.size(); ++i) { + /* First, calculate the input layers error */ + for (size_t off = 0; off < imageGrad->getHeight(); off++) { + if (getPrev(i)->getOutputGrad()) { + expandFwdOnce(imageGrad, getPrev(i)->getOutputGrad(), i, off); + } + } + if (weights_[i]->getWGrad()) { + /* Then, calculate the W-gradient for the current layer */ + bpropWeights(imageGrad, getPrev(i)->getOutputValue(), i); + /* Increasing the number of gradient */ + weights_[i]->getParameterPtr()->incUpdate(callback); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/ExpandConvTransLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..47efe3f65643fd17b86832fc240cda2e30d3fcc4 --- /dev/null +++ b/paddle/gserver/layers/ExpandConvTransLayer.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/math/Matrix.h" +#include <vector> +#include "ExpandConvBaseLayer.h" + +namespace paddle { + +/** + * @brief A subclass of convolution layer. + * This layer expands the input and uses matrix multiplication to + * calculate the convolution transpose (deconv) operation. + * + * The config file api is img_conv_layer with flag trans=True.
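One way to see why exconv and exconvt can share this machinery is the size bookkeeping: a transposed layer's output size is exactly the input size a normal convolution would have needed to produce its input. A minimal sketch of the two relations (hypothetical helper names):

int convOutputSize(int imgSize, int filterSize, int padding, int stride) {
  // size after a normal convolution (caffe-mode rounding)
  return (imgSize - filterSize + 2 * padding) / stride + 1;
}

int convTransOutputSize(int outSize, int filterSize, int padding, int stride) {
  // inverse relation used by a transposed convolution
  return (outSize - 1) * stride + filterSize - 2 * padding;
}

For example, with filterSize 4, stride 2 and padding 1, a 16x16 image convolves down to 8x8, and the transposed layer maps 8x8 back up to 16x16.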
+ */ +class ExpandConvTransLayer : public ExpandConvBaseLayer { +public: + explicit ExpandConvTransLayer(const LayerConfig& config) + : ExpandConvBaseLayer(config) {} + + ~ExpandConvTransLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void backward(const UpdateCallback& callback); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp index d18b51dd7973737768b4fde37b67987abea9e2c6..97c8d143fe0d84c4e59e224962b53995ee50b844 100644 --- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp +++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Layer.h" #include "paddle/math/Matrix.h" #include "paddle/utils/Stat.h" @@ -79,9 +78,12 @@ void FeatureMapExpandLayer::forward(PassType passType) { for (size_t i = 0; i < batchSize; i++) { MatrixPtr outVTmp = Matrix::create(outputV->getData() + i * imgSize * numFilters_, - numFilters_, imgSize, false, useGpu_); - MatrixPtr inVTmp = Matrix::create(inputV->getData() + i * imgSize, 1, - imgSize, false, useGpu_); + numFilters_, + imgSize, + false, + useGpu_); + MatrixPtr inVTmp = Matrix::create( + inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_); outVTmp->addRowVector(*inVTmp); } } @@ -101,9 +103,12 @@ void FeatureMapExpandLayer::backward(const UpdateCallback& callback) { for (size_t i = 0; i < batchSize; i++) { MatrixPtr outGradTmp = Matrix::create(outGrad->getData() + i * imgSize * numFilters_, - numFilters_, imgSize, false, useGpu_); - MatrixPtr inGradTmp = Matrix::create(inGrad->getData() + i * imgSize, 1, - imgSize, false, useGpu_); + numFilters_, + imgSize, + false, + useGpu_); + MatrixPtr inGradTmp = Matrix::create( + inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_); inGradTmp->collectBias(*outGradTmp, 1); } } diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/gserver/layers/FullMatrixProjection.cpp index 8241cbd37ec623622f19ff2ba35c21a4e3e3533a..35a5cb5b7a450e7233b6dddbef58a2acccfb1608 100644 --- a/paddle/gserver/layers/FullMatrixProjection.cpp +++ b/paddle/gserver/layers/FullMatrixProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "FullMatrixProjection.h" namespace paddle { @@ -52,7 +51,9 @@ void FullMatrixProjection::backward(const UpdateCallback& callback) { } hl_set_sync_flag(syncFlag); - parameter_->incUpdate(callback); + if (weight_->getWGrad()) { + parameter_->incUpdate(callback); + } } } // namespace paddle diff --git a/paddle/gserver/layers/FullMatrixProjection.h b/paddle/gserver/layers/FullMatrixProjection.h index e99444b33b82e4694ee6df4df5f5447bdc3baaa0..ddb1e7b18c4f967383feb922ce89d13a452109b2 100644 --- a/paddle/gserver/layers/FullMatrixProjection.h +++ b/paddle/gserver/layers/FullMatrixProjection.h @@ -30,7 +30,8 @@ namespace paddle { class FullMatrixProjection : public Projection { public: FullMatrixProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu); + const ParameterPtr& parameter, + bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); diff --git a/paddle/gserver/layers/FullyConnectedLayer.cpp b/paddle/gserver/layers/FullyConnectedLayer.cpp index c754f8fd9480de73067b295ffacbbaab1866568a..70c56499a7738c12db40bfd0ca5fec399d72f99b 100644 --- a/paddle/gserver/layers/FullyConnectedLayer.cpp +++ b/paddle/gserver/layers/FullyConnectedLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "FullyConnectedLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h index 24b6c547e7bc8a60d9374a55074416ea1b9bbc72..e15e1236cdb75d1c41bbb993f86545334785909a 100644 --- a/paddle/gserver/layers/FullyConnectedLayer.h +++ b/paddle/gserver/layers/FullyConnectedLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -20,9 +19,9 @@ limitations under the License. */ #include "paddle/utils/ThreadLocal.h" namespace paddle { -/** +/** * A layer has full connections to all neurons in the previous layer. - * It computes an inner product with a set of learned weights, and + * It computes an inner product with a set of learned weights, and * (optionally) adds biases. * * The config file api is fc_layer. @@ -34,8 +33,7 @@ protected: std::unique_ptr<Weight> biases_; public: - explicit FullyConnectedLayer(const LayerConfig& config) - : Layer(config) {} + explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {} ~FullyConnectedLayer() {} bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); @@ -48,4 +46,3 @@ public: }; } // namespace paddle - diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp index e0c6ff7ea28418d7bfb2db0b20281165f328976d..495c2174f3e9afbee676622d53248c7f5aeea404 100644 --- a/paddle/gserver/layers/GatedRecurrentLayer.cpp +++ b/paddle/gserver/layers/GatedRecurrentLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
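The fc_layer doc comment above ("an inner product with a set of learned weights", plus an optional bias) reduces to a few lines of plain C++; a sketch with hypothetical names, not Paddle's Matrix API:

#include <vector>

// out[b][o] = sum_i in[b][i] * W[i][o] (+ bias[o]); all matrices row-major.
std::vector<float> fcForward(const std::vector<float>& in,    // batch x inDim
                             const std::vector<float>& W,     // inDim x outDim
                             const std::vector<float>& bias,  // outDim, or empty
                             int batch, int inDim, int outDim) {
  std::vector<float> out(batch * outDim);
  for (int b = 0; b < batch; ++b)
    for (int o = 0; o < outDim; ++o) {
      float acc = bias.empty() ? 0.f : bias[o];
      for (int i = 0; i < inDim; ++i)
        acc += in[b * inDim + i] * W[i * outDim + o];
      out[b * outDim + o] = acc;
    }
  return out;
}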
*/ - #include "Layer.h" #include "GatedRecurrentLayer.h" #include "paddle/utils/Stat.h" @@ -30,8 +29,8 @@ bool GatedRecurrentLayer::init(const LayerMap& layerMap, CHECK_EQ(getSize() * 3, biasParameter_->getSize()); weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0])); gateWeight_.reset(new Weight(getSize(), getSize() * 2, parameters_[0], 0)); - stateWeight_.reset(new Weight(getSize(), getSize(), parameters_[0], - 2 * getSize() * getSize())); + stateWeight_.reset(new Weight( + getSize(), getSize(), parameters_[0], 2 * getSize() * getSize())); if (biasParameter_.get() != NULL) { bias_.reset(new Weight(1, getSize() * 3, biasParameter_)); } @@ -48,8 +47,8 @@ bool GatedRecurrentLayer::init(const LayerMap& layerMap, void GatedRecurrentLayer::resetState() { CHECK(!reversed_) << "state is not allowed for reversed gated " "recurrent layer"; - Matrix::resizeOrCreate(prevOutput_, 1, getSize(), /* trans= */ false, - useGpu_); + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); prevOutput_->zeroMem(); // TODO(hedaoyuan): support prev_batch_state @@ -85,10 +84,16 @@ void GatedRecurrentLayer::forward(PassType passType) { // batchSize = length of total frames in a batch (NOT size of mini-batch) CHECK_EQ(starts[numSequences], batchSize); - Matrix::resizeOrCreate(gate_.value, /* height= */batchSize, - getSize() * 3, /* trans= */false, useGpu_); - Matrix::resizeOrCreate(resetOutput_.value, /* height= */batchSize, - getSize(), /* trans= */false, useGpu_); + Matrix::resizeOrCreate(gate_.value, + /* height= */ batchSize, + getSize() * 3, + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(resetOutput_.value, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); if (useBatch_) { forwardBatch(batchSize, numSequences, starts, input.value); @@ -105,10 +110,16 @@ void GatedRecurrentLayer::backward(const UpdateCallback& callback) { const int* starts = input.sequenceStartPositions->getData(false); size_t numSequences = input.getNumSequences(); - Matrix::resizeOrCreate(gate_.grad, /* height= */batchSize, - getSize() * 3, /* trans= */false, useGpu_); - Matrix::resizeOrCreate(resetOutput_.grad, /* height= */batchSize, - getSize(), /* trans= */false, useGpu_); + Matrix::resizeOrCreate(gate_.grad, + /* height= */ batchSize, + getSize() * 3, + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(resetOutput_.grad, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); if (useBatch_) { backwardBatch(batchSize, input.grad); @@ -125,7 +136,7 @@ void GatedRecurrentLayer::backward(const UpdateCallback& callback) { void GatedRecurrentLayer::forwardSequence(int batchSize, size_t numSequences, - const int *starts, + const int* starts, MatrixPtr inputValue) { REGISTER_TIMER_INFO("GruFwSequenceTime", getName().c_str()); gate_.value->assign(*inputValue); @@ -198,7 +209,7 @@ void GatedRecurrentLayer::forwardSequence(int batchSize, void GatedRecurrentLayer::backwardSequence(int batchSize, size_t numSequences, - const int *starts, + const int* starts, MatrixPtr inputGrad) { REGISTER_TIMER_INFO("GruBwSequenceTime", getName().c_str()); @@ -211,9 +222,10 @@ void GatedRecurrentLayer::backwardSequence(int batchSize, hl_gru_grad gruGrad; gruGrad.gateWeightGrad = - (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); + (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); gruGrad.stateWeightGrad = - (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() : nullptr); + (stateWeight_->getWGrad() ? 
stateWeight_->getWGrad()->getData() + : nullptr); gruGrad.gateGrad = gate_.grad->getData(); gruGrad.resetOutputGrad = resetOutput_.grad->getData(); gruGrad.outputGrad = output_.grad->getData(); @@ -298,11 +310,10 @@ void GatedRecurrentLayer::forwardBatch(int batchSize, if (!batchValue_) { batchValue_.reset(new SequenceToBatch(useGpu_)); } - batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, - reversed_); + batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); batchValue_->resizeOrCreate(*output_.value); - batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */true); + batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); if (bias_ && bias_->getWGrad()) { gate_.value->addBias(*(bias_->getW()), 1); } @@ -315,14 +326,14 @@ void GatedRecurrentLayer::forwardBatch(int batchSize, MatrixPtr outputValueTmp = batchValue_->getBatchValue(n); gruValue.outputValue = outputValueTmp->getData(); gruValue.gateValue = - (batchValue_->getBatchValue(*gate_.value, n))->getData(); + (batchValue_->getBatchValue(*gate_.value, n))->getData(); gruValue.resetOutputValue = - (batchValue_->getBatchValue(*resetOutput_.value, n))->getData(); + (batchValue_->getBatchValue(*resetOutput_.value, n))->getData(); batchSize = outputValueTmp->getHeight(); gruValue.prevOutValue = - (n == 0 ? nullptr - : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); + (n == 0 ? nullptr + : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); { if (useGpu_) { @@ -333,13 +344,10 @@ void GatedRecurrentLayer::forwardBatch(int batchSize, } } } - { - batchValue_->copyBackSeq(*output_.value); - } + { batchValue_->copyBackSeq(*output_.value); } } -void GatedRecurrentLayer::backwardBatch(int batchSize, - MatrixPtr inputGrad) { +void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) { REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str()); hl_gru_value gruValue; gruValue.gateWeight = (gateWeight_->getW())->getData(); @@ -347,18 +355,17 @@ void GatedRecurrentLayer::backwardBatch(int batchSize, hl_gru_grad gruGrad; gruGrad.gateWeightGrad = - (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); + (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); gruGrad.stateWeightGrad = - (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() : nullptr); + (stateWeight_->getWGrad() ? 
stateWeight_->getWGrad()->getData() + : nullptr); if (!batchGrad_) { batchGrad_.reset(new SequenceToBatch(useGpu_)); } batchGrad_->shareIndexWith(*batchValue_); - { - batchGrad_->copyFromSeq(*output_.grad); - } + { batchGrad_->copyFromSeq(*output_.grad); } { int numBatch = batchGrad_->getNumBatch(); @@ -366,39 +373,36 @@ void GatedRecurrentLayer::backwardBatch(int batchSize, AsyncGpuBlock asyncGpuBlock; for (int n = (int)numBatch - 1; n >= 0; n--) { gruValue.gateValue = - (batchGrad_->getBatchValue(*gate_.value, n))->getData(); + (batchGrad_->getBatchValue(*gate_.value, n))->getData(); gruValue.resetOutputValue = - (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData(); + (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData(); - MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n); + MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n); gruGrad.outputGrad = outputGradTmp->getData(); - gruGrad.gateGrad = - (batchGrad_->getBatchValue(*gate_.grad , n))->getData(); + gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad, n))->getData(); gruGrad.resetOutputGrad = - (batchGrad_->getBatchValue(*resetOutput_.grad , n))->getData(); + (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData(); { batchSize = outputGradTmp->getHeight(); gruValue.prevOutValue = - (n == 0 ? nullptr - : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); + (n == 0 ? nullptr : (batchValue_->getBatchValue(n - 1, batchSize)) + ->getData()); gruGrad.prevOutGrad = - (n == 0 ? nullptr - : (batchGrad_->getBatchValue(n - 1, batchSize))->getData()); + (n == 0 ? nullptr + : (batchGrad_->getBatchValue(n - 1, batchSize))->getData()); if (useGpu_) { - GruCompute::backward<1>(gruValue, gruGrad, getSize(), - batchSize); + GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize); } else { - GruCompute::backward<0>(gruValue, gruGrad, getSize(), - batchSize); + GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize); } } } } if (inputGrad) { - batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */false); + batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false); } if (bias_ && bias_->getWGrad()) { bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1); diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h index 19f71206bc00a15892815cc1e0c039659b841df6..3b8706a44e21e5a780c6423b65369dc5b695b59b 100644 --- a/paddle/gserver/layers/GatedRecurrentLayer.h +++ b/paddle/gserver/layers/GatedRecurrentLayer.h @@ -63,13 +63,19 @@ public: LayerStatePtr getState(); protected: - void forwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); - void backwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad); - - void forwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); + void forwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue); + void backwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputGrad); + + void forwardBatch(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue); void backwardBatch(int batchSize, MatrixPtr inputGrad); protected: diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/gserver/layers/GetOutputLayer.cpp index f036cd2b5284222bbcbcdfda7b7a0142eba750a7..01579d55fd9d0918b62ae0ddd9a7e90b4a697a13 100644 --- a/paddle/gserver/layers/GetOutputLayer.cpp +++ b/paddle/gserver/layers/GetOutputLayer.cpp 
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Layer.h" namespace paddle { diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp index c942122633c3d9e6dd89ce57c35d50db819ba3a1..d9d423af448fd267b777ef57964dced3b7a09f63 100644 --- a/paddle/gserver/layers/GruCompute.cpp +++ b/paddle/gserver/layers/GruCompute.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "GruCompute.h" #include "hl_recurrent_apply.cuh" @@ -20,14 +19,12 @@ limitations under the License. */ namespace paddle { void GruCompute::init(LayerConfig &config) { - activeNode_ = hlActiveType(config.active_type()); - activeGate_ = hlActiveType(config.active_gate_type()); + activeNode_ = hlActiveType(config.active_type()); + activeGate_ = hlActiveType(config.active_gate_type()); } template <> -void GruCompute::forward<0>(hl_gru_value value, - int frameSize, - int batchSize) { +void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) { hl_cpu_gru_forward(hppl::forward::gru_resetOutput(), hppl::forward::gru_finalOutput(), value, @@ -39,17 +36,17 @@ void GruCompute::forward<0>(hl_gru_value value, template <> void GruCompute::backward<0>(hl_gru_value value, - hl_gru_grad grad, - int frameSize, - int batchSize) { -hl_cpu_gru_backward(hppl::backward::gru_stateGrad(), - hppl::backward::gru_resetGrad(), - value, - grad, - frameSize, - batchSize, - activeNode_, - activeGate_); + hl_gru_grad grad, + int frameSize, + int batchSize) { + hl_cpu_gru_backward(hppl::backward::gru_stateGrad(), + hppl::backward::gru_resetGrad(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_); } } // namespace paddle diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h index 3a1b69b940d089d8f346756d312e0eb21d445e05..58b5aacba0403f8d10e34b055f5a69ad5ffa4837 100644 --- a/paddle/gserver/layers/GruCompute.h +++ b/paddle/gserver/layers/GruCompute.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/TypeDefs.h" @@ -29,7 +28,9 @@ public: void forward(hl_gru_value value, int frameSize, int batchSize = 1); template <bool useGpu> - void backward(hl_gru_value value, hl_gru_grad grad, int frameSize, + void backward(hl_gru_value value, + hl_gru_grad grad, + int frameSize, int batchSize = 1); public: diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp index 501229d10ab87af0baa8b5d3f94a218f2d064d61..6c9b0c5771bec765d043cd654fbb30ba56f8c813 100644 --- a/paddle/gserver/layers/GruStepLayer.cpp +++ b/paddle/gserver/layers/GruStepLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "Layer.h" #include "GruCompute.h" #include "paddle/utils/Stat.h" @@ -32,7 +31,8 @@ namespace paddle { * \f[ * update \ gate: z_t = actGate(xz_t + U_z * prev_out + bias_z) \\ * reset \ gate: r_t = actGate(xr_t + U_r * prev_out + bias_r) \\ - * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o) \\ + * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o) + * \\ * output: h_t = dot((1-z_t), prev_out) + dot(z_t, {h}_t) * \f] * @@ -91,10 +91,16 @@ void GruStepLayer::forward(PassType passType) { int batchSize = input.getBatchSize(); resetOutput(batchSize, getSize()); - resetSpecifyOutput(gate_, batchSize, getSize() * 3, - /* isValueClean */ false, /* isGradClean */ false); - resetSpecifyOutput(resetOutput_, batchSize, getSize(), - /* isValueClean */ false, /* isGradClean */ false); + resetSpecifyOutput(gate_, + batchSize, + getSize() * 3, + /* isValueClean */ false, + /* isGradClean */ false); + resetSpecifyOutput(resetOutput_, + batchSize, + getSize(), + /* isValueClean */ false, + /* isGradClean */ false); gate_.value->assign(*input.value); if (bias_) { gate_.value->addBias(*(bias_->getW()), 1); @@ -103,7 +109,7 @@ void GruStepLayer::forward(PassType passType) { hl_gru_value gruValue; gruValue.gateWeight = weight_->getW()->getData(); gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2; - gruValue.gateValue = gate_.value->getData();; + gruValue.gateValue = gate_.value->getData(); gruValue.resetOutputValue = resetOutput_.value->getData(); gruValue.outputValue = output_.value->getData(); gruValue.prevOutValue = prevOutput.value->getData(); @@ -125,17 +131,18 @@ void GruStepLayer::backward(const UpdateCallback& callback) { hl_gru_value gruValue; gruValue.gateWeight = weight_->getW()->getData(); gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2; - gruValue.gateValue = gate_.value->getData();; + gruValue.gateValue = gate_.value->getData(); gruValue.resetOutputValue = resetOutput_.value->getData(); gruValue.outputValue = output_.value->getData(); gruValue.prevOutValue = prevOutput.value->getData(); - hl_gru_grad gruGrad; + hl_gru_grad gruGrad; gruGrad.gateWeightGrad = - (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr); + (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr); gruGrad.stateWeightGrad = - (weight_->getWGrad() ? - weight_->getWGrad()->getData() + getSize() * getSize() * 2 : nullptr); + (weight_->getWGrad() + ? weight_->getWGrad()->getData() + getSize() * getSize() * 2 + : nullptr); gruGrad.gateGrad = gate_.grad->getData(); gruGrad.resetOutputGrad = resetOutput_.grad->getData(); diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp index 7091c6aa222e52e09603d84f52f88de11b9a7d73..61bc77778501fb9421cd2a72459d35ac9f47a5cb 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
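The gate equations in the GruStepLayer comment above pin down one recurrent step exactly. Written out in plain C++ with actGate = sigmoid and actNode = tanh (a sketch; the names are hypothetical, and the input projections xz/xr/xi are assumed to already include the biases):

#include <cmath>
#include <cstddef>
#include <vector>

// r = M * v for a square (n x n) row-major matrix; helper for the sketch.
static std::vector<float> matVec(const std::vector<float>& M,
                                 const std::vector<float>& v) {
  std::size_t n = v.size();
  std::vector<float> r(n, 0.f);
  for (std::size_t i = 0; i < n; ++i)
    for (std::size_t j = 0; j < n; ++j) r[i] += M[i * n + j] * v[j];
  return r;
}

// One GRU step; Uz/Ur/U are the recurrent weights, prev is prev_out.
std::vector<float> gruStep(const std::vector<float>& xz,
                           const std::vector<float>& xr,
                           const std::vector<float>& xi,
                           const std::vector<float>& Uz,
                           const std::vector<float>& Ur,
                           const std::vector<float>& U,
                           const std::vector<float>& prev) {
  auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
  std::size_t n = prev.size();
  std::vector<float> uzP = matVec(Uz, prev), urP = matVec(Ur, prev);
  std::vector<float> z(n), rPrev(n), out(n);
  for (std::size_t i = 0; i < n; ++i) {
    z[i] = sigmoid(xz[i] + uzP[i]);                  // update gate z_t
    float r = sigmoid(xr[i] + urP[i]);               // reset gate r_t
    rPrev[i] = r * prev[i];                          // dot(r_t, prev_out)
  }
  std::vector<float> uRP = matVec(U, rPrev);
  for (std::size_t i = 0; i < n; ++i) {
    float cand = std::tanh(xi[i] + uRP[i]);          // candidate {h}_t
    out[i] = (1.f - z[i]) * prev[i] + z[i] * cand;   // output h_t
  }
  return out;
}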
*/ - #include "HierarchicalSigmoidLayer.h" #include "paddle/utils/Util.h" @@ -61,10 +60,16 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { int batchSize = getInputValue(0)->getHeight(); int size = getSize(); reserveOutput(batchSize, size); - Matrix::resizeOrCreate(preOutput_.value, batchSize, codeLength_, - /* trans */ false, useGpu(deviceId_)); - Matrix::resizeOrCreate(preOutput_.grad, batchSize, codeLength_, - /* trans */ false, useGpu(deviceId_)); + Matrix::resizeOrCreate(preOutput_.value, + batchSize, + codeLength_, + /* trans */ false, + useGpu(deviceId_)); + Matrix::resizeOrCreate(preOutput_.grad, + batchSize, + codeLength_, + /* trans */ false, + useGpu(deviceId_)); IVectorPtr label = getInput(*getLabelLayer()).ids; @@ -76,16 +81,18 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { } for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { MatrixPtr input = getInputValue(i); - preOutput_.value->mulByBitCode(numClasses_, *label, *weights_[i]->getW(), - *input); + preOutput_.value->mulByBitCode( + numClasses_, *label, *weights_[i]->getW(), *input); } // keep consistent with the clipping in the following softrelu preOutput_.value->clip(-40.0, 40.0); - preOutput_.value->sumByBitCode(numClasses_, *label, *output_.value, + preOutput_.value->sumByBitCode(numClasses_, + *label, + *output_.value, -1); // scaleSum preOutput_.value->softrelu(*preOutput_.value); - MatrixPtr sum = Matrix::create(batchSize, - 1, /* trans= */ false, useGpu(deviceId_)); + MatrixPtr sum = + Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_)); preOutput_.value->rowSum(*sum); output_.value->add(*sum); } @@ -97,8 +104,8 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { preOutput_.grad->subByBitCode(numClasses_, *label); if (biases_ && biases_->getWGrad()) { - preOutput_.grad->addByBitCodeBackward(numClasses_, *label, - *biases_->getWGrad()); + preOutput_.grad->addByBitCodeBackward( + numClasses_, *label, *biases_->getWGrad()); /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h index 1942c5fe1e4f4da1d3d9197a3ffd80e3e55ec2ac..10762bc92687a3ea8debb7b9aa26a0cf0f94421c 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -20,15 +19,15 @@ limitations under the License. */ namespace paddle { /** - * Organize the classes into a binary tree. At each node, a sigmoid function + * Organize the classes into a binary tree. At each node, a sigmoid function * is used to calculate the probability of belonging to the right branch. - * This idea is from "F. Morin, Y. Bengio (AISTATS 05): + * This idea is from "F. Morin, Y. Bengio (AISTATS 05): * Hierarchical Probabilistic Neural Network Language Model." * * Here we use a simple way of making the binary tree. * Assuming the number of classes C = 6, * the classes are organized as a binary tree in the following way: - * + * * @code{.py} * *-*-*- 2 * | | |- 3 @@ -44,15 +43,15 @@ namespace paddle { * - Node 0 ... C-2 are internal nodes. * - Node C-1 ... 2C-2 are leaf nodes. * - Class c is represented by leaf node \f$c+C-1\f$.
- * + * We assign an id for each node: * - the id of the root is 0. * - the left child of a node i is 2*i+1. * - the right child of a node i is 2*i+2. * * It's easy to see that: - * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$. - * - the j-th level ancestor of node i is + * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$. + * - the j-th level ancestor of node i is * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$. * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$. * @@ -69,7 +68,7 @@ public: protected: /** * The last of inputs is label layer. - */ + */ LayerPtr getLabelLayer() { return inputLayers_.back(); } WeightList weights_; diff --git a/paddle/gserver/layers/IdentityProjection.cpp b/paddle/gserver/layers/IdentityProjection.cpp index 6b7d20cc507e453e49708c2418f6d67abf3326f8..b38656c960f17b2c2c315eba70c61c328ed3e49a 100644 --- a/paddle/gserver/layers/IdentityProjection.cpp +++ b/paddle/gserver/layers/IdentityProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "Projection.h" @@ -29,7 +28,8 @@ namespace paddle { class IdentityProjection : public Projection { public: IdentityProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu); + const ParameterPtr& parameter, + bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); }; @@ -70,7 +70,8 @@ void IdentityProjection::backward(const UpdateCallback& callback) { class IdentityOffsetProjection : public Projection { public: IdentityOffsetProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu); + const ParameterPtr& parameter, + bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); }; diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp index 4102df840a48412a9c4ceb476488febf43a8e80c..b00bee235693d56aecfdc676647e102fe8d0ebfc 100644 --- a/paddle/gserver/layers/InterpolationLayer.cpp +++ b/paddle/gserver/layers/InterpolationLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -26,8 +25,8 @@ namespace paddle { * \f[ * y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i] * \f] - * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs, - * \f$w\f$ is (batchSize x 1) weight vector, + * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs, + * \f$w\f$ is a (batchSize x 1) weight vector, * and \f$y\f$ is the (batchSize x dataDim) output. * * The config file api is interpolation_layer. diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index 78d15c553021de6bbda210cb782c8a240cc2bf73..0f9e7c0ff89531edeb5e7c5b2bc03f28b0a08b94 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
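Those id rules make the hierarchical-sigmoid tree purely implicit: nothing is stored per node, and both training and prediction walk leaf-to-root in O(log C) sigmoid steps instead of one softmax over C classes. A standalone sketch of that walk (illustrative, not Paddle's bit-code API):

#include <cstdio>

// Visit the ancestors of class c in the implicit tree: the leaf of class c
// is node c+C-1, and the parent of node i is (i-1)/2.
void printCodePath(int c, int numClasses) {
  for (int node = c + numClasses - 1; node > 0; node = (node - 1) / 2) {
    bool leftChild = ((node - 1) % 2 == 0);  // (i-1)%2==0 means left child
    std::printf("node %d is a %s child\n", node, leftChild ? "left" : "right");
  }
}

With C = 6 and class c = 2 (leaf node 7), the walk visits nodes 7, 3 and 1 before reaching the root.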
*/ - #include "paddle/utils/Util.h" #include "paddle/utils/Logging.h" @@ -123,19 +122,22 @@ LayerPtr Layer::create(const LayerConfig& config) { return LayerPtr(registrar_.createByType(config.type(), config)); } -void Layer::resetSpecifyOutput(Argument& output, size_t height, size_t width, - bool isValueClean, bool isGradClean) { +void Layer::resetSpecifyOutput(Argument& output, + size_t height, + size_t width, + bool isValueClean, + bool isGradClean) { SetDevice device(output.deviceId); - Matrix::resizeOrCreate(output.value, height, width, /* trans */ false, - useGpu(output.deviceId)); + Matrix::resizeOrCreate( + output.value, height, width, /* trans */ false, useGpu(output.deviceId)); if (isValueClean) { output.value->zeroMem(); } if (passType_ != PASS_TEST && needGradient()) { - Matrix::resizeOrCreate(output.grad, height, width, /* trans */ false, - useGpu(output.deviceId)); + Matrix::resizeOrCreate( + output.grad, height, width, /* trans */ false, useGpu(output.deviceId)); if (isGradClean) { output.grad->zeroMem(); } @@ -227,8 +229,10 @@ void Layer::waitAndMergeOutputGrad() { if (outputOtherDevice_.size() == 1) return; } - Matrix::resizeOrCreate(tmpGrad_, output_.grad->getHeight(), - output_.grad->getWidth(), /* trans */ false, + Matrix::resizeOrCreate(tmpGrad_, + output_.grad->getHeight(), + output_.grad->getWidth(), + /* trans */ false, useGpu(output_.deviceId)); for (; i != outputOtherDevice_.size(); i++) { @@ -258,8 +262,8 @@ void Layer::zeroGrad() { } void Layer::initNeedFlags() { - auto initFlag = [this](bool& flag, bool (Layer::*flagQueryFunc)() const, - ParameterType type) { + auto initFlag = [this]( + bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) { flag = false; if (biasParameter_ && biasParameter_->hasType(type)) { flag = true; @@ -293,10 +297,12 @@ void Layer::showOutputStats() { } MatrixPtr outSquare; if (dynamic_cast<GpuSparseMatrix*>(out.get())) { - GpuSparseMatrix *tmp = dynamic_cast<GpuSparseMatrix*>(out.get()); - outSquare = std::make_shared<GpuSparseMatrix>( - tmp->getHeight(), tmp->getWidth(), tmp->getElementCnt(), - tmp->getValueType(), tmp->getFormat()); + GpuSparseMatrix* tmp = dynamic_cast<GpuSparseMatrix*>(out.get()); + outSquare = std::make_shared<GpuSparseMatrix>(tmp->getHeight(), + tmp->getWidth(), + tmp->getElementCnt(), + tmp->getValueType(), + tmp->getFormat()); } else { outSquare = out->clone(); } @@ -321,8 +327,7 @@ void Layer::showOutputStats() { std = std > 0 ? std : 0; LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean << ", " - << "std=" << std - << ", " + << "std=" << std << ", " << "min=" << min << ", " << "max=" << max; } @@ -348,8 +353,8 @@ void Layer::backwardActivation() { if (config_.error_clipping_threshold() > 0.0f) { if (FLAGS_log_error_clipping) { CpuVector outGradVec(0, nullptr); - outGradVec.subVecFrom(output_.grad->getData(), 0, - output_.grad->getElementCnt()); + outGradVec.subVecFrom( + output_.grad->getData(), 0, output_.grad->getElementCnt()); real maxAbsGrad = outGradVec.getAbsMax(); if (maxAbsGrad > config_.error_clipping_threshold()) { real avgAbsGrad = outGradVec.getAbsSum() / outGradVec.getSize(); @@ -376,16 +381,19 @@ void Layer::forwardDropOut() { if (passType_ == PASS_TRAIN || passType_ == PASS_METRIC_TRAIN || passType_ == PASS_METRIC_TRAIN_WITH_NOERROR) { // new dropOutMask_ if dropOutMask_ is null ptr - Matrix::resizeOrCreate(dropOutMask_, outV->getHeight(), outV->getWidth(), - false, useGpu(deviceId_)); + Matrix::resizeOrCreate(dropOutMask_, + outV->getHeight(), + outV->getWidth(), + false, + useGpu(deviceId_)); dropOutMask_->randomizeUniform(); // generate a uniform random matrix dropOutMask_->biggerThanScalar(config_.drop_rate()); // random mask outV->dotMul(*outV, *dropOutMask_); // dropout } else if (passType_ == PASS_GC) { // only initialize once if (!dropOutMask_) { - dropOutMask_ = Matrix::create(outV->getHeight(), outV->getWidth(), false, - useGpu(deviceId_)); + dropOutMask_ = Matrix::create( + outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_)); // We use cpu matrix to generate mask so that the mask // will be same for both gpu version and cpu version. // This will help unittest to make sure they have same result. diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h index ae7cdb0028120748a3377d8f522c4af03d9cb82d..3d427a1ac6e38f2bcd49195504d1086b83e3cdf3 100644 --- a/paddle/gserver/layers/Layer.h +++ b/paddle/gserver/layers/Layer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -109,7 +108,7 @@ public: virtual void waitInputValue(); /** - * Copy layer's output_ to other device. + * Copy layer's output_ to other device. * If the output layer is on another device, this is called after Layer::forward(). */ virtual void copyOutputToOtherDevice(); @@ -189,8 +188,11 @@ protected: * Reset to value zero if isValueClean = true, * Reset to grad zero if isGradClean = true. */ - void resetSpecifyOutput(Argument& output, size_t height, size_t width, - bool isValueClean, bool isGradClean); + void resetSpecifyOutput(Argument& output, + size_t height, + size_t width, + bool isValueClean, + bool isGradClean); /** * Add output argument to other devices. @@ -204,48 +206,48 @@ public: /// Register a Layer static ClassRegistrar<Layer, LayerConfig> registrar_; - /** + /** * Get the flag whether the layer needs to compute gradient. */ bool needGradient() const { return needGradient_; } - /** + /** * Set the flag whether the layer needs to compute gradient. */ void setNeedGradient(bool need) { needGradient_ = need; } - /** + /** * Set the flag whether the layer needs to re-compute sequence information, * which includes sequenceStartPositions or subSequenceStartPositions. */ void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; } - /** + /** * Get layer's name.
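The forwardDropOut path above builds its mask by thresholding a uniform random matrix against drop_rate. Reduced to plain C++ (a sketch of the same scheme, not Paddle's Matrix calls; note there is no 1/(1-p) rescaling, matching the masking shown above):

#include <random>
#include <vector>

// Keep an activation only when its uniform draw exceeds dropRate, i.e.
// mask = (uniform > dropRate) followed by an elementwise multiply.
void dropoutInPlace(std::vector<float>& act, float dropRate, unsigned seed) {
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> uniform(0.f, 1.f);
  for (float& v : act)
    if (uniform(gen) <= dropRate) v = 0.f;
}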
*/ const std::string& getName() const { return config_.name(); } - /** + /** * Get layer's type. */ const std::string& getType() const { return config_.type(); } - /** + /** * Get layer's size. */ size_t getSize() const { return config_.size(); } - /** + /** * Get layer's deviceId. */ int getDeviceId() const { return deviceId_; } - /** + /** * Add the inputLayer. */ void addPrev(LayerPtr l) { inputLayers_.push_back(l); } - /** + /** * Get the inputLayer[i]. */ const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; } @@ -265,7 +267,7 @@ public: */ const MatrixPtr& getOutputGrad() { return output_.grad; } /** - * If layer has multi-output, set output into outputMap_. + * If layer has multi-output, set output into outputMap_. */ void setOutput(const std::string& name, Argument* output) { outputMap_[name] = output; @@ -351,8 +353,8 @@ public: /** * Initialization for the sub network if there is a sub network. * @param rootNetwork root network - * @param config model config - * @param parameterTypes parameter's type + * @param config model config + * @param parameterTypes parameter's type * @param useGpu whether to use gpu or not */ virtual void initSubNetwork(NeuralNetwork* rootNetwork, @@ -391,7 +393,8 @@ public: /** * Reset the internal state variables. * Allocate them if they have not been allocated. - * This function need to called before Layer::forward() for generating sequence. + * This function needs to be called before Layer::forward() for generating + * a sequence. * * This is used for sequence generation. When generating sequence, the * calculation at current timestamp depends on the state from previous @@ -407,7 +410,7 @@ public: virtual void setState(LayerStatePtr state) {} /** - * Get layer state. + * Get layer state. * @return A copy of internal state. */ virtual LayerStatePtr getState() { return nullptr; } diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp index fb54fd26cf36e2b23deba7186c3dcdd0cc445870..2b3a50b2e29cd2291a9fc21980506baa6120563c 100644 --- a/paddle/gserver/layers/LinearChainCRF.cpp +++ b/paddle/gserver/layers/LinearChainCRF.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "LinearChainCRF.h" diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h index c33c83b25987e1b944a84d960cf6539cff1b872f..6368f2b9de2f993c6a113315be8d642784b04726 100644 --- a/paddle/gserver/layers/LinearChainCRF.h +++ b/paddle/gserver/layers/LinearChainCRF.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/math/Matrix.h" @@ -31,7 +30,8 @@ public: * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L} * + \sum_{l=1}^L x_{s_l} * + \sum_{l=2}^L w_{s_{l-1},s_l})\f$ - * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over all possible + * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over + * all possible * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
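The probability in the LinearChainCRF comment above is easiest to check against a direct implementation of the unnormalized sequence score; the forward algorithm's only extra job is computing \f$Z\f$. A self-contained sketch (illustrative names, double precision for clarity):

#include <cstddef>
#include <vector>

// score(s) = a[s_1] + b[s_L] + sum_l x[l][s_l] + sum_{l>=2} w[s_{l-1}][s_l];
// P(s) = exp(score(s)) / Z, where Z sums exp(score) over all tag sequences.
double crfSequenceScore(const std::vector<int>& s,                  // tag ids
                        const std::vector<std::vector<double>>& x,  // L x classes
                        const std::vector<double>& a,               // start weights
                        const std::vector<double>& b,               // end weights
                        const std::vector<std::vector<double>>& w)  // transitions
{
  double score = a[s.front()] + b[s.back()];
  for (std::size_t l = 0; l < s.size(); ++l) {
    score += x[l][s[l]];
    if (l > 0) score += w[s[l - 1]][s[l]];
  }
  return score;
}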
*/ LinearChainCRF(int numClasses, real* para, real* grad); diff --git a/paddle/gserver/layers/LinearChainCTC.cpp b/paddle/gserver/layers/LinearChainCTC.cpp index c0ffadbd91c78f5dcdb9fc2370aa7eb06bfb400e..3368eb4d8a796eef367042f78b8c18d47bc1330e 100644 --- a/paddle/gserver/layers/LinearChainCTC.cpp +++ b/paddle/gserver/layers/LinearChainCTC.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "LinearChainCTC.h" #include @@ -90,7 +89,9 @@ LinearChainCTC::LinearChainCTC(int numClasses, bool normByTimes) Matrix::resizeOrCreate(gradTerms_, 1, numClasses_); } -real LinearChainCTC::forward(real* softmaxSeq, int softmaxSeqLen, int* labelSeq, +real LinearChainCTC::forward(real* softmaxSeq, + int softmaxSeqLen, + int* labelSeq, int labelSeqLen) { isInvalid_ = false; totalTime_ = softmaxSeqLen; @@ -215,7 +216,9 @@ real LinearChainCTC::forward(real* softmaxSeq, int softmaxSeqLen, int* labelSeq, return -logProb_; } -void LinearChainCTC::backward(real* softmaxSeq, real* grad, int* labelSeq, +void LinearChainCTC::backward(real* softmaxSeq, + real* grad, + int* labelSeq, int labelSeqLen) { /* if the conditions for CTC computation are not met, set the grads to zeros */ if (isInvalid_) { @@ -246,9 +249,9 @@ void LinearChainCTC::backward(real* softmaxSeq, real* grad, int* labelSeq, logMul(logProb_, logActsData[i * numClasses_ + j]))) / totalTime_; } else { - grad[i * numClasses_ + j] += -safeExp(logDiv( - gradTermsData[j], - logMul(logProb_, logActsData[i * numClasses_ + j]))); + grad[i * numClasses_ + j] += -safeExp( + logDiv(gradTermsData[j], + logMul(logProb_, logActsData[i * numClasses_ + j]))); } } } diff --git a/paddle/gserver/layers/LinearChainCTC.h b/paddle/gserver/layers/LinearChainCTC.h index b09218e3e78e16bd13e9dcde8138dd68a579d4ad..0a93d2e9a6d0d697f5f081abe9fad69faac9b04b 100644 --- a/paddle/gserver/layers/LinearChainCTC.h +++ b/paddle/gserver/layers/LinearChainCTC.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -25,11 +24,15 @@ public: LinearChainCTC(int numClasses, bool normByTimes); // Calculate the negative log probability as loss - real forward(real* softmaxSeq, int softmaxSeqLen, int* labelSeq, + real forward(real* softmaxSeq, + int softmaxSeqLen, + int* labelSeq, int labelSeqLen); // calculate the gradient - void backward(real* softmaxSeq, real* softmaxSeqGrad, int* labelSeq, + void backward(real* softmaxSeq, + real* softmaxSeqGrad, + int* labelSeq, int labelSeqLen); protected: diff --git a/paddle/gserver/layers/LstmCompute.cpp b/paddle/gserver/layers/LstmCompute.cpp index ced9636d3528ace044bc925285ac5db88f2ddc4e..38057636edbea5d1d25d20740b16c319a653e42e 100644 --- a/paddle/gserver/layers/LstmCompute.cpp +++ b/paddle/gserver/layers/LstmCompute.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
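LinearChainCTC::backward above stays entirely in the log domain (logMul, logDiv, safeExp) to avoid underflow over long sequences. Minimal sketches of what such helpers conventionally compute; these definitions are assumptions for illustration, not Paddle's own implementations:

#include <cmath>
#include <limits>
#include <utility>

const double kLogZero = -std::numeric_limits<double>::infinity();

double logMul(double a, double b) { return a + b; }  // log(x*y) from log x, log y
double logDiv(double a, double b) { return a - b; }  // log(x/y)
double safeExp(double a) { return a == kLogZero ? 0.0 : std::exp(a); }

// log(x+y) computed without leaving the log domain:
double logAdd(double a, double b) {
  if (a < b) std::swap(a, b);  // ensure a >= b
  return b == kLogZero ? a : a + std::log1p(std::exp(b - a));
}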
*/ - #include "paddle/utils/Util.h" #include "hl_recurrent_apply.cuh" #include "LstmCompute.h" @@ -27,22 +26,31 @@ void LstmCompute::init(LayerConfig &config) { template <> void LstmCompute::forwardOneSequence<0>(hl_lstm_value value, int frameSize) { - hl_cpu_lstm_forward(hppl::forward::lstm(), value, - frameSize, activeNode_, activeGate_, + hl_cpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + activeNode_, + activeGate_, activeState_); } template <> -void LstmCompute::backwardOneSequence<0>(hl_lstm_value value, hl_lstm_grad grad, - int frameSize) { - hl_cpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, activeNode_, activeGate_, +void LstmCompute::backwardOneSequence<0>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize) { + hl_cpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + activeNode_, + activeGate_, activeState_); } template <> -void LstmCompute::forwardBatch<0>(hl_lstm_value value, int frameSize, - int batchSize) { +void LstmCompute::forwardBatch<0>(hl_lstm_value value, + int frameSize, + int batchSize) { for (int b = 0; b < batchSize; b++) { forwardOneSequence<0>(value, frameSize); @@ -57,8 +65,10 @@ void LstmCompute::forwardBatch<0>(hl_lstm_value value, int frameSize, } template <> -void LstmCompute::backwardBatch<0>(hl_lstm_value value, hl_lstm_grad grad, - int frameSize, int batchSize) { +void LstmCompute::backwardBatch<0>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize) { for (int b = 0; b < batchSize; b++) { backwardOneSequence<0>(value, grad, frameSize); diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h index 638acdb56d75054387f5f368eaf8afc0dbed9107..97be7218f251f21a9a50c7f8ec28e7c487420a2f 100644 --- a/paddle/gserver/layers/LstmCompute.h +++ b/paddle/gserver/layers/LstmCompute.h @@ -35,7 +35,9 @@ public: void forwardBatch(hl_lstm_value value, int frameSize, int batchSize); template - void backwardBatch(hl_lstm_value value, hl_lstm_grad grad, int frameSize, + void backwardBatch(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, int batchSize); /** @@ -51,7 +53,8 @@ public: template void forwardOneSequence(hl_lstm_value value, int frameSize); template - void backwardOneSequence(hl_lstm_value value, hl_lstm_grad grad, + void backwardOneSequence(hl_lstm_value value, + hl_lstm_grad grad, int frameSize); public: diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp index 61ad47a7fbd02f19a1a8e824b2cba3a3d114b9fc..e70a20e5c0217288b795f647f3918911e3713ceb 100644 --- a/paddle/gserver/layers/LstmLayer.cpp +++ b/paddle/gserver/layers/LstmLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
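The <0> specializations above are the CPU halves of a compile-time device dispatch; the GPU paths are selected with <1>, as the backwardBatch<1> calls elsewhere in this patch show. A standalone sketch of that idiom, detached from the real hl_* kernels (all names and bodies here are illustrative):

```cpp
#include <cstdio>

// A non-type template parameter picks the device path at compile time,
// mirroring LstmCompute's <0> (CPU) and <1> (GPU) specializations.
template <bool useGpu>
void lstmForwardOneSequence(const float* gates, int frameSize);

template <>
void lstmForwardOneSequence<false>(const float* gates, int frameSize) {
  (void)gates;
  std::printf("CPU path, %d units\n", frameSize);  // stands in for hl_cpu_lstm_forward
}

template <>
void lstmForwardOneSequence<true>(const float* gates, int frameSize) {
  (void)gates;
  std::printf("GPU path, %d units\n", frameSize);  // stands in for a kernel launch
}

int main() {
  float gates[16] = {};
  lstmForwardOneSequence<false>(gates, 4);  // resolved at compile time
  return 0;
}
```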
*/ - #include "LstmLayer.h" #include "paddle/math/Matrix.h" #include "paddle/math/BaseMatrix.h" @@ -35,14 +34,26 @@ bool LstmLayer::init(const LayerMap &layerMap, if (biasParameter_.get() != NULL) { bias_.reset(new Weight(1, getSize() * 7, biasParameter_)); if (bias_->getW()) { - localBias_ = Matrix::create(nullptr, /* height= */ 1, getSize() * 4, - /* trans= */ false, useGpu_); - checkIg_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - checkFg_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - checkOg_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + localBias_ = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); localBias_->setData(bias_->getW()->getData()); checkIg_->setData(bias_->getW()->getData() + getSize() * 4); @@ -51,14 +62,26 @@ bool LstmLayer::init(const LayerMap &layerMap, } if (bias_->getWGrad()) { - localBiasGrad_ = Matrix::create(nullptr, /* height= */ 1, getSize() * 4, - /* trans= */ false, useGpu_); - checkIgGrad_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - checkFgGrad_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - checkOgGrad_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + localBiasGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); localBiasGrad_->setData(bias_->getWGrad()->getData()); checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4); checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5); @@ -84,8 +107,8 @@ bool LstmLayer::init(const LayerMap &layerMap, void LstmLayer::resetState() { CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer"; - Matrix::resizeOrCreate(prevOutput_, 1, getSize(), /* trans= */ false, - useGpu_); + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_); prevOutput_->resize(0, getSize()); prevState_->resize(0, getSize()); @@ -138,8 +161,10 @@ void LstmLayer::forward(PassType passType) { CHECK_EQ(starts[numSequences], batchSize); Matrix::resizeOrCreate(gate_.value, - /* height= */ batchSize, getSize() * 4, - /* trans= */ false, useGpu_); + /* height= */ batchSize, + getSize() * 4, + /* trans= */ false, + useGpu_); if (prevOutput_) { size_t prevNumSeq = useBatch_ ? 
numSequences : 1; if (prevOutput_->getHeight() == 0) { @@ -151,18 +176,29 @@ void LstmLayer::forward(PassType passType) { CHECK_EQ(prevOutput_->getHeight(), prevNumSeq) << "the number of sequences must be the same"; } - Matrix::resizeOrCreate(totalState_, prevState_->getHeight() + batchSize, - getSize(), /*trans*/ false, useGpu_); - state_.value = Matrix::create(nullptr, /* height= */ batchSize, getSize(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(totalState_, + prevState_->getHeight() + batchSize, + getSize(), + /*trans*/ false, + useGpu_); + state_.value = Matrix::create(nullptr, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); state_.value->setData(totalState_->getData() + prevState_->getHeight() * getSize()); } else { - Matrix::resizeOrCreate(state_.value, /* height= */ batchSize, getSize(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(state_.value, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); } Matrix::resizeOrCreate(preOutput_.value, - /* height= */ batchSize, getSize(), /* trans= */ false, + /* height= */ batchSize, + getSize(), + /* trans= */ false, useGpu_); if (!useBatch_) { @@ -171,7 +207,7 @@ void LstmLayer::forward(PassType passType) { if (!useSeqParallel_) { forwardBatch(batchSize, numSequences, starts, input.value); } else { - const int* starts = input.sequenceStartPositions->getData(useGpu_); + const int *starts = input.sequenceStartPositions->getData(useGpu_); forwardSeqParallel(batchSize, numSequences, starts, input.value); } } @@ -188,13 +224,19 @@ void LstmLayer::backward(const UpdateCallback &callback) { size_t numSequences = input.getNumSequences(); Matrix::resizeOrCreate(gate_.grad, - /* height= */ batchSize, getSize() * 4, - /* trans= */ false, useGpu_); + /* height= */ batchSize, + getSize() * 4, + /* trans= */ false, + useGpu_); Matrix::resizeOrCreate(state_.grad, - /* height= */ batchSize, getSize(), /* trans= */ false, + /* height= */ batchSize, + getSize(), + /* trans= */ false, useGpu_); Matrix::resizeOrCreate(preOutput_.grad, - /* height= */ batchSize, getSize(), /* trans= */ false, + /* height= */ batchSize, + getSize(), + /* trans= */ false, useGpu_); state_.grad->zero(); @@ -205,7 +247,7 @@ void LstmLayer::backward(const UpdateCallback &callback) { if (!useSeqParallel_) { backwardBatch(batchSize, numSequences, starts, input.grad); } else { - const int* starts = input.sequenceStartPositions->getData(useGpu_); + const int *starts = input.sequenceStartPositions->getData(useGpu_); backwardSeqParallel(batchSize, numSequences, starts, input.grad); } } @@ -216,8 +258,10 @@ void LstmLayer::backward(const UpdateCallback &callback) { weight_->getParameterPtr()->incUpdate(callback); } -void LstmLayer::forwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue) { +void LstmLayer::forwardSequence(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str()); gate_.value->assign(*inputValue); if (bias_) { @@ -255,10 +299,16 @@ void LstmLayer::forwardSequence(int batchSize, size_t numSequences, } }; - MatrixPtr frameGate = Matrix::create(nullptr, /* height= */ 1, getSize() * 4, - /* trans= */ false, useGpu_); - MatrixPtr frameOutput = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + MatrixPtr frameGate = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + MatrixPtr frameOutput = 
Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); if (!reversed_) { if (prevState_) { @@ -316,8 +366,10 @@ void LstmLayer::forwardSequence(int batchSize, size_t numSequences, } } -void LstmLayer::backwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad) { +void LstmLayer::backwardSequence(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str()); MatrixPtr weightT = weight_->getW()->getTranspose(); @@ -381,10 +433,16 @@ void LstmLayer::backwardSequence(int batchSize, size_t numSequences, } }; - MatrixPtr frameGate = Matrix::create(nullptr, /* height= */ 1, getSize() * 4, - /* trans= */ false, useGpu_); - MatrixPtr frameOutput = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + MatrixPtr frameGate = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + MatrixPtr frameOutput = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); { AsyncGpuBlock asyncGpuBlock; @@ -422,11 +480,15 @@ void LstmLayer::backwardSequence(int batchSize, size_t numSequences, if (!reversed_) { weight_->getWGrad()->mul( output_.value->subMatrix(start, length - 1)->getTranspose(), - gate_.grad->subMatrix(start + 1, length - 1), 1, 1); + gate_.grad->subMatrix(start + 1, length - 1), + 1, + 1); } else { weight_->getWGrad()->mul( output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - gate_.grad->subMatrix(start, length - 1), 1, 1); + gate_.grad->subMatrix(start, length - 1), + 1, + 1); } } } @@ -440,8 +502,10 @@ void LstmLayer::backwardSequence(int batchSize, size_t numSequences, } } -void LstmLayer::forwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue) { +void LstmLayer::forwardBatch(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str()); hl_lstm_value lstmValue; @@ -452,8 +516,8 @@ void LstmLayer::forwardBatch(int batchSize, size_t numSequences, if (!batchValue_) { batchValue_.reset(new SequenceToBatch(useGpu_)); } - batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_, - prevOutput_ ? true : false); + batchValue_->resizeOrCreateBatch( + batchSize, numSequences, starts, reversed_, prevOutput_ ? 
true : false); batchValue_->resizeOrCreate(*output_.value); batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); @@ -479,8 +543,11 @@ void LstmLayer::forwardBatch(int batchSize, size_t numSequences, MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize); gateValue->mul(batch1, weight_->getW(), 1, 1); } else if (prevOutput_) { - Matrix::resizeOrCreate(prevBatchOutput2_, gateValue->getHeight(), - getSize(), false, useGpu_); + Matrix::resizeOrCreate(prevBatchOutput2_, + gateValue->getHeight(), + getSize(), + false, + useGpu_); batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_); gateValue->mul(prevBatchOutput2_, weight_->getW(), 1, 1); @@ -525,8 +592,10 @@ void LstmLayer::getPrevBatchState(size_t numSequences) { batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value); } -void LstmLayer::backwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad) { +void LstmLayer::backwardBatch(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str()); hl_lstm_value lstmValue; @@ -593,11 +662,11 @@ void LstmLayer::backwardBatch(int batchSize, size_t numSequences, } } if (useGpu_) { - LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, - getSize(), batchSize); + LstmCompute::backwardBatch<1>( + lstmValue, lstmGrad, getSize(), batchSize); } else { - LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, - getSize(), batchSize); + LstmCompute::backwardBatch<0>( + lstmValue, lstmGrad, getSize(), batchSize); } } @@ -611,8 +680,8 @@ void LstmLayer::backwardBatch(int batchSize, size_t numSequences, MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize); weight_->getWGrad()->mul(outputValue->getTranspose(), gateGrad, 1, 1); } else if (prevOutput_ && weight_->getWGrad()) { - weight_->getWGrad()->mul(prevBatchOutput2_->getTranspose(), gateGrad, 1, - 1); + weight_->getWGrad()->mul( + prevBatchOutput2_->getTranspose(), gateGrad, 1, 1); } } } @@ -625,8 +694,10 @@ void LstmLayer::backwardBatch(int batchSize, size_t numSequences, } } -void LstmLayer::forwardSeqParallel(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue) { +void LstmLayer::forwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str()); gate_.value->assign(*inputValue); if (bias_) { @@ -641,14 +712,27 @@ void LstmLayer::forwardSeqParallel(int batchSize, size_t numSequences, real *checkFg = checkFg_->getData(); real *checkOg = checkOg_->getData(); real *weight = weight_->getW()->getData(); - hl_lstm_parallel_forward( - gateValue, stateValue, preOutputValue, outputValue, checkIg, checkFg, - checkOg, weight, starts, getSize(), numSequences, reversed_, activeNode_, - activeGate_, activeState_); + hl_lstm_parallel_forward(gateValue, + stateValue, + preOutputValue, + outputValue, + checkIg, + checkFg, + checkOg, + weight, + starts, + getSize(), + numSequences, + reversed_, + activeNode_, + activeGate_, + activeState_); } -void LstmLayer::backwardSeqParallel(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad) { +void LstmLayer::backwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { REGISTER_TIMER_INFO("LstmBwSeqParallelTime", getName().c_str()); real *gateValue = gate_.value->getData(); real *gateGrad = gate_.grad->getData(); @@ -675,11 +759,27 @@ void LstmLayer::backwardSeqParallel(int batchSize, 
size_t numSequences, checkOgGrad = nullptr; } - hl_lstm_parallel_backward_data( gateValue, gateGrad, stateValue, stateGrad, preOutputValue, preOutputGrad, outputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, checkOgGrad, weight, starts, getSize(), numSequences, reversed_, activeNode_, activeGate_, activeState_); + hl_lstm_parallel_backward_data(gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + outputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + weight, + starts, + getSize(), + numSequences, + reversed_, + activeNode_, + activeGate_, + activeState_); if (inputGrad) { inputGrad->add(*gate_.grad); @@ -691,9 +791,14 @@ void LstmLayer::backwardSeqParallel(int batchSize, size_t numSequences, real *outputValue = output_.value->getData(); if (weight_->getWGrad()) { real *weightGrad = weight_->getWGrad()->getData(); - hl_lstm_parallel_backward_weight(weightGrad, outputValue, gateGrad, - starts, getSize(), batchSize, - numSequences, reversed_); + hl_lstm_parallel_backward_weight(weightGrad, + outputValue, + gateGrad, + starts, + getSize(), + batchSize, + numSequences, + reversed_); } } diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h index e080a401416d55c8684342e00313ae4d5c9cf4e0..5b936ff44ef1bc26850c5051f4d5561529002cd4 100644 --- a/paddle/gserver/layers/LstmLayer.h +++ b/paddle/gserver/layers/LstmLayer.h @@ -97,12 +97,16 @@ protected: * @param starts The start position of each sample. * @param inputValue The input values. */ - void forwardSequence(int batchSize, size_t numSequences, const int *starts, + void forwardSequence(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputValue); /** * Compute lstm backward, sequence by sequence. */ - void backwardSequence(int batchSize, size_t numSequences, const int *starts, + void backwardSequence(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputGrad); /** @@ -121,12 +125,16 @@ protected: * } * @endcode */ - void forwardBatch(int batchSize, size_t numSequences, const int *starts, + void forwardBatch(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputValue); /** * Compute lstm backward, batch by batch. */ - void backwardBatch(int batchSize, size_t numSequences, const int *starts, + void backwardBatch(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputGrad); /** @@ -134,13 +142,17 @@ protected: * batch value. It launches one kernel to compute forward * propagation in parallel at the sequence level. */ - void forwardSeqParallel(int batchSize, size_t numSequences, const int *starts, + void forwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputValue); /** * Backward propagation corresponding to forwardSeqParallel. */ - void backwardSeqParallel(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad); + void backwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad); /** * This function is used for sequence generation, to get the output after * forwardBatch. diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp index fb0fdbf7e9c9a1a479b47ecf9463b26393642be2..e7a8d519f2dc5eade613f3ad1981434ae8d59b7c 100644 --- a/paddle/gserver/layers/LstmStepLayer.cpp +++ b/paddle/gserver/layers/LstmStepLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ - #include "Layer.h" #include "LstmCompute.h" #include "paddle/utils/Stat.h" @@ -49,24 +48,36 @@ bool LstmStepLayer::init(const LayerMap& layerMap, if (!Layer::init(layerMap, parameterMap)) return false; CHECK_EQ(2U, inputLayers_.size()); - checkIg_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkFg_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkOg_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkIgGrad_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkFgGrad_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkOgGrad_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); if (biasParameter_.get() != NULL) { CHECK_EQ(getSize() * 3, biasParameter_->getSize()); @@ -101,12 +112,21 @@ void LstmStepLayer::forward(PassType passType) { CHECK_EQ(getSize(), prevState.value->getWidth()); int batchSize = input.getBatchSize(); reserveOutput(batchSize, getSize()); - resetSpecifyOutput(state_, batchSize, getSize(), /* isValueClean */ false, + resetSpecifyOutput(state_, + batchSize, + getSize(), + /* isValueClean */ false, /* isGradClean */ true); - resetSpecifyOutput(gate_, batchSize, getSize() * 4, - /* isValueClean */ false, /* isGradClean */ false); - resetSpecifyOutput(stateActive_, batchSize, getSize(), - /* isValueClean */ false, /* isGradClean */ false); + resetSpecifyOutput(gate_, + batchSize, + getSize() * 4, + /* isValueClean */ false, + /* isGradClean */ false); + resetSpecifyOutput(stateActive_, + batchSize, + getSize(), + /* isValueClean */ false, + /* isGradClean */ false); gate_.value->assign(*input.value); hl_lstm_value lstmValue; @@ -156,11 +176,9 @@ void LstmStepLayer::backward(const UpdateCallback& callback) { lstmGrad.checkOgGrad = checkOgGrad_->getData(); if (useGpu_) { - LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), - batchSize); + LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), batchSize); } else { - LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), - batchSize); + LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), batchSize); } if (input.grad) { diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp index 8ca92dee6d0720ad385ba85da3db2ba36372c43d..93f52c1c314105f9d0b2530218d43045224df948 100644 --- a/paddle/gserver/layers/MDLstmLayer.cpp +++ b/paddle/gserver/layers/MDLstmLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "LstmLayer.h" #include "paddle/math/Matrix.h" #include "paddle/math/BaseMatrix.h" @@ -106,7 +105,8 @@ public: bool end() { return end_; } - bool getPrePos(const std::vector& delays, int idx, + bool getPrePos(const std::vector& delays, + int idx, std::vector& prePos) { bool isAvial = true; prePos.clear(); @@ -129,7 +129,8 @@ public: return isAvial; } - bool getNextPos(const std::vector& delays, int idx, + bool getNextPos(const std::vector& delays, + int idx, std::vector& nextPos) { bool isAvial = true; nextPos.clear(); @@ -232,24 +233,46 @@ bool MDLstmLayer::init(const LayerMap& layerMap, new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0])); if (biasParameter_.get() != NULL) { bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_)); - localBias_ = - Matrix::create(nullptr, /* height= */ 1, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); - checkIg_ = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - checkFg_ = Matrix::create(nullptr, /* height= */ numDims_, numBlocks_, - /* trans= */ false, useGpu_); - checkOg_ = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - localBiasGrad_ = - Matrix::create(nullptr, /* height= */ 1, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); - checkIgGrad_ = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - checkFgGrad_ = Matrix::create(nullptr, /* height= */ numDims_, numBlocks_, - /* trans= */ false, useGpu_); - checkOgGrad_ = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + localBias_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + localBiasGrad_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); localBias_->setData(bias_->getW()->getData()); checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_)); @@ -315,49 +338,79 @@ void MDLstmLayer::forward(PassType passType) { frameOutput_.reserve(batchSize); Matrix::resizeOrCreate(gate_.value, - /* height= */ batchSize, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); + /* height= */ batchSize, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); for (int i = frameGate_.size(); i < batchSize; i++) { Argument arg; - arg.value = - Matrix::create(nullptr, /* height= */ 1, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); - arg.grad = - Matrix::create(nullptr, /* height= */ 1, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); 
frameGate_.push_back(arg); } for (int i = frameInputGate_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); frameInputGate_.push_back(arg); } for (int i = frameForgetGate_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ numDims_, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ numDims_, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); frameForgetGate_.push_back(arg); } for (int i = frameOutputGate_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); frameOutputGate_.push_back(arg); } for (int i = frameInputNode_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); frameInputNode_.push_back(arg); } for (int i = frameState_.size(); i < batchSize; i++) { @@ -374,10 +427,16 @@ void MDLstmLayer::forward(PassType passType) { } for (int i = frameOutput_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); frameOutput_.push_back(arg); } @@ -432,13 +491,19 @@ void MDLstmLayer::forwardGate2OutputSequence(int start, *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0); MatrixPtr fgGateOneDim = Matrix::create( - frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); MatrixPtr checkFgOneDim = - Matrix::create(checkFg_->getData() + i * numBlocks_, 1.0, numBlocks_, - false, useGpu_); - fgGateOneDim->addDotMul(*frameState_[start + preOffsetV[i]].value, - *checkFgOneDim, 1.0, 1.0); + Matrix::create(checkFg_->getData() + i * numBlocks_, + 1.0, + numBlocks_, + false, + useGpu_); + fgGateOneDim->addDotMul( + *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0); } } 
activationGate_->forward(frameInputGate_[idxCurr]); @@ -449,18 +514,22 @@ void MDLstmLayer::forwardGate2OutputSequence(int start, for (int i = 0; i < numDims_; i++) { if (preOffsetV[i] >= 0) { MatrixPtr fgGateOneDim = Matrix::create( - frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); frameState_[idxCurr].value->addDotMul( *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0); } } frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value, - *frameInputGate_[idxCurr].value, 1.0, + *frameInputGate_[idxCurr].value, + 1.0, 1.0); - frameOutputGate_[idxCurr].value->addDotMul(*frameState_[idxCurr].value, - *checkOg_, 1.0, 1.0); + frameOutputGate_[idxCurr].value->addDotMul( + *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0); activationGate_->forward(frameOutputGate_[idxCurr]); framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value)); @@ -493,8 +562,10 @@ void MDLstmLayer::backward(const UpdateCallback& callback) { size_t numSequences = input.getNumSequences(); Matrix::resizeOrCreate(gate_.grad, - /* height= */ batchSize, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); + /* height= */ batchSize, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); for (int i = 0; i < batchSize; i++) { if (frameState_[i].grad == NULL) @@ -576,8 +647,8 @@ void MDLstmLayer::backwardGate2OutputSequence(int start, *framePreOutput_[idxCurr].value); activationGate_->backward(frameOutputGate_[idxCurr]); - frameState_[idxCurr].grad->addDotMul(*frameOutputGate_[idxCurr].grad, - *checkOg_, 1.0, 1.0); + frameState_[idxCurr].grad->addDotMul( + *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0); for (int i = 0; i < numDims_; i++) { if (nextOffsetV[i] >= 0) { frameState_[idxCurr].grad->addDotMul( @@ -586,18 +657,26 @@ void MDLstmLayer::backwardGate2OutputSequence(int start, MatrixPtr fgGateOneDimGrad = Matrix::create( frameForgetGate_[start + nextOffsetV[i]].grad->getData() + i * numBlocks_, - 1, numBlocks_, false, useGpu_); + 1, + numBlocks_, + false, + useGpu_); MatrixPtr fgGateOneDimVal = Matrix::create( frameForgetGate_[start + nextOffsetV[i]].value->getData() + i * numBlocks_, - 1, numBlocks_, false, useGpu_); + 1, + numBlocks_, + false, + useGpu_); MatrixPtr checkFgOneDim = Matrix::create( checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_); - frameState_[idxCurr].grad->addDotMul(*fgGateOneDimGrad, *checkFgOneDim, - 1.0, 1.0); frameState_[idxCurr].grad->addDotMul( - *frameState_[start + nextOffsetV[i]].grad, *fgGateOneDimVal, 1.0, + *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0); + frameState_[idxCurr].grad->addDotMul( + *frameState_[start + nextOffsetV[i]].grad, + *fgGateOneDimVal, + 1.0, 1.0); } } @@ -611,11 +690,15 @@ void MDLstmLayer::backwardGate2OutputSequence(int start, for (int i = 0; i < numDims_; i++) { if (preOffsetV[i] >= 0) { MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad, *frameState_[start + preOffsetV[i]].value, - 1.0, 1.0); + 1.0, + 1.0); } } @@ -627,22 +710,30 @@ void MDLstmLayer::backwardGate2OutputSequence(int start, for (int i = 0; i < numDims_; i++) { if (preOffsetV[i] >= 0) { 
checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad, - *frameState_[start + preOffsetV[i]].value, 1.0, + *frameState_[start + preOffsetV[i]].value, + 1.0, 1.0); MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); MatrixPtr checkFgOneDimGrad = - Matrix::create(checkFgGrad_->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + Matrix::create(checkFgGrad_->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad, *frameState_[start + preOffsetV[i]].value, - 1.0, 1.0); + 1.0, + 1.0); } } - checkOgGrad_->addDotMul(*frameOutputGate_[idxCurr].grad, - *frameState_[idxCurr].value, 1.0, 1.0); + checkOgGrad_->addDotMul( + *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0); } } @@ -660,7 +751,9 @@ void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) { if (weight_->getWGrad()) { weight_->getWGrad()->mul( frameOutput_[start + preOffset].value->getTranspose(), - frameGate_[start + offset].grad, 1.0, 1.0); + frameGate_[start + offset].grad, + 1.0, + 1.0); } } } diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/gserver/layers/MaxIdLayer.cpp index b80de87b4e9cc56b272f172c304027026039be06..22670fa1210e1199266cb16a1f08826c3010a84e 100644 --- a/paddle/gserver/layers/MaxIdLayer.cpp +++ b/paddle/gserver/layers/MaxIdLayer.cpp @@ -45,7 +45,10 @@ public: const Argument& input = getInput(0); size_t batchSize = input.getBatchSize(); IVector::resizeOrCreate(output_.ids, batchSize * beamSize_, useGpu_); - Matrix::resizeOrCreate(output_.in, batchSize, beamSize_, false, + Matrix::resizeOrCreate(output_.in, + batchSize, + beamSize_, + false, /* useGpu */ useGpu_); output_.value = nullptr; input.value->rowMax(*output_.ids, *output_.in); diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/gserver/layers/MaxLayer.cpp index 226e0ea87dbd4a26a942cc258c5dd42388159f86..42bc6bb815232ff8dfa6b49ebf47b10c252e28c5 100644 --- a/paddle/gserver/layers/MaxLayer.cpp +++ b/paddle/gserver/layers/MaxLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "MaxLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -21,55 +20,11 @@ namespace paddle { REGISTER_LAYER(max, MaxLayer); -bool MaxLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); - return true; -} - void MaxLayer::forward(PassType passType) { - Layer::forward(passType); - // max layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); - - size_t dim = getSize(); - const Argument& input = getInput(0); - int64_t newBatchSize = - type_ ? input.getNumSubSequences() : input.getNumSequences(); - ICpuGpuVectorPtr startPositions = - type_ ? 
input.subSequenceStartPositions - : input.sequenceStartPositions; - auto starts = startPositions->getVector(useGpu_); - size_t numSequences = startPositions->getSize() - 1; + SequencePoolLayer::forward(passType); - CHECK_EQ(dim, input.value->getWidth()); - CHECK_EQ(numSequences, (size_t)newBatchSize); - CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize()); - if (type_) { - // when trans_type = seq, input must hasSubseq - CHECK_EQ(input.hasSubseq(), 1UL); - } - - // reset output: resize to "num of sequences", not "batch size". - resetOutput(newBatchSize, dim); - - IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_)); + IVector::resizeOrCreate( + maxIndex_, newBatchSize_ * getSize(), useGpu(deviceId_)); maxIndex_->zeroMem(); MatrixPtr inputValue = getInputValue(0); @@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) { { REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str()); - outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_); - } - - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no cpuSequenceStartPositions. - * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new cpuSequenceStartPositions. - */ - if (type_) { - output_.degradeSequence(input, useGpu_); + outputValue->maxSequenceForward( + *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_); } if (config_.output_max_index()) { @@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) { void MaxLayer::backward(const UpdateCallback& callback) { CHECK(!config_.output_max_index()) << "backward is not available when output_max_index is set"; - /* Do derivation */ { backwardActivation(); } - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } + SequencePoolLayer::backward(callback); MatrixPtr inputGrad = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); if (inputGrad) { - ICpuGpuVectorPtr starts = - type_ ? getInput(0).subSequenceStartPositions - : getInput(0).sequenceStartPositions; REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str()); - inputGrad->maxSequenceBackward(*outputGrad, - *(starts->getVector(useGpu_)), *maxIndex_); + inputGrad->maxSequenceBackward( + *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_); } } diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h index b4c34e665d926dd5e195fab685b1afe6d4fb579c..74df0b8b576c8ea1eef56d465e8c4ceee5019fdb 100644 --- a/paddle/gserver/layers/MaxLayer.h +++ b/paddle/gserver/layers/MaxLayer.h @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#include "Layer.h" +#include "SequencePoolLayer.h" #include "paddle/math/Matrix.h" #include "paddle/utils/ThreadLocal.h" @@ -24,29 +23,30 @@ namespace paddle { /** * A layer for "internal max" for sequence input. * Input: one or more sequences. Each sequence contains some instances. 
- * If MaxLevel = kNonSeq: + * If SequenceLevel = kNonSeq: * Output: output size is the number of input sequences (NOT input instances) * output[i] = max_{for each instance in this sequence}{input[i]} - * If MaxLevel = kSeq: + * If SequenceLevel = kSeq: * The input sequence must contain sub-sequences * Output: output size is the number of input sub-sequences * output[i] = max_{for each instance in this sub-sequence}{input[i]} + * + * The config file api is pooling_layer. */ -class MaxLayer : public Layer { +class MaxLayer : public SequencePoolLayer { protected: - std::unique_ptr biases_; // maxIndex_[i][j] = k : the value at (i, j) is from input[k]. IVectorPtr maxIndex_; - int type_; public: - explicit MaxLayer(const LayerConfig& config) : Layer(config) {} - enum MaxLevel {kNonSeq = 0, kSeq = 1 }; + explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {} ~MaxLayer() {} - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) { + return SequencePoolLayer::init(layerMap, parameterMap); + } void forward(PassType passType); void backward(const UpdateCallback& callback = nullptr); diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/gserver/layers/MaxOutLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a3de069bf7a6c9217e4adfeb2e65409955cc569c --- /dev/null +++ b/paddle/gserver/layers/MaxOutLayer.cpp @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
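With the pooling bookkeeping now inherited from SequencePoolLayer, all MaxLayer itself adds is the element-wise max and the provenance indices. A minimal CPU sketch of those semantics (independent per dimension; maxIdx plays the role of maxIndex_ so that backward can route gradients to the winning rows):

```cpp
#include <vector>

// Max-pool a batch of sequences down to one row per sequence. starts has
// numSeq + 1 entries, as in sequenceStartPositions, with the last entry
// equal to the total number of rows.
void maxSequenceForward(const std::vector<std::vector<float>>& in,
                        const std::vector<int>& starts,
                        std::vector<std::vector<float>>& out,
                        std::vector<std::vector<int>>& maxIdx) {
  size_t dim = in[0].size();
  size_t numSeq = starts.size() - 1;
  out.assign(numSeq, std::vector<float>(dim));
  maxIdx.assign(numSeq, std::vector<int>(dim));
  for (size_t s = 0; s < numSeq; ++s) {
    for (size_t d = 0; d < dim; ++d) {
      int best = starts[s];
      for (int r = starts[s]; r < starts[s + 1]; ++r)
        if (in[r][d] > in[best][d]) best = r;
      out[s][d] = in[best][d];
      maxIdx[s][d] = best;  // backward adds the out-grad only at this row
    }
  }
}
```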
*/ + +#include "MaxOutLayer.h" +#include "hl_gpu.h" +#include "hl_cnn.h" + +namespace paddle { + +REGISTER_LAYER(maxout, MaxOutLayer); + +size_t MaxOutLayer::getSize() { + const MaxOutConfig& maxoutConf = config_.inputs(0).maxout_conf(); + imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imgSizeH_ == 0) { + imgSizeH_ = maxoutConf.img_size_y(); + } + if (imgSizeW_ == 0) { + imgSizeW_ = maxoutConf.img_size_x(); + } + + featLen_ = imgSizeH_ * imgSizeW_; + size_t layerSize = featLen_ * outputChannels_; + + getOutput().setFrameHeight(imgSizeH_); + getOutput().setFrameWidth(imgSizeW_); + + return layerSize; +} + +bool MaxOutLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* the size of inputs for maxout-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + + const MaxOutConfig& conf = config_.inputs(0).maxout_conf(); + groups_ = conf.groups(); + channels_ = conf.channels(); + CHECK_EQ(channels_ % groups_, 0UL); + outputChannels_ = channels_ / groups_; + + return true; +} + +void MaxOutLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + /* note: one sample corresponds to one column */ + size_t batchSize = getInput(0).getBatchSize(); + size_t size = getSize(); + resetOutput(batchSize, size); + MatrixPtr inputV = getInputValue(0); + MatrixPtr outV = getOutputValue(); + + IVector::resizeOrCreate(maxoutId_, size * batchSize, useGpu_); + outV->maxoutForward(*inputV, *maxoutId_, outputChannels_, groups_); +} + +void MaxOutLayer::backward(const UpdateCallback& callback) { + (void)callback; + + /* Do derivation */ + MatrixPtr inputG = getInputGrad(0); + MatrixPtr outG = getOutputGrad(); + + if (inputG) { + inputG->maxoutBackward(*outG, *maxoutId_, outputChannels_, groups_); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..9011a5c332b17a2f697380b1afb40ad9de504b91 --- /dev/null +++ b/paddle/gserver/layers/MaxOutLayer.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * A layer to do maxout on conv layer output. + * Input: output of a conv layer. + * Output: feature map size same as input. Channel is (input channel) / groups. + * So the number of input channels must be divisible by groups. + * + * The config file api is maxout_layer.
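A minimal CPU sketch of the maxout operation the new layer performs; that each output channel takes the max over `groups_` consecutive input channels is an assumption of this sketch, consistent with outputChannels_ = channels_ / groups_ but not spelled out in the hunk:

```cpp
#include <vector>

// Maxout over channel groups: the C input channels are split into
// C / groups consecutive groups, and each output channel is the
// element-wise max of its group. featLen = imgSizeH * imgSizeW.
// maxoutId records the winning input channel for the backward pass.
void maxoutForward(const std::vector<float>& in,  // C * featLen
                   std::vector<float>& out,       // (C / groups) * featLen
                   std::vector<int>& maxoutId,
                   int channels, int groups, int featLen) {
  int outChannels = channels / groups;
  out.assign(outChannels * featLen, 0.f);
  maxoutId.assign(outChannels * featLen, 0);
  for (int oc = 0; oc < outChannels; ++oc) {
    for (int p = 0; p < featLen; ++p) {
      int best = oc * groups;  // first input channel of this group
      for (int g = 1; g < groups; ++g)
        if (in[(oc * groups + g) * featLen + p] > in[best * featLen + p])
          best = oc * groups + g;
      out[oc * featLen + p] = in[best * featLen + p];
      maxoutId[oc * featLen + p] = best;
    }
  }
}
```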
+ */ + +class MaxOutLayer : public Layer { +protected: + size_t groups_; + size_t imgSizeH_, imgSizeW_; + /// outputChannels_ = channels_ / groups_ + size_t channels_, outputChannels_; + /// feature length = imgSizeH_ * imgSizeW_ + size_t featLen_; + IVectorPtr maxoutId_; + +public: + /// return imgSizeH_ * imgSizeW_ * outputChannels_; + size_t getSize(); + + explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {} + virtual ~MaxOutLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/MixedLayer.cpp b/paddle/gserver/layers/MixedLayer.cpp index 054ddd3a228edd78c5a451f445e02afda2985b9a..1392188fcae715734d96b1402924515fa3618965 100644 --- a/paddle/gserver/layers/MixedLayer.cpp +++ b/paddle/gserver/layers/MixedLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "MixedLayer.h" @@ -29,8 +28,8 @@ bool MixedLayer::init(const LayerMap& layerMap, projections_.resize(inputLayers_.size()); for (size_t i = 0; i < inputLayers_.size(); i++) { if (config_.inputs(i).has_proj_conf()) { - projections_[i].reset(Projection::create(config_.inputs(i).proj_conf(), - parameters_[i], useGpu_)); + projections_[i].reset(Projection::create( + config_.inputs(i).proj_conf(), parameters_[i], useGpu_)); } else { CHECK(!parameters_[i]) << "should no parameters for operators"; } @@ -41,9 +40,12 @@ bool MixedLayer::init(const LayerMap& layerMap, } operators_.emplace_back(Operator::create(operator_conf, useGpu_)); } + /* initialize biases_ */ if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + sharedBias_ = config_.shared_biases(); + size_t psize = config_.bias_size(); + biases_ = std::unique_ptr(new Weight(1, psize, biasParameter_)); } return true; @@ -119,12 +121,6 @@ void MixedLayer::forward(PassType passType) { MatrixPtr outV = getOutputValue(); - /* add the bias-vector */ - if (biases_.get() != NULL) { - REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); - outV->addBias(*(biases_->getW()), 1); - } - for (size_t i = 0; i != inputLayers_.size(); ++i) { if (projections_[i]) { projections_[i]->forward(&getInput(i), &output_, passType); @@ -140,6 +136,12 @@ void MixedLayer::forward(PassType passType) { op->forward(ins, &output_, passType); } + /* add the bias-vector */ + if (biases_.get() != NULL) { + REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); + outV->addBias(*(biases_->getW()), 1, sharedBias_); + } + /* activation */ { REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); forwardActivation(); @@ -154,7 +156,7 @@ void MixedLayer::backward(const UpdateCallback& callback) { if (biases_ && biases_->getWGrad()) { REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h index 9bac1355bd21ff2b949e593249ee2cd9063c3c75..271e0c2538d3b7239a5d54ec43180dddff569b76 100644 --- a/paddle/gserver/layers/MixedLayer.h +++ b/paddle/gserver/layers/MixedLayer.h @@ -12,7 +12,6 @@ WITHOUT 
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -22,8 +21,8 @@ limitations under the License. */ namespace paddle { /** - * A mixed layer has multiple input layers. - * Each input layer was processed by a Projection or Operator. + * A mixed layer has multiple input layers. + * Each input layer is processed by a Projection or Operator. * The results of all projections or Operators are summed together with bias * (if configured), and then go through an activation function and dropout * (if configured). @@ -43,7 +42,7 @@ public: virtual void backward(const UpdateCallback& callback = nullptr); virtual void resetState(); /** - * setState() should be called after getState(). + * setState() should be called after getState(). * Argument state consists of all projections' states. */ virtual void setState(LayerStatePtr state); @@ -58,5 +57,6 @@ protected: /// the matrix size of projection state std::vector projectionStateMatrixSize_; std::unique_ptr biases_; + bool sharedBias_; }; } // namespace paddle diff --git a/paddle/gserver/layers/MultinomialSampler.cpp b/paddle/gserver/layers/MultinomialSampler.cpp index 518dc0c60cbdc2a95b7eb9c8ff33dd6a9fb87c98..e85dca72d3162d857e768221e970fe8e3951ae9c 100644 --- a/paddle/gserver/layers/MultinomialSampler.cpp +++ b/paddle/gserver/layers/MultinomialSampler.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "MultinomialSampler.h" namespace paddle { diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h index 442124704ac0a9bdfba7ce67da279e2bc8e03394..59683d2ee29924e76ca11eb43fbd8cd175c3c357 100644 --- a/paddle/gserver/layers/MultinomialSampler.h +++ b/paddle/gserver/layers/MultinomialSampler.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp index a70172d9a6344b704cd775ce872186273d2aa4b9..c681eb0623ab7b8426fe34ce6817a3f5f4ad8246 100644 --- a/paddle/gserver/layers/MultiplexLayer.cpp +++ b/paddle/gserver/layers/MultiplexLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp index 4faebe5d2ad6f94c36de52b36b1a0828e7710005..50b29cdea5a352093c0508995da4cf3e2afcc995 100644 --- a/paddle/gserver/layers/NCELayer.cpp +++ b/paddle/gserver/layers/NCELayer.cpp @@ -23,7 +23,8 @@ namespace paddle { /** * Noise-contrastive estimation. * Implements the method in the following paper: - * A fast and simple algorithm for training neural probabilistic language models. + * A fast and simple algorithm for training neural probabilistic language + * models. * * The config file api is nce_layer.
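As the NCELayer hunks just below show, each sampled (input row, class row) pair is scored with a dot product against the weight matrix, and the backward pass scatters gradients with axpy along the same pairs. A minimal sketch of that scoring step (Sample is a stand-in for the layer's internal sample record; sampleOut must be pre-sized, and in the real layer it already holds the bias term from forwardBias):

```cpp
#include <vector>

struct Sample { int sampleId; int labelId; };  // (batch row, class) pair

float dotProduct(int dim, const float* a, const float* b) {
  float s = 0;
  for (int i = 0; i < dim; ++i) s += a[i] * b[i];
  return s;
}

// Accumulate per-sample scores: input row sampleId against weight row labelId.
void nceScores(const std::vector<Sample>& samples,
               const std::vector<std::vector<float>>& input,   // rows: batch
               const std::vector<std::vector<float>>& weight,  // rows: classes
               std::vector<float>& sampleOut) {
  int dim = static_cast<int>(input[0].size());
  for (size_t i = 0; i < samples.size(); ++i)
    sampleOut[i] += dotProduct(dim,
                               input[samples[i].sampleId].data(),
                               weight[samples[i].labelId].data());
}
```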
*/ @@ -180,8 +181,11 @@ public: int size = getSize(); resetOutput(batchSize, size); - Matrix::resizeOrCreate(sampleOut_.value, 1, samples_.size(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(sampleOut_.value, + 1, + samples_.size(), + /* trans= */ false, + useGpu_); forwardBias(); @@ -195,8 +199,11 @@ public: } void backward(const UpdateCallback& callback) { - Matrix::resizeOrCreate(sampleOut_.grad, 1, samples_.size(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(sampleOut_.grad, + 1, + samples_.size(), + /* trans= */ false, + useGpu_); backwardCost(); @@ -241,7 +248,8 @@ public: real* sampleOut = sampleOut_.value->getData(); for (size_t i = 0; i < samples_.size(); ++i) { - sampleOut[i] += dotProduct(dim, inputMat->getRowBuf(samples_[i].sampleId), + sampleOut[i] += dotProduct(dim, + inputMat->getRowBuf(samples_[i].sampleId), weightMat->getRowBuf(samples_[i].labelId)); } } @@ -257,7 +265,9 @@ public: if (weightGradMat) { for (size_t i = 0; i < samples_.size(); ++i) { - axpy(dim, sampleGrad[i], inputMat->getRowBuf(samples_[i].sampleId), + axpy(dim, + sampleGrad[i], + inputMat->getRowBuf(samples_[i].sampleId), weightGradMat->getRowBuf(samples_[i].labelId)); } weights_[layerId]->incUpdate(callback); @@ -265,7 +275,9 @@ public: if (inputGradMat) { for (size_t i = 0; i < samples_.size(); ++i) { - axpy(dim, sampleGrad[i], weightMat->getRowBuf(samples_[i].labelId), + axpy(dim, + sampleGrad[i], + weightMat->getRowBuf(samples_[i].labelId), inputGradMat->getRowBuf(samples_[i].sampleId)); } } diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp index ad8b92d2ff72426d30f2488af7d168ffd8e5b65d..7f6ffe229842113869b4f2d61d59cdc0f4e1ddf8 100644 --- a/paddle/gserver/layers/NormLayer.cpp +++ b/paddle/gserver/layers/NormLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "NormLayer.h" #include "NormProjectionLayer.h" diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h index 2b05be6fcb44fc3f61f9be4e464b2100284bf5c6..9e848e5268d6b4b69f24802b66c5fed7cc1bf9e4 100644 --- a/paddle/gserver/layers/NormLayer.h +++ b/paddle/gserver/layers/NormLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -44,8 +43,8 @@ public: /** * @brief response normalization within feature maps - * namely normalize in independent channel - * When code refactoring, we delete the original implementation. + * namely, normalize within each channel independently. + * The original implementation was deleted during code refactoring; * it needs to be reimplemented in the future. */ class ResponseNormLayer : public NormLayer { diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp index eab6e904ee998b876a4dd7c503eec3a9a84f7412..6ac468e6fc7c2962beaf8c28192890634340b296 100644 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ b/paddle/gserver/layers/NormProjectionLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" #include "NormProjectionLayer.h" @@ -65,8 +64,8 @@ void CMRProjectionNormLayer::forward(PassType passType) { denoms_->zeroMem(); - outV->crossMapNormalFwd(*input, imgSizeH_, imgSizeW_, *denoms_, channels_, - size_, scale_, pow_); + outV->crossMapNormalFwd( + *input, imgSizeH_, imgSizeW_, *denoms_, channels_, size_, scale_, pow_); } void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { @@ -81,8 +80,15 @@ void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { MatrixPtr localOutV = getOutputValue(); MatrixPtr preOutV = inputLayers_[0]->getOutputValue(); - preOutGrad->crossMapNormalBwd(*localGrad, *denoms_, *preOutV, *localOutV, - channels_, imgSizeH_, imgSizeW_, size_, scale_, + preOutGrad->crossMapNormalBwd(*localGrad, + *denoms_, + *preOutV, + *localOutV, + channels_, + imgSizeH_, + imgSizeW_, + size_, + scale_, pow_); } } // namespace paddle diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h index 728806ea76958382a3ad06804f773c959598d043..b42e98ab0941e59a38bb1cfa73f49682dbef942c 100644 --- a/paddle/gserver/layers/NormProjectionLayer.h +++ b/paddle/gserver/layers/NormProjectionLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "NormLayer.h" diff --git a/paddle/gserver/layers/Operator.cpp b/paddle/gserver/layers/Operator.cpp index 5fa8239ac5d6f11da0558c8c9eddf8af378f0df3..b89c4740142e377f0cbbe755377f37baac270552 100644 --- a/paddle/gserver/layers/Operator.cpp +++ b/paddle/gserver/layers/Operator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Operator.h" namespace paddle { diff --git a/paddle/gserver/layers/Operator.h b/paddle/gserver/layers/Operator.h index 9ee16f70ee3a3cae3b7e764c674bbef348a300fc..ff6558dc73b8d60f3b4a3d87c9d28c650c8f2987 100644 --- a/paddle/gserver/layers/Operator.h +++ b/paddle/gserver/layers/Operator.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/parameter/Parameter.h" @@ -48,12 +47,14 @@ public: static ClassRegistrar registrar_; /** - * Forward propagation. If backward() will be called, in and out must be kept valid until then. + * Forward propagation. If backward() will be called, in and out must be kept + * valid until then. * @param ins inputs of operator * @param out output of operator * @param passType PASS_TRAIN or PASS_TEST */ - void forward(std::vector ins, Argument* out, + void forward(std::vector ins, + Argument* out, PassType passType) { ins_ = ins; out_ = out; diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp index 708c901ba9e9d2a5421fc64789f4ac174b365dc1..9b24a4f440c9e1fc3b4e73a7234c791fff045ea9 100644 --- a/paddle/gserver/layers/OuterProdLayer.cpp +++ b/paddle/gserver/layers/OuterProdLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -58,12 +57,15 @@ bool OuterProdLayer::init(const LayerMap& layerMap, CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch"; - tmpRow0 = Matrix::create(nullptr, /* height= */ 1, dim0, /* trans= */ false, - useGpu_); - tmpRow1 = Matrix::create(nullptr, /* height= */ 1, dim1, /* trans= */ false, + tmpRow0 = Matrix::create( + nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_); + tmpRow1 = Matrix::create( + nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_); + tmpMtx0 = Matrix::create(nullptr, + /* height= */ dim0, + dim1, + /* trans= */ false, useGpu_); - tmpMtx0 = Matrix::create(nullptr, /* height= */ dim0, dim1, - /* trans= */ false, useGpu_); return true; } diff --git a/paddle/gserver/layers/ParameterReluLayer.cpp b/paddle/gserver/layers/ParameterReluLayer.cpp index 98d108db5f05252aefa76fcad3d3eb429d59e82a..cd3bffa2e1d01ef8367c39c20c8e6f366c583b68 100644 --- a/paddle/gserver/layers/ParameterReluLayer.cpp +++ b/paddle/gserver/layers/ParameterReluLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ParameterReluLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -59,8 +58,8 @@ void ParameterReluLayer::backward(const UpdateCallback& callback) { } MatrixPtr preGrad = getInputGrad(0); - preGrad->paramReluBackwardDiff(*getOutputGrad(), *(getInputValue(0)), - *(weight_->getW())); + preGrad->paramReluBackwardDiff( + *getOutputGrad(), *(getInputValue(0)), *(weight_->getW())); { REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); weight_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h index 367e4e787c5ef24a934974af54c7b2bb8cd6de5f..029c09381f0e13de111ef30c4574d2255abfd018 100644 --- a/paddle/gserver/layers/ParameterReluLayer.h +++ b/paddle/gserver/layers/ParameterReluLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp index 7fc27ac0bd8e05246d87bac0e9692d8496f6601f..511dfd87c12551c91e8864364dbf1a1085a989b6 100644 --- a/paddle/gserver/layers/PoolLayer.cpp +++ b/paddle/gserver/layers/PoolLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
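paramReluBackwardDiff routes each output gradient through either the identity branch or the learned slope, depending on the sign of the corresponding input. A scalar sketch of the assumed semantics, simplified to a single shared slope w (the layer can share one slope across groups of input positions):

#include <cstddef>

// Gradient of PReLU y = (x > 0) ? x : w * x with one shared slope w.
void paramReluBackwardDiffRef(size_t n, const float* outGrad,
                              const float* input, float w, float* preGrad) {
  for (size_t i = 0; i < n; ++i) {
    preGrad[i] += outGrad[i] * (input[i] > 0 ? 1.0f : w);
  }
}

The slope gradient handled through weight_->getWGrad() is correspondingly the sum of outGrad[i] * input[i] over the positions where input[i] <= 0.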
*/ - #include "paddle/utils/Logging.h" #include "PoolLayer.h" #include "PoolProjectionLayer.h" @@ -52,10 +51,8 @@ bool PoolLayer::init(const LayerMap& layerMap, Layer* PoolLayer::create(const LayerConfig& config) { CHECK_EQ(config.inputs_size(), 1); const std::string& pool = config.inputs(0).pool_conf().pool_type(); - if (pool == "max-projection") { - return new MaxPoolProjectionLayer(config); - } else if (pool == "avg-projection") { - return new AvgPoolProjectionLayer(config); + if (pool == "max-projection" || pool == "avg-projection") { + return new PoolProjectionLayer(config); #ifndef PADDLE_ONLY_CPU } else if (CudnnPoolLayer::typeCheck(pool)) { return new CudnnPoolLayer(config); diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h index bde1f5b8dcbfdc4301266fa758278486fe930daf..59be295a538b007993e77f85f079f78a8b881eca 100644 --- a/paddle/gserver/layers/PoolLayer.h +++ b/paddle/gserver/layers/PoolLayer.h @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" #include "paddle/math/Matrix.h" +#include "paddle/math/MathUtils.h" #include namespace paddle { @@ -47,16 +47,6 @@ public: static Layer* create(const LayerConfig& config); virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - - /** - * Calculate output size according window size and padding size. - */ - int outputSize(int imageSize, int windowSize, int padding, int stride) { - int outputSize; - outputSize = - (imageSize - windowSize + 2 * padding + stride - 1) / stride + 1; - return outputSize; - } }; } // namespace paddle diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1b227c8084991e4bbf1e380881a6018fe01e9180 --- /dev/null +++ b/paddle/gserver/layers/PoolProjection.cpp @@ -0,0 +1,171 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PoolProjection.h" + +namespace paddle { + +REGISTER_PROJECTION_CREATE_FUNC(pool, &PoolProjection::create); + +PoolProjection::PoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + const PoolConfig& conf = config_.pool_conf(); + poolType_ = conf.pool_type(); + channels_ = conf.channels(); + sizeX_ = conf.size_x(); + stride_ = conf.stride(); + outputX_ = conf.output_x(); + imgSize_ = conf.img_size(); + confPadding_ = conf.padding(); + + sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x(); + imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); + confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding(); + outputY_ = conf.has_output_y() ? 
conf.output_y() : conf.output_x(); +} + +size_t PoolProjection::getSize() { + imgSizeY_ = in_->getFrameHeight(); + imgSize_ = in_->getFrameWidth(); + const PoolConfig& conf = config_.pool_conf(); + if (imgSizeY_ == 0) { + imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + } + if (imgSize_ == 0) { + imgSize_ = conf.img_size(); + } + outputY_ = outputSize(imgSizeY_, + sizeY_, + confPaddingY_, + strideY_, + /* caffeMode */ false); + outputX_ = outputSize(imgSize_, + sizeX_, + confPadding_, + stride_, + /* caffeMode */ false); + + const_cast(out_)->setFrameHeight(outputY_); + const_cast(out_)->setFrameWidth(outputX_); + + return outputY_ * outputX_ * channels_; +} + +PoolProjection* PoolProjection::create(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) { + const std::string& pool = config.pool_conf().pool_type(); + if (pool == "max-projection") { + return new MaxPoolProjection(config, parameter, useGpu); + } else if (pool == "avg-projection") { + return new AvgPoolProjection(config, parameter, useGpu); + } else { + LOG(FATAL) << "Unknown pool type: " << pool; + return nullptr; + } +} + +void MaxPoolProjection::forward() { + size_t width = getSize(); + CHECK_EQ(width, out_->value->getWidth()); + MatrixPtr inputV = in_->value; + MatrixPtr outV = out_->value; + outV->maxPoolForward(*inputV, + imgSizeY_, + imgSize_, + channels_, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + confPaddingY_, + confPadding_); +} + +void MaxPoolProjection::backward(const UpdateCallback& callback) { + (void)callback; + MatrixPtr outGrad = out_->grad; + MatrixPtr inputV = in_->value; + MatrixPtr outV = out_->value; + MatrixPtr inputGrad = in_->grad; + + if (NULL == inputGrad) { + return; + } + inputGrad->maxPoolBackward(*inputV, + imgSizeY_, + imgSize_, + *outGrad, + *outV, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + 1, + 1, + confPaddingY_, + confPadding_); +} + +void AvgPoolProjection::forward() { + size_t width = getSize(); + CHECK_EQ(width, out_->value->getWidth()); + MatrixPtr inputV = in_->value; + MatrixPtr outV = out_->value; + outV->avgPoolForward(*inputV, + imgSizeY_, + imgSize_, + channels_, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + confPaddingY_, + confPadding_); +} + +void AvgPoolProjection::backward(const UpdateCallback& callback) { + (void)callback; + + MatrixPtr outputGrad = out_->grad; + MatrixPtr inputGrad = in_->grad; + + if (NULL == inputGrad) { + return; + } + + inputGrad->avgPoolBackward(*outputGrad, + imgSizeY_, + imgSize_, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + 1, + 1, + confPaddingY_, + confPadding_); +} +} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..9c3191bd80061c13b645c2a107eaa723e2495032 --- /dev/null +++ b/paddle/gserver/layers/PoolProjection.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
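With BinPackParameters off, maxPoolForward's eleven geometry arguments each get their own line, which makes the (Y, X) pairing easier to audit: the image sizes, strides, outputs and paddings pass Y first, while size_x precedes size_y. A naive single-channel reference of the same computation, a sketch of the semantics only and not the real kernel:

#include <algorithm>
#include <limits>

void maxPoolForwardRef(const float* in, float* out, int imgH, int imgW,
                       int sizeY, int sizeX, int strideY, int strideX,
                       int outH, int outW, int padY, int padX) {
  for (int oy = 0; oy < outH; ++oy) {
    for (int ox = 0; ox < outW; ++ox) {
      int y0 = std::max(oy * strideY - padY, 0);
      int x0 = std::max(ox * strideX - padX, 0);
      int y1 = std::min(oy * strideY - padY + sizeY, imgH);
      int x1 = std::min(ox * strideX - padX + sizeX, imgW);
      float best = -std::numeric_limits<float>::infinity();
      for (int y = y0; y < y1; ++y)
        for (int x = x0; x < x1; ++x)
          best = std::max(best, in[y * imgW + x]);
      out[oy * outW + ox] = best;
    }
  }
}

avgPoolForward follows the same indexing with the max replaced by a mean over the window.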
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Projection.h" +#include "paddle/math/MathUtils.h" + +namespace paddle { + +class PoolProjection : public Projection { +protected: + size_t imgSizeY_, imgSize_; + size_t outputY_, outputX_; + size_t strideY_, stride_; + size_t sizeY_, sizeX_; + int confPaddingY_, confPadding_; + size_t channels_; + std::string poolType_; + +public: + PoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + static PoolProjection* create(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + const std::string& getPoolType() const { return poolType_; } + + size_t getSize(); +}; + +class MaxPoolProjection : public PoolProjection { +public: + MaxPoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : PoolProjection(config, parameter, useGpu) {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback = nullptr); +}; + +class AvgPoolProjection : public PoolProjection { +public: + AvgPoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : PoolProjection(config, parameter, useGpu) {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback = nullptr); +}; +} // namespace paddle diff --git a/paddle/gserver/layers/PoolProjectionLayer.cpp b/paddle/gserver/layers/PoolProjectionLayer.cpp index 5a2e9afb6e1640b0fcf7937adc5e64f4666bd789..aabc60af197af30a367c0f933276116ba316bd34 100644 --- a/paddle/gserver/layers/PoolProjectionLayer.cpp +++ b/paddle/gserver/layers/PoolProjectionLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" #include "PoolProjectionLayer.h" @@ -31,81 +30,36 @@ size_t PoolProjectionLayer::getSize() { imgSizeW_ = imgSize_; } - outputH_ = outputSize(imgSizeH_, sizeY_, confPaddingY_, strideY_); - outputW_ = outputSize(imgSizeW_, sizeX_, confPadding_, stride_); + outputH_ = outputSize(imgSizeH_, + sizeY_, + confPaddingY_, + strideY_, + /* caffeMode */ false); + outputW_ = outputSize(imgSizeW_, + sizeX_, + confPadding_, + stride_, + /* caffeMode */ false); layerSize = outputH_ * outputW_ * channels_; - getOutput().setFrameHeight(outputH_); - getOutput().setFrameWidth(outputW_); return layerSize; } -void MaxPoolProjectionLayer::forward(PassType passType) { - Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - /* note: one sample correspond to one ROW */ - MatrixPtr input = getInputValue(0); - int batchSize = input->getHeight(); - int size = getSize(); - resetOutput(batchSize, size); - - MatrixPtr outV = getOutputValue(); - - outV->maxPoolForward(*input, imgSizeH_, imgSizeW_, channels_, - sizeX_, sizeY_, strideY_, stride_, - outputH_, outputW_, confPaddingY_, confPadding_); -} - -void MaxPoolProjectionLayer::backward(const UpdateCallback& callback) { - (void)callback; - - if (NULL == getInputGrad(0)) { - return; - } - - /* Do derivation */ - MatrixPtr outGrad = getOutputGrad(); - MatrixPtr inputV = getInputValue(0); - MatrixPtr outV = getOutputValue(); - MatrixPtr inputGrad = getInputGrad(0); - - inputGrad->maxPoolBackward(*inputV, imgSizeH_, imgSizeW_, *outGrad, *outV, - sizeX_, sizeY_, - strideY_, stride_, outputH_, outputW_, 1, 1, - confPaddingY_, confPadding_); -} - -void AvgPoolProjectionLayer::forward(PassType passType) { +void PoolProjectionLayer::forward(PassType passType) { Layer::forward(passType); - - /* malloc memory for the output_ if necessary */ - /* note: one sample correspond to one ROW */ - MatrixPtr input = getInputValue(0); - int batchSize = input->getHeight(); + const Argument& in = getInput(0); + int batchSize = in.value->getHeight(); int size = getSize(); resetOutput(batchSize, size); - - MatrixPtr outV = getOutputValue(); - - outV->avgPoolForward(*input, imgSizeH_, imgSizeW_, channels_, - sizeX_, sizeY_, strideY_, stride_, - outputH_, outputW_, confPaddingY_, confPadding_); + poolProjection_->forward(&in, &output_, passType); } -void AvgPoolProjectionLayer::backward(const UpdateCallback& callback) { +void PoolProjectionLayer::backward(const UpdateCallback& callback) { (void)callback; - if (NULL == getInputGrad(0)) { return; } - /* Do derivation */ - MatrixPtr outputGrad = getOutputGrad(); - MatrixPtr inputGrad = getInputGrad(0); - inputGrad->avgPoolBackward(*outputGrad, imgSizeH_, imgSizeW_, - sizeX_, sizeY_, strideY_, stride_, - outputH_, outputW_, 1, 1, - confPaddingY_, confPadding_); + poolProjection_->backward(callback); } } // namespace paddle diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h index 42bbc83c62246dfc8e69aa0b427b27819a701eb6..777b6f39e7cc4ebaa7078ce3378b2688363245e8 100644 --- a/paddle/gserver/layers/PoolProjectionLayer.h +++ b/paddle/gserver/layers/PoolProjectionLayer.h @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
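Both calls now go through a shared outputSize helper (expected in paddle/math/MathUtils.h) with an explicit caffeMode flag. A sketch of the assumed definition: the caffeMode=false branch is exactly the ceil-style formula previously defined in PoolLayer::outputSize and removed above, and caffeMode=true is assumed to be the floor-style Caffe convention:

int outputSize(int imageSize, int windowSize, int padding, int stride,
               bool caffeMode) {
  if (caffeMode) {
    return (imageSize - windowSize + 2 * padding) / stride + 1;
  }
  // Ceil mode: a partial window at the border still produces an output.
  return (imageSize - windowSize + 2 * padding + stride - 1) / stride + 1;
}

For example, imageSize 6, windowSize 3, padding 0, stride 2 gives 2 in caffeMode and 3 otherwise, which is why all layers computing a pooled shape must agree on the flag end to end.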
*/ - #pragma once +#include #include "PoolLayer.h" +#include "PoolProjection.h" #include "paddle/math/Matrix.h" -#include namespace paddle { /** @@ -27,33 +27,18 @@ class PoolProjectionLayer : public PoolLayer { protected: size_t imgSizeH_, imgSizeW_; size_t outputH_, outputW_; + std::unique_ptr poolProjection_; + ProjectionConfig projectionConfig_; public: - size_t getSize(); - explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) {} -}; -/** - * @brief A layer for max pooling - */ -class MaxPoolProjectionLayer : public PoolProjectionLayer { -public: - explicit MaxPoolProjectionLayer(const LayerConfig& config) - : PoolProjectionLayer(config) {} - - ~MaxPoolProjectionLayer() {} + explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) { + PoolConfig* conf = projectionConfig_.mutable_pool_conf(); + *conf = config_.inputs(0).pool_conf(); + poolProjection_.reset( + PoolProjection::create(projectionConfig_, nullptr, useGpu_)); + } - virtual void forward(PassType passType); - virtual void backward(const UpdateCallback& callback = nullptr); -}; -/** - * @brief A layer for average pooling - */ -class AvgPoolProjectionLayer : public PoolProjectionLayer { -public: - explicit AvgPoolProjectionLayer(const LayerConfig& config) - : PoolProjectionLayer(config) {} - - ~AvgPoolProjectionLayer() {} + size_t getSize(); virtual void forward(PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp index 44c5e6063b1aed93b3fbb175821f911ca26fac1a..0b9672f220919c6ee1a792fc2d68e8ae540ea09a 100644 --- a/paddle/gserver/layers/PowerLayer.cpp +++ b/paddle/gserver/layers/PowerLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -26,7 +25,7 @@ namespace paddle { * \f[ * y = x^w * \f] - * where \f$x\f$ is a input vector, \f$w\f$ is scalar weight, + * where \f$x\f$ is a input vector, \f$w\f$ is scalar weight, * and output \f$y\f$ is a vector. * * The config file api is power_layer. diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp index 68fee69f44d0c2c144f6dde6fd8ff36bd96094f6..95be7b34cb106665d2465630233fca6b34d71e79 100644 --- a/paddle/gserver/layers/PrintLayer.cpp +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -18,8 +18,7 @@ namespace paddle { class PrintLayer : public Layer { public: - explicit PrintLayer(const LayerConfig& config) - : Layer(config) {} + explicit PrintLayer(const LayerConfig& config) : Layer(config) {} void forward(PassType passType); void backward(const UpdateCallback& callback) {} }; diff --git a/paddle/gserver/layers/Projection.cpp b/paddle/gserver/layers/Projection.cpp index aebc08f4a0e5937e50d11a5cc832b27210c8ea42..c7eb4b644281ff6e7b58201c41888d3a8967f419 100644 --- a/paddle/gserver/layers/Projection.cpp +++ b/paddle/gserver/layers/Projection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "Projection.h" #include "ContextProjection.h" @@ -25,7 +24,8 @@ ClassRegistrar Projection::registrar_; Projection* Projection::create(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu) { + ParameterPtr parameter, + bool useGpu) { return registrar_.createByType(config.type(), config, parameter, useGpu); } diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h index 3fa3a0cc230ac4c8616abe0eb2c8ac41bde52d53..798503113d761091d1a1bdf9e4ec70e0c2c3b3a4 100644 --- a/paddle/gserver/layers/Projection.h +++ b/paddle/gserver/layers/Projection.h @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#include "paddle/parameter/Parameter.h" -#include "ModelConfig.pb.h" #include "Layer.h" +#include "ModelConfig.pb.h" +#include "paddle/parameter/Parameter.h" namespace paddle { @@ -28,6 +27,11 @@ namespace paddle { Projection::registrar_.registerClass<__class_name>(#__type_name); \ }) +#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction) \ + static InitFunction __reg_type_##__type_name([]() { \ + Projection::registrar_.registerClass(#__type_name, createFunction); \ + }) + /** * A projection takes one Argument as input, calculate the result and add it * to output Argument. @@ -35,9 +39,11 @@ namespace paddle { class Projection { public: static Projection* create(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu); + ParameterPtr parameter, + bool useGpu); - Projection(const ProjectionConfig& config, ParameterPtr parameter, + Projection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu) : config_(config), parameter_(parameter), useGpu_(useGpu) {} @@ -50,7 +56,8 @@ public: registrar_; /** - * Forward propagation. If backward() will be called, in and out must be kept valid until then. + * Forward propagation. If backward() will be called, in and out must be kept + * valid until then. * @param in input of projection * @param out output of projection * @param passType PASS_TRAIN of PASS_TEST diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index 30ef679f92c073cce5bb6edd11896007c0a8e68e..08453e21b8ff27138f9fa44ac834b54eb94c0688 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "Layer.h" #include "paddle/utils/Stat.h" #include "SequenceToBatch.h" @@ -143,8 +142,8 @@ bool RecurrentLayer::init(const LayerMap& layerMap, void RecurrentLayer::resetState() { CHECK(!reversed_) << "state is not allowed for reversed recurrent layer"; - Matrix::resizeOrCreate(prevOutput_, 1, getSize(), /* trans= */ false, - useGpu_); + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); prevOutput_->zeroMem(); } @@ -183,16 +182,23 @@ void RecurrentLayer::forward(PassType passType) { } } -void RecurrentLayer::forwardSequence(int batchSize, size_t numSequences, +void RecurrentLayer::forwardSequence(int batchSize, + size_t numSequences, const int* starts) { REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str()); frameOutput_.reserve(batchSize); for (int i = frameOutput_.size(); i < batchSize; ++i) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); frameOutput_.push_back(arg); } @@ -213,8 +219,8 @@ void RecurrentLayer::forwardOneSequence(int start, int length) { } activation_->forward(frameOutput_[start]); for (int i = 1; i < length; ++i) { - frameOutput_[start + i].value->mul(frameOutput_[start + i - 1].value, - weight_->getW(), 1, 1); + frameOutput_[start + i].value->mul( + frameOutput_[start + i - 1].value, weight_->getW(), 1, 1); activation_->forward(frameOutput_[start + i]); } if (prevOutput_) { @@ -223,8 +229,8 @@ void RecurrentLayer::forwardOneSequence(int start, int length) { } else { activation_->forward(frameOutput_[start + length - 1]); for (int i = length - 2; i >= 0; --i) { - frameOutput_[start + i].value->mul(frameOutput_[start + i + 1].value, - weight_->getW(), 1, 1); + frameOutput_[start + i].value->mul( + frameOutput_[start + i + 1].value, weight_->getW(), 1, 1); activation_->forward(frameOutput_[start + i]); } } @@ -256,7 +262,8 @@ void RecurrentLayer::backward(const UpdateCallback& callback) { weight_->getParameterPtr()->incUpdate(callback); } -void RecurrentLayer::backwardSequence(int batchSize, size_t numSequences, +void RecurrentLayer::backwardSequence(int batchSize, + size_t numSequences, const int* starts) { REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str()); for (int i = 0; i < batchSize; ++i) { @@ -274,31 +281,36 @@ void RecurrentLayer::backwardOneSequence(int start, int length) { if (!reversed_) { for (int i = length - 1; i > 0; --i) { activation_->backward(frameOutput_[start + i]); - frameOutput_[start + i - 1].grad->mul(frameOutput_[start + i].grad, - weightT, 1, 1); + frameOutput_[start + i - 1].grad->mul( + frameOutput_[start + i].grad, weightT, 1, 1); } activation_->backward(frameOutput_[start]); if (weight_->getWGrad()) { weight_->getWGrad()->mul( output_.value->subMatrix(start, length - 1)->getTranspose(), - output_.grad->subMatrix(start + 1, length - 1), 1, 1); + output_.grad->subMatrix(start + 1, length - 1), + 1, + 1); } } else { for (int i = 0; i < length - 1; ++i) { activation_->backward(frameOutput_[start + i]); - frameOutput_[start + i + 1].grad->mul(frameOutput_[start + i].grad, - weightT, 1, 1); + frameOutput_[start + i + 1].grad->mul( + frameOutput_[start + i].grad, weightT, 1, 1); } activation_->backward(frameOutput_[start + 
length - 1]); if (weight_->getWGrad()) { weight_->getWGrad()->mul( output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - output_.grad->subMatrix(start, length - 1), 1, 1); + output_.grad->subMatrix(start, length - 1), + 1, + 1); } } } -void RecurrentLayer::forwardBatch(int batchSize, size_t numSequences, +void RecurrentLayer::forwardBatch(int batchSize, + size_t numSequences, const int* starts) { if (!batchValue_) { batchValue_.reset(new SequenceToBatch(useGpu_)); @@ -327,7 +339,8 @@ void RecurrentLayer::forwardBatch(int batchSize, size_t numSequences, batchValue_->copyBackSeq(*output_.value); } -void RecurrentLayer::backwardBatch(int batchSize, size_t numSequences, +void RecurrentLayer::backwardBatch(int batchSize, + size_t numSequences, const int* starts) { if (!batchGrad_) { batchGrad_.reset(new SequenceToBatch(useGpu_)); @@ -377,11 +390,15 @@ void RecurrentLayer::backwardBatch(int batchSize, size_t numSequences, if (!reversed_) { weight_->getWGrad()->mul( output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), - output_.grad->subMatrix(starts[seq] + 1, len - 1), 1, 1); + output_.grad->subMatrix(starts[seq] + 1, len - 1), + 1, + 1); } else { weight_->getWGrad()->mul( output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), - output_.grad->subMatrix(starts[seq], len - 1), 1, 1); + output_.grad->subMatrix(starts[seq], len - 1), + 1, + 1); } } } diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp index 62dbaa2674ce624dec44b8b3c86f9a08c1cfe0ee..a5443975da4ab6ecb302087fe71b018154d439b8 100644 --- a/paddle/gserver/layers/RecurrentLayerGroup.cpp +++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/gserver/layers/Layer.h" #include @@ -31,7 +30,8 @@ class RecurrentLayerGroup : public Layer { public: explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {} - void initSubNetwork(NeuralNetwork* rootNetwork, const ModelConfig& config, + void initSubNetwork(NeuralNetwork* rootNetwork, + const ModelConfig& config, const std::vector& parameterTypes, bool useGpu); @@ -53,7 +53,7 @@ public: /** * @see Layer.accessSubNetwork */ - void accessSubNetwork(const std::function &callback) { + void accessSubNetwork(const std::function& callback) { callback(*network_); } @@ -64,8 +64,10 @@ private: REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup); void RecurrentLayerGroup::initSubNetwork( - NeuralNetwork* rootNetwork, const ModelConfig& config, - const std::vector& parameterTypes, bool useGpu) { + NeuralNetwork* rootNetwork, + const ModelConfig& config, + const std::vector& parameterTypes, + bool useGpu) { setNeedGradient(true); network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork)); diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp index dc573e838f71623e6985b19a4ae2cba6109ef6b5..3c478a33e350cf0e901381890e3df1496893f4db 100644 --- a/paddle/gserver/layers/ResizeLayer.cpp +++ b/paddle/gserver/layers/ResizeLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
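Written out, the recurrence that forwardOneSequence and backwardOneSequence implement is y_t = act(x_t + y_{t-1} * W), with y_{t+1} in place of y_{t-1} when reversed_ is set. A dense scalar sketch of the non-reversed forward pass, assuming each frames[t] initially holds the projected input x_t and act applies the activation in place:

#include <vector>

void forwardOneSequenceRef(std::vector<std::vector<float> >& frames,
                           const std::vector<std::vector<float> >& W,
                           void (*act)(std::vector<float>&)) {
  act(frames[0]);  // y_0 = act(x_0)
  for (size_t t = 1; t < frames.size(); ++t) {
    const size_t dim = frames[t].size();
    for (size_t j = 0; j < dim; ++j)    // x_t += y_{t-1} * W
      for (size_t i = 0; i < dim; ++i)
        frames[t][j] += frames[t - 1][i] * W[i][j];
    act(frames[t]);                     // y_t = act(x_t + y_{t-1} W)
  }
}

forwardBatch computes the same thing but gathers the t-th frames of all sequences into one matrix, so the per-step mul becomes a single large GEMM.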
*/ - #include "Layer.h" #include "paddle/math/Matrix.h" #include "paddle/math/BaseMatrix.h" @@ -68,9 +67,11 @@ void ResizeLayer::backward(const UpdateCallback& callback) { return; } - MatrixPtr tmp = - Matrix::create(input.grad->getData(), height * width / getSize(), - getSize(), false, useGpu_); + MatrixPtr tmp = Matrix::create(input.grad->getData(), + height * width / getSize(), + getSize(), + false, + useGpu_); tmp->add(*output_.grad); } diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp index a494b401ff597290cf67ef55c4bf1b062da988ab..71570810f9576df74940968426c09ae421881ba6 100644 --- a/paddle/gserver/layers/ScalingLayer.cpp +++ b/paddle/gserver/layers/ScalingLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -26,7 +25,7 @@ namespace paddle { * \f[ * y.row[i] = w[i] * x.row[i] * \f] - * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is + * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is * (batchSize x 1) weight vector, and \f$y\f$ is (batchSize x dataDim) output. * * The config file api is scaling_layer. diff --git a/paddle/gserver/layers/ScalingProjection.cpp b/paddle/gserver/layers/ScalingProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7999d02d384a06b900fbfa2c8bb271660b7fe008 --- /dev/null +++ b/paddle/gserver/layers/ScalingProjection.cpp @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Projection.h" + +namespace paddle { + +class ScalingProjection : public Projection { +public: + ScalingProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK_EQ(parameter->getSize(), 1UL); + weight_.reset(new Weight(1, 1, parameter)); + } + + void forward() { + CHECK(in_->value); + out_->value->add(*in_->value, weight_->getW()->getElement(0, 0)); + } + + void backward(const UpdateCallback& callback) { + if (weight_->getWGrad()) { + auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_); + sum->sumOfProducts(*in_->value, + *out_->grad, + /* scaleSum= */ 1, + /* scaleDest= */ 0); + weight_->getWGrad()->sumCols(*sum, + /* scaleSum= */ 1, + /* scaleDest= */ 1); + parameter_->incUpdate(callback); + } + if (in_->grad) { + in_->grad->add(*out_->grad, weight_->getW()->getElement(0, 0)); + } + } + +protected: + std::unique_ptr weight_; +}; + +REGISTER_PROJECTION(scaling, ScalingProjection); + +} // namespace paddle diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp index 25ae9d519533a912fb32348c8a521405f6c77eb3..4dfa2c179dafe0d8dcc6766fbafeae129edcc49a 100644 --- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp +++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "SelectiveFullyConnectedLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -49,11 +48,11 @@ bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap, void SelectiveFullyConnectedLayer::prefetch() {} -void SelectiveFullyConnectedLayer::reserveOutput(size_t height, size_t width, +void SelectiveFullyConnectedLayer::reserveOutput(size_t height, + size_t width, size_t nnz) { bool flag = (passType_ == PASS_TEST && - config_.selective_fc_pass_generation() && - !fullOutput_); + config_.selective_fc_pass_generation() && !fullOutput_); SetDevice device(output_.deviceId); if (flag) { // output_.value is sparse matrix @@ -61,8 +60,12 @@ void SelectiveFullyConnectedLayer::reserveOutput(size_t height, size_t width, dynamic_cast(output_.value.get())) { output_.value = nullptr; } - Matrix::resizeOrCreateSparseMatrix(output_.value, height, width, nnz, - FLOAT_VALUE, SPARSE_CSR, + Matrix::resizeOrCreateSparseMatrix(output_.value, + height, + width, + nnz, + FLOAT_VALUE, + SPARSE_CSR, /*trans=*/false, /*useGpu=*/useGpu_); output_.value->copyFrom(*selCols_); @@ -74,19 +77,31 @@ void SelectiveFullyConnectedLayer::reserveOutput(size_t height, size_t width, dynamic_cast(output_.value.get())) { output_.value = nullptr; } - Matrix::resizeOrCreate(output_.value, height, width, - /*trans=*/false, /*useGpu=*/useGpu_); + Matrix::resizeOrCreate(output_.value, + height, + width, + /*trans=*/false, + /*useGpu=*/useGpu_); interOutput_ = output_.value; } else { // output_.value is dense matrix, but width = nnz /height CHECK_EQ(nnz % height, 0U); CHECK(nnz / height); - Matrix::resizeOrCreate(output_.value, height, nnz / height, - /*trans=*/false, /*useGpu=*/useGpu_); - interOutput_ = Matrix::createSparseMatrix( - output_.value->getData(), selCols_->getRows(), selCols_->getCols(), - height, width, nnz, FLOAT_VALUE, SPARSE_CSR, - /*trans=*/false, /*useGpu=*/useGpu_); + Matrix::resizeOrCreate(output_.value, + height, + nnz / height, + 
/*trans=*/false, + /*useGpu=*/useGpu_); + interOutput_ = Matrix::createSparseMatrix(output_.value->getData(), + selCols_->getRows(), + selCols_->getCols(), + height, + width, + nnz, + FLOAT_VALUE, + SPARSE_CSR, + /*trans=*/false, + /*useGpu=*/useGpu_); } } interOutput_->zeroMem(); @@ -97,8 +112,11 @@ void SelectiveFullyConnectedLayer::reserveOutput(size_t height, size_t width, CHECK(nnz / height) << "during training, " "each sample must have at least one column selected."; - Matrix::resizeOrCreate(output_.grad, height, nnz / height, - /*trans=*/false, /*useGpu=*/useGpu_); + Matrix::resizeOrCreate(output_.grad, + height, + nnz / height, + /*trans=*/false, + /*useGpu=*/useGpu_); output_.grad->zeroMem(); } } @@ -131,7 +149,7 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) { real scaleT = i == 0 ? real(0) : real(1); flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() && - !fullOutput_; + !fullOutput_; if (flag) { // if the indecies are highly sparse, // manully compute the multiplication of @@ -145,8 +163,11 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) { if (fullOutput_) { interOutput_->mul(input, weight->getTranspose(), 1, scaleT); } else { - Matrix::resizeOrCreate(mmat_, hsize, wsize, - /*trans=*/false, /*useGpu=*/useGpu_); + Matrix::resizeOrCreate(mmat_, + hsize, + wsize, + /*trans=*/false, + /*useGpu=*/useGpu_); mmat_->mul(input, weight->getTranspose()); interOutput_->add3(mmat_); } @@ -158,7 +179,7 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) { } flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() && - !fullOutput_); + !fullOutput_); if (flag) { // during generation, output of this layer is a sparse csr matrix, // which is probably the input of maxid layer @@ -166,8 +187,11 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) { // activiation of this layer should be exponential, not softmax. 
Argument arg; - arg.value = Matrix::create(interOutput_->getData(), 1, nnz, - /*trans=*/false, /*useGpu=*/useGpu_); + arg.value = Matrix::create(interOutput_->getData(), + 1, + nnz, + /*trans=*/false, + /*useGpu=*/useGpu_); activation_->forward(arg); } else /* train and test in train, not generating */ { // during training, this layer output value is *Matrix*, which is input of @@ -187,17 +211,22 @@ void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) { backwardActivation(); MatrixPtr oGrad = getOutputGrad(); if (!fullOutput_) { - interOutGrad_ = Matrix::createSparseMatrix( - oGrad->getData(), interOutput_->getRows(), interOutput_->getCols(), - interOutput_->getHeight(), interOutput_->getWidth(), - interOutput_->getElementCnt(), FLOAT_VALUE, SPARSE_CSR, - /*trans=*/false, - /*useGpu=*/useGpu_); + interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(), + interOutput_->getRows(), + interOutput_->getCols(), + interOutput_->getHeight(), + interOutput_->getWidth(), + interOutput_->getElementCnt(), + FLOAT_VALUE, + SPARSE_CSR, + /*trans=*/false, + /*useGpu=*/useGpu_); } else { - interOutGrad_ = - Matrix::create(oGrad->getData(), oGrad->getHeight(), oGrad->getWidth(), - /*trans=*/false, - /*useGpu=*/useGpu_); + interOutGrad_ = Matrix::create(oGrad->getData(), + oGrad->getHeight(), + oGrad->getWidth(), + /*trans=*/false, + /*useGpu=*/useGpu_); } if (biases_ && biases_->getWGrad()) { @@ -240,13 +269,21 @@ void paddle::SelectiveFullyConnectedLayer::fillSelectiveData( size_t sampleNum = candidates->size(); size_t outputWidth = getSize(); size_t nnz = - std::accumulate(candidates->begin(), candidates->end(), 0UL, + std::accumulate(candidates->begin(), + candidates->end(), + 0UL, [](size_t a, const std::pair& arr) { return a + arr.second; }); Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_, - sampleNum, outputWidth, nnz, NO_VALUE, SPARSE_CSR, false, false); + sampleNum, + outputWidth, + nnz, + NO_VALUE, + SPARSE_CSR, + false, + false); CHECK(this->cpuSelCols_ != nullptr); CpuSparseMatrixPtr selCols = std::dynamic_pointer_cast(cpuSelCols_); @@ -272,7 +309,13 @@ void paddle::SelectiveFullyConnectedLayer::fillSelectiveData( this->selCols_ = this->cpuSelCols_; } else { Matrix::resizeOrCreateSparseMatrix(this->selCols_, - sampleNum, outputWidth, nnz, NO_VALUE, SPARSE_CSR, false, true); + sampleNum, + outputWidth, + nnz, + NO_VALUE, + SPARSE_CSR, + false, + true); this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1); hl_stream_synchronize(HPPL_STREAM_1); } diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h index c152151cff051bc0f62bcf6702d6c6c649be8003..9f92ae060521bd7852b67d45649d1cd0792961d4 100644 --- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h +++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
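The payoff of the selective path is that only the selected columns are ever touched: with nnz selected (sample, column) pairs stored CSR-style, the forward multiply costs O(nnz * inputDim) instead of a full batchSize x outputWidth GEMM. A reference sketch, assuming row-major input and the weight stored with one row per output column, matching the getTranspose() calls above:

#include <cstddef>
#include <vector>

// out[k] = dot(input.row(r), weight.row(cols[k])) for every selected
// pair; rowPtr has one entry per sample plus a terminator, CSR-style.
void selectiveFcForwardRef(const std::vector<float>& input, size_t inDim,
                           const std::vector<float>& weight,
                           const std::vector<int>& rowPtr,
                           const std::vector<int>& cols,
                           std::vector<float>& out) {
  for (size_t r = 0; r + 1 < rowPtr.size(); ++r) {
    for (int k = rowPtr[r]; k < rowPtr[r + 1]; ++k) {
      float sum = 0;
      for (size_t j = 0; j < inDim; ++j)
        sum += input[r * inDim + j] * weight[cols[k] * inDim + j];
      out[k] = sum;
    }
  }
}

This is why selective_fc_full_mul_ratio gates the choice above: once nnz approaches height * width, one dense mul plus add3 beats the scattered loop.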
*/ - #pragma once #include "Layer.h" @@ -98,8 +97,6 @@ private: /** * @brief Make SelectiveFC act as FullyConnectedLayer */ - void fillFullySelectiveData() { - fullOutput_ = true; - } + void fillFullySelectiveData() { fullOutput_ = true; } }; } // namespace paddle diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp index dfce4dcb196132414542d4fe9f0d97200e44779c..bd72ba3d167d99b5d3fdd047d6b1bfab611b3232 100644 --- a/paddle/gserver/layers/SequenceConcatLayer.cpp +++ b/paddle/gserver/layers/SequenceConcatLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -68,13 +67,11 @@ void SequenceConcatLayer::forward(PassType passType) { const Argument& input1 = getInput(0); size_t numSequences1 = input1.getNumSequences(); - auto startPositions1 = - input1.sequenceStartPositions->getVector(false); + auto startPositions1 = input1.sequenceStartPositions->getVector(false); const Argument& input2 = getInput(1); size_t numSequences2 = input2.getNumSequences(); - auto startPositions2 = - input2.sequenceStartPositions->getVector(false); + auto startPositions2 = input2.sequenceStartPositions->getVector(false); CHECK_EQ(dim, input1.value->getWidth()); CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize()); @@ -117,8 +114,8 @@ void SequenceConcatLayer::forward(PassType passType) { } // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions, - numSequences1 + 1, false); + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences1 + 1, false); int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); @@ -150,10 +147,8 @@ void SequenceConcatLayer::backward(const UpdateCallback& callback) { MatrixPtr inputGrad1 = getInputGrad(0); MatrixPtr inputGrad2 = getInputGrad(1); MatrixPtr outputGrad = getOutputGrad(); - auto startPositions1 = - getInput(0).sequenceStartPositions->getVector(false); - auto startPositions2 = - getInput(1).sequenceStartPositions->getVector(false); + auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); + auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false); size_t numSequences1 = startPositions1->getSize() - 1; size_t numSequences2 = startPositions2->getSize() - 1; diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp index f4d26ba21bed69182d428e03684315c8f5bc919a..0e9531eabb4b389b762e235ec01d5f16c88cd4a1 100644 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" -#include "Layer.h" +#include "SequencePoolLayer.h" #include "paddle/math/Matrix.h" #include "paddle/utils/Stat.h" @@ -29,20 +28,19 @@ namespace paddle { * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: a sequence containing only the last instance of each sub-sequence - * of the input sequence + * of the input sequence + * + * The config file api is last_seq and first_seq. 
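The copy loops elided from the SequenceConcatLayer hunk interleave the two inputs sequence by sequence. A sketch of the presumed layout (names and the exact loop are illustrative; the shown start-position bookkeeping implies this structure), which also shows why the output keeps numSequences1 + 1 start offsets:

#include <cstring>

void seqConcatForwardRef(const float* in1, const int* starts1,
                         const float* in2, const int* starts2,
                         size_t numSeqs, size_t dim,
                         float* out, int* outStarts) {
  outStarts[0] = 0;
  float* dst = out;
  for (size_t i = 0; i < numSeqs; ++i) {
    const int len1 = starts1[i + 1] - starts1[i];
    const int len2 = starts2[i + 1] - starts2[i];
    std::memcpy(dst, in1 + starts1[i] * dim, len1 * dim * sizeof(float));
    dst += len1 * dim;
    std::memcpy(dst, in2 + starts2[i] * dim, len2 * dim * sizeof(float));
    dst += len2 * dim;
    outStarts[i + 1] = outStarts[i] + len1 + len2;  // both lengths accumulate
  }
}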
*/ -class SequenceLastInstanceLayer : public Layer { +class SequenceLastInstanceLayer : public SequencePoolLayer { protected: - std::unique_ptr biases_; MatrixPtr tmpSrc_; MatrixPtr tmpDest_; - enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; - int type_; public: explicit SequenceLastInstanceLayer(const LayerConfig& config) - : Layer(config) {} + : SequencePoolLayer(config) {} ~SequenceLastInstanceLayer() {} @@ -56,56 +54,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer); bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - // seqlastins layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } + SequencePoolLayer::init(layerMap, parameterMap); tmpSrc_ = Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); tmpDest_ = Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); return true; } void SequenceLastInstanceLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t dim = getSize(); - const Argument& input = getInput(0); + SequencePoolLayer::forward(passType); - // check - CHECK(input.sequenceStartPositions); - if (type_) { - CHECK(input.subSequenceStartPositions) - << "when trans_type = seq, input must hasSubseq"; - } - auto startPositions = - type_ ? input.subSequenceStartPositions->getVector(false) - : input.sequenceStartPositions->getVector(false); - size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences(); - CHECK_EQ(dim, input.value->getWidth()); - CHECK_EQ(startPositions->getData()[height], input.getBatchSize()); - CHECK_EQ(height, startPositions->getSize() - 1); - - reserveOutput(height, dim); - const int* starts = startPositions->getData(); + const int* starts = startPositions_->getData(false); MatrixPtr inputValue = getInputValue(0); MatrixPtr outputValue = getOutputValue(); @@ -113,21 +75,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) { AsyncGpuBlock asyncGpuBlock; REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str()); - for (size_t seqId = 0; seqId < height; ++seqId) { + for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { int insId = config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1; outputValue->subMatrix(seqId, 1, tmpDest_) ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_))); } - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no sequenceStartPositions. - * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new sequenceStartPositions. 
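After this refactor the only work left in SequenceLastInstanceLayer is the row selection itself: with start offsets starts[0..n] supplied by SequencePoolLayer, first_seq reads row starts[s] and last_seq reads row starts[s + 1] - 1, exactly the insId line in the forward loop. A standalone sketch of that selection:

#include <cstring>

void lastInstanceForwardRef(const float* in, float* out, size_t dim,
                            const int* starts, size_t numSeqs,
                            bool selectFirst) {
  for (size_t s = 0; s < numSeqs; ++s) {
    const int insId = selectFirst ? starts[s] : starts[s + 1] - 1;
    std::memcpy(out + s * dim, in + insId * dim, dim * sizeof(float));
  }
}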
- */ - if (type_) { - output_.degradeSequence(input, useGpu_); - } } if (biases_.get() != NULL) { @@ -139,23 +93,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) { } void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) { - /* activation, should set to 'linear' in most cases */ - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } + SequencePoolLayer::backward(callback); MatrixPtr inputGrad = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); - auto startPositions = - type_ ? getInput(0).subSequenceStartPositions->getVector(false) - : getInput(0).sequenceStartPositions->getVector(false); - const int* starts = startPositions->getData(); - size_t numSequences = startPositions->getSize() - 1; + const int* starts = startPositions_->getData(false); + size_t numSequences = startPositions_->getSize() - 1; if (inputGrad) { AsyncGpuBlock asyncGpuBlock; diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c9f19b7d3b66b3ac031135c04a96ffe27245aa01 --- /dev/null +++ b/paddle/gserver/layers/SequencePoolLayer.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/utils/Logging.h" +#include "SequencePoolLayer.h" + +namespace paddle { + +bool SequencePoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + // seqlastins/max/average layer should have exactly 1 input + CHECK_EQ(1U, inputLayers_.size()); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + // transform to which sequence type + if (config_.trans_type() == "non-seq") { + type_ = kNonSeq; + } else if (config_.trans_type() == "seq") { + type_ = kSeq; + } else { + LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); + } + setNeedSequenceInfo(false); + return true; +} + +void SequencePoolLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& input = getInput(0); + newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences(); + size_t dim = getSize(); + // check + CHECK_EQ(dim, input.value->getWidth()); + startPositions_ = + type_ ? 
input.subSequenceStartPositions : input.sequenceStartPositions; + auto starts = startPositions_->getVector(false); + CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize()); + CHECK_EQ(newBatchSize_, starts->getSize() - 1); + + resetOutput(newBatchSize_, dim); + if (type_) { + CHECK(input.subSequenceStartPositions) + << "when trans_type = seq, input must hasSubseq"; + } + /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, + * thus, in this case, output_ has no sequenceStartPositions. + * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this + * case, we should compute the new sequenceStartPositions. + */ + if (type_) { + output_.degradeSequence(input, useGpu_); + } +} + +void SequencePoolLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { backwardActivation(); } + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + // Increasing the number of gradient + biases_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..669af80e1d447a9150b450f9fca4456c89ed2c36 --- /dev/null +++ b/paddle/gserver/layers/SequencePoolLayer.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" + +namespace paddle { +/** + * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer. + * + * Input: one or more sequences. Each sequence contains some instances. + * If SequenceLevel = kNonSeq: + * Output: output size is the number of input sequences (NOT input instances) + * output[i] = seqlastin/average/max_{for each instance in this + * sequence}{input[i]} + * If SequenceLevel = kSeq: + * Check input sequence must has sub-sequence + * Output: output size is the number of input sub-sequences + * output[i] = seqlastin/average/max_{for each instance in this + * sub-sequence}{input[i]} + * + * The config file api is pooling_layer. 
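Concretely, one output row pools the input rows of one (sub-)sequence, rows starts[i] through starts[i+1] - 1. The AverageLayer case, for instance, reduces to the following reference sketch:

#include <cstddef>

// out.row(s) = mean of input rows starts[s] .. starts[s+1]-1.
void sequenceAvgPoolRef(const float* in, float* out, size_t dim,
                        const int* starts, size_t numSeqs) {
  for (size_t s = 0; s < numSeqs; ++s) {
    const int begin = starts[s];
    const int end = starts[s + 1];
    for (size_t j = 0; j < dim; ++j) {
      float sum = 0;
      for (int r = begin; r < end; ++r) sum += in[r * dim + j];
      out[s * dim + j] = sum / (end - begin);
    }
  }
}

MaxLayer replaces the mean with a max, and SequenceLastInstanceLayer (above) with a single row read.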
+ */ + +class SequencePoolLayer : public Layer { +protected: + int type_; + std::unique_ptr biases_; + enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; + size_t newBatchSize_; + ICpuGpuVectorPtr startPositions_; + +public: + explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {} + + virtual ~SequencePoolLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp index 05766706b002c0ab1a1ee3d5c34f134985a975eb..5ca9b8b300161688817234909f2b875801d90995 100644 --- a/paddle/gserver/layers/SequenceReshapeLayer.cpp +++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -69,8 +68,7 @@ void SequenceReshapeLayer::forward(PassType passType) { size_t outDim = getSize(); size_t numSequences = input.getNumSequences(); - auto startPositions = - input.sequenceStartPositions->getVector(false); + auto startPositions = input.sequenceStartPositions->getVector(false); const int* starts = startPositions->getData(); CHECK_EQ(starts[numSequences], input.getBatchSize()); @@ -96,9 +94,7 @@ void SequenceReshapeLayer::forward(PassType passType) { // modify the sequenceStartPositions ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, - numSequences + 1, - false); + output_.sequenceStartPositions, numSequences + 1, false); int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); @@ -134,8 +130,11 @@ void SequenceReshapeLayer::backward(const UpdateCallback& callback) { REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str()); if (inputGrad) { - Matrix::resizeOrCreate(reshapedOutputGrad, inputGrad->getHeight(), - inputGrad->getWidth(), false, useGpu_); + Matrix::resizeOrCreate(reshapedOutputGrad, + inputGrad->getHeight(), + inputGrad->getWidth(), + false, + useGpu_); reshapedOutputGrad->copyFrom(*outputGrad); inputGrad->add(*reshapedOutputGrad); } diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp index 88eace28b2afff982614375da8c1dd03ab324fdc..04402db9c8af2f51f30a09cbf1e9c4023fe3e531 100644 --- a/paddle/gserver/layers/SequenceToBatch.cpp +++ b/paddle/gserver/layers/SequenceToBatch.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "SequenceToBatch.h" @@ -21,8 +20,10 @@ limitations under the License. 
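SequenceReshapeLayer keeps the flat data and re-buckets it into rows of the output width, so the start offsets must be rescaled to the new row size. A sketch of the presumed recomputation behind the tgtBuf fill, assuming every sequence boundary lands on a whole output row (the formula is inferred, not shown in this hunk):

#include <cstddef>

// Rescale start offsets when rows of width inDim are re-read as rows
// of width outDim; starts and tgtStarts both have numSeqs + 1 entries.
void reshapeStartsRef(const int* starts, int* tgtStarts, size_t numSeqs,
                      size_t inDim, size_t outDim) {
  for (size_t i = 0; i <= numSeqs; ++i) {
    tgtStarts[i] = static_cast<int>(starts[i] * inDim / outDim);
  }
}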
*/ namespace paddle { -void SequenceToBatch::resizeOrCreateBatch(int batchSize, size_t numSequences, - const int *seqStarts, bool reversed, +void SequenceToBatch::resizeOrCreateBatch(int batchSize, + size_t numSequences, + const int *seqStarts, + bool reversed, bool prevBatchState) { CHECK_EQ(seqStarts[numSequences], batchSize); IVector::resizeOrCreate(seq2BatchIdx_, batchSize, useGpu_); @@ -50,7 +51,8 @@ void SequenceToBatch::resizeOrCreateBatch(int batchSize, size_t numSequences, int length = seqStarts[seqId + 1] - seqStarts[seqId]; seqStartAndLength.emplace_back(seqStarts[seqId], length, seqId); } - std::sort(seqStartAndLength.begin(), seqStartAndLength.end(), + std::sort(seqStartAndLength.begin(), + seqStartAndLength.end(), [](SeqStartAndLength a, SeqStartAndLength b) { return a.length_ > b.length_; }); @@ -122,15 +124,19 @@ void SequenceToBatch::resizeOrCreateBatch(int batchSize, size_t numSequences, } void SequenceToBatch::resizeOrCreate(Matrix &seqValue) { - Matrix::resizeOrCreate(batchValue_, seqValue.getHeight(), seqValue.getWidth(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(batchValue_, + seqValue.getHeight(), + seqValue.getWidth(), + /* trans= */ false, + useGpu_); } MatrixPtr SequenceToBatch::getBatchValue(int batchId, int numRows) { return getBatchValue(*batchValue_, batchId, numRows); } -MatrixPtr SequenceToBatch::getBatchValue(Matrix &batchValue, int batchId, +MatrixPtr SequenceToBatch::getBatchValue(Matrix &batchValue, + int batchId, int numRows) { int *batchStartPositions = batchStartPositions_->getData(); int start = batchStartPositions[batchId]; @@ -151,7 +157,8 @@ void SequenceToBatch::getSeqOutputFromBatch(Matrix &sequence, Matrix &batch) { sequence2BatchCopy(sequence, batch, *seqEndIdxInBatch_, true); } -void SequenceToBatch::sequence2BatchCopy(Matrix &batch, Matrix &sequence, +void SequenceToBatch::sequence2BatchCopy(Matrix &batch, + Matrix &sequence, IVector &seq2BatchIdx, bool seq2batch) { int seqWidth = sequence.getWidth(); @@ -161,23 +168,27 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch, Matrix &sequence, int *idxData = seq2BatchIdx.getData(); if (useGpu_) { - hl_sequence2batch_copy(batchData, seqData, idxData, seqWidth, - batchCount, seq2batch); + hl_sequence2batch_copy( + batchData, seqData, idxData, seqWidth, batchCount, seq2batch); } else { for (int i = 0; i < batchCount; ++i) { if (seq2batch) { - memcpy(batch.rowBuf(i), sequence.rowBuf(idxData[i]), + memcpy(batch.rowBuf(i), + sequence.rowBuf(idxData[i]), seqWidth * sizeof(real)); } else { - memcpy(sequence.rowBuf(idxData[i]), batch.rowBuf(i), + memcpy(sequence.rowBuf(idxData[i]), + batch.rowBuf(i), seqWidth * sizeof(real)); } } } } -void SequenceToBatch::sequence2BatchAdd(Matrix &batch, Matrix &sequence, - IVector &seq2BatchIdx, bool seq2batch) { +void SequenceToBatch::sequence2BatchAdd(Matrix &batch, + Matrix &sequence, + IVector &seq2BatchIdx, + bool seq2batch) { int seqWidth = sequence.getWidth(); int batchCount = batch.getHeight(); real *batchData = batch.getData(); @@ -185,8 +196,8 @@ void SequenceToBatch::sequence2BatchAdd(Matrix &batch, Matrix &sequence, int *idxData = seq2BatchIdx.getData(); if (useGpu_) { - hl_sequence2batch_add(batchData, seqData, idxData, seqWidth, - batchCount, seq2batch); + hl_sequence2batch_add( + batchData, seqData, idxData, seqWidth, batchCount, seq2batch); } else { for (int i = 0; i < batchCount; ++i) { if (seq2batch) { @@ -199,8 +210,11 @@ void SequenceToBatch::sequence2BatchAdd(Matrix &batch, Matrix &sequence, } void 
SequenceToBatch::copyFromSeq(Matrix &seqValue) { - Matrix::resizeOrCreate(batchValue_, seqValue.getHeight(), seqValue.getWidth(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(batchValue_, + seqValue.getHeight(), + seqValue.getWidth(), + /* trans= */ false, + useGpu_); sequence2BatchCopy(*batchValue_, seqValue, *seq2BatchIdx_, true); } @@ -208,12 +222,14 @@ void SequenceToBatch::copyBackSeq(Matrix &seqValue) { sequence2BatchCopy(*batchValue_, seqValue, *seq2BatchIdx_, false); } -void SequenceToBatch::copy(Matrix &seqValue, Matrix &batchValue, +void SequenceToBatch::copy(Matrix &seqValue, + Matrix &batchValue, bool seq2batch) { sequence2BatchCopy(batchValue, seqValue, *seq2BatchIdx_, seq2batch); } -void SequenceToBatch::add(Matrix &seqValue, Matrix &batchValue, +void SequenceToBatch::add(Matrix &seqValue, + Matrix &batchValue, bool seq2batch) { sequence2BatchAdd(batchValue, seqValue, *seq2BatchIdx_, seq2batch); } diff --git a/paddle/gserver/layers/SequenceToBatch.h b/paddle/gserver/layers/SequenceToBatch.h index 8cba7ea3b98c3a7774f331ce88160cb9a7a89743..6bc12f207ee3fadbd2a75ca5a5dbb7ce199cc99b 100644 --- a/paddle/gserver/layers/SequenceToBatch.h +++ b/paddle/gserver/layers/SequenceToBatch.h @@ -43,8 +43,10 @@ public: explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {} /* resize and calculate the batchIndex_ */ - void resizeOrCreateBatch(int batchSize, size_t numSequences, - const int *seqStarts, bool reversed, + void resizeOrCreateBatch(int batchSize, + size_t numSequences, + const int *seqStarts, + bool reversed, bool prevBatchState = false); /* sequence matrix and batch matrix copy: @@ -81,9 +83,13 @@ public: } protected: - void sequence2BatchCopy(Matrix &batch, Matrix &sequence, - IVector &seq2BatchIdx, bool seq2batch); - void sequence2BatchAdd(Matrix &batch, Matrix &sequence, IVector &seq2BatchIdx, + void sequence2BatchCopy(Matrix &batch, + Matrix &sequence, + IVector &seq2BatchIdx, + bool seq2batch); + void sequence2BatchAdd(Matrix &batch, + Matrix &sequence, + IVector &seq2BatchIdx, bool seq2batch); IVectorPtr batchStartPositions_; diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp index af5fccf6506b6d37faaa030fc2696ac29586908f..dd6ffcd50b01cfa56ee9fbc428ffc2cb9b73ce17 100644 --- a/paddle/gserver/layers/SlopeInterceptLayer.cpp +++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -21,7 +20,8 @@ limitations under the License. */ namespace paddle { /** - * @brief A layer for applying a slope and an intercept to the input element-wise. + * @brief A layer for applying a slope and an intercept to the input + * element-wise. * This layer is used in NEURAL TURING MACHINE. * @note There is no activation and weight in this layer. * @@ -29,7 +29,8 @@ namespace paddle { * y = ax + b * \f] * - * Here, a is scale and b is offset, which are provided as attributes of the layer. + * Here, a is scale and b is offset, which are provided as attributes of the + * layer. * * The config file api is slope_intercept_layer. 
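 *
 * For example, a config-side sketch (hedged: assuming the
 * slope_intercept_layer helper keeps the slope/intercept keyword names
 * used in its description here):
 *
 *   scaled = slope_intercept_layer(input=prev_layer, slope=2.0, intercept=1.0)
 *
 * which computes y = 2 * x + 1 for every element of prev_layer.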
*/ diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9609919695853552ed54d8d55e8a669002fa3147 --- /dev/null +++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "SpatialPyramidPoolLayer.h" + +namespace paddle { + +REGISTER_LAYER(spp, SpatialPyramidPoolLayer); + +ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW, + size_t imgSizeH, + size_t channels, + size_t pyramidLevel, + std::string& poolType) { + ProjectionConfig config; + config.set_type("pool"); + PoolConfig* conf = config.mutable_pool_conf(); + conf->set_channels(channels); + conf->set_img_size(imgSizeW); + conf->set_img_size_y(imgSizeH); + conf->set_pool_type(poolType); + + int numBins = std::pow(2, pyramidLevel); + + int sizeH = std::ceil(imgSizeH / static_cast<double>(numBins)); + int paddingH = (sizeH * numBins - imgSizeH + 1) / 2; + int outSizeH = outputSize(imgSizeH, sizeH, paddingH, sizeH, true); + + int sizeW = std::ceil(imgSizeW / static_cast<double>(numBins)); + int paddingW = (sizeW * numBins - imgSizeW + 1) / 2; + int outSizeW = outputSize(imgSizeW, sizeW, paddingW, sizeW, true); + + conf->set_stride(sizeW); + conf->set_stride_y(sizeH); + conf->set_size_x(sizeW); + conf->set_size_y(sizeH); + conf->set_padding(paddingW); + conf->set_padding_y(paddingH); + conf->set_output_x(outSizeW); + conf->set_output_y(outSizeH); + config.set_output_size(outSizeH * outSizeW * channels); + return config; +} + +size_t SpatialPyramidPoolLayer::getSize() { + CHECK_EQ(inputLayers_.size(), 1UL); + size_t layerSize = 0; + const SppConfig& sppConf = config_.inputs(0).spp_conf(); + imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imgSizeH_ == 0) { + imgSizeH_ = sppConf.has_img_size_y() ? sppConf.img_size_y() : imgSizeW_; + } + if (imgSizeW_ == 0) { + imgSizeW_ = sppConf.img_size(); + } + + size_t outputH = 1; + size_t outputW = (std::pow(4, pyramidHeight_) - 1) / (4 - 1); + + layerSize = outputH * outputW * channels_; + return layerSize; +} + +bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + CHECK_EQ(config_.inputs_size(), 1); + + const SppConfig& sppConf = config_.inputs(0).spp_conf(); + pyramidHeight_ = sppConf.pyramid_height(); + poolType_ = sppConf.pool_type(); + + channels_ = sppConf.channels(); + imgSizeW_ = sppConf.img_size(); + imgSizeH_ = sppConf.has_img_size_y() ? 
sppConf.img_size_y() : imgSizeW_; + poolProjections_.reserve(pyramidHeight_); + projCol_.reserve(pyramidHeight_); + projOutput_.resize(pyramidHeight_); + + size_t startCol = 0; + size_t endCol = 0; + for (size_t i = 0; i < pyramidHeight_; i++) { + poolProjections_.emplace_back(PoolProjection::create( + getConfig(imgSizeW_, imgSizeH_, channels_, i, poolType_), + nullptr, + useGpu_)); + endCol += poolProjections_[i]->getOutputSize(); + projCol_.push_back(std::make_pair(startCol, endCol)); + startCol = endCol; + } + CHECK_EQ(endCol, getSize()); + return true; +} + +void SpatialPyramidPoolLayer::forward(PassType passType) { + Layer::forward(passType); + + int batchSize = getInput(0).getBatchSize(); + resetOutput(batchSize, getSize()); + for (size_t i = 0; i < pyramidHeight_; i++) { + size_t startCol = projCol_[i].first; + size_t endCol = projCol_[i].second; + projOutput_[i].value = output_.value->subColMatrix(startCol, endCol); + if (output_.grad) { + projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol); + } + } + for (size_t i = 0; i < pyramidHeight_; i++) { + poolProjections_[i]->forward(&getInput(0), &projOutput_[i], passType); + } +} + +void SpatialPyramidPoolLayer::backward(const UpdateCallback& callback) { + for (size_t i = 0; i < pyramidHeight_; i++) { + if (poolProjections_[i]) { + poolProjections_[i]->backward(callback); + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..79db574d99bdb1137e6a55244c382f9c894239c8 --- /dev/null +++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "PoolProjection.h" +#include "paddle/math/MathUtils.h" +#include "paddle/utils/Logging.h" + +namespace paddle { +/** + * @brief A layer for spatial pyramid pooling on the input image by taking + * the max, average, etc. within regions, so that the result vectors of + * differently sized images are of the same size. + * + * The config file api is spp_layer. 
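+ *
+ * For example, a config-side sketch (assuming spp_layer forwards
+ * pyramid_height and pool_type into the SppConfig fields read in
+ * SpatialPyramidPoolLayer::init):
+ *
+ *   spp = spp_layer(input=conv, num_channels=16, pyramid_height=2,
+ *                   pool_type=MaxPooling())
+ *
+ * With pyramid_height=2 the projections cover 4^0 + 4^1 = 5 bins per
+ * channel, matching the (4^h - 1) / (4 - 1) column count in getSize().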
+ */ + +class SpatialPyramidPoolLayer : public Layer { +protected: + size_t channels_; + size_t imgSizeW_; + size_t imgSizeH_; + size_t pyramidHeight_; + std::string poolType_; + + std::vector<std::unique_ptr<PoolProjection>> poolProjections_; + std::vector<Argument> projOutput_; + std::vector<std::pair<size_t, size_t>> projCol_; + +public: + explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {} + + ~SpatialPyramidPoolLayer() {} + + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + ProjectionConfig getConfig(size_t sizeX_, + size_t sizeY_, + size_t channels, + size_t pyramidLevel_, + std::string& poolType_); + size_t getSize(); + + virtual void forward(PassType passType); + virtual void backward(const UpdateCallback& callback = nullptr); +}; +} // namespace paddle diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp index ccf65ba649f21478ae20902ccd8db0a4734e22e2..664f9e13c055df08552974048428326644b69a6e 100644 --- a/paddle/gserver/layers/SubSequenceLayer.cpp +++ b/paddle/gserver/layers/SubSequenceLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -75,18 +74,15 @@ void SubSequenceLayer::forward(PassType passType) { const Argument& input = getInput(0); size_t numSequences1 = input.getNumSequences(); - auto startPositions1 = - input.sequenceStartPositions->getVector(false); + auto startPositions1 = input.sequenceStartPositions->getVector(false); const Argument& offsetSeq = getInput(1); size_t numSequences2 = offsetSeq.getNumSequences(); - auto startPositions2 = - offsetSeq.sequenceStartPositions->getVector(false); + auto startPositions2 = offsetSeq.sequenceStartPositions->getVector(false); const Argument& sizeSeq = getInput(2); size_t numSequences3 = sizeSeq.getNumSequences(); - auto startPositions3 = - sizeSeq.sequenceStartPositions->getVector(false); + auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false); CHECK_EQ(dim, input.value->getWidth()); @@ -143,8 +139,8 @@ void SubSequenceLayer::forward(PassType passType) { } // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions, - numSequences1 + 1, false); + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences1 + 1, false); int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); int offset = 0; @@ -177,8 +173,7 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) { MatrixPtr inputGrad1 = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); - auto startPositions1 = - getInput(0).sequenceStartPositions->getVector(false); + auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); size_t numSequences1 = startPositions1->getSize() - 1; const int* starts1 = startPositions1->getData(); diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp index 7b61dd08227253c6ac8bbd44c4a852c972762fe0..bcf39168408d2bac50c17d0e22ed747cf0b33d80 100644 --- a/paddle/gserver/layers/SumToOneNormLayer.cpp +++ b/paddle/gserver/layers/SumToOneNormLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -21,7 +20,7 @@ limitations under the License. */ namespace paddle { /** - * A layer for sum-to-one normalization, + * A layer for sum-to-one normalization, * which is used in NEURAL TURING MACHINE. * \f[ * out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]} diff --git a/paddle/gserver/layers/TableProjection.cpp b/paddle/gserver/layers/TableProjection.cpp index 947d8cf9be1b4a6a5ce87bdcc57aa3c23967393e..2bc0d329d9605850ecdce6b4a87351579493d834 100644 --- a/paddle/gserver/layers/TableProjection.cpp +++ b/paddle/gserver/layers/TableProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "TableProjection.h" namespace paddle { @@ -20,7 +19,8 @@ namespace paddle { REGISTER_PROJECTION(table, TableProjection); TableProjection::TableProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu) + const ParameterPtr& parameter, + bool useGpu) : Projection(config, parameter, useGpu) { table_.reset( new Weight(config.input_size(), config.output_size(), parameter)); diff --git a/paddle/gserver/layers/TableProjection.h b/paddle/gserver/layers/TableProjection.h index eadf2de623cdf2990bc731cefcac66958c61a311..97c672508a009735a9a8f9980b715881c1f824a2 100644 --- a/paddle/gserver/layers/TableProjection.h +++ b/paddle/gserver/layers/TableProjection.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Projection.h" @@ -34,7 +33,8 @@ namespace paddle { */ class TableProjection : public Projection { public: - TableProjection(const ProjectionConfig& config, const ParameterPtr& parameter, + TableProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, bool useGpu); /** * If use sparse row matrix as parameter, prefetch feature ids in input label. diff --git a/paddle/gserver/layers/TensorLayer.cpp b/paddle/gserver/layers/TensorLayer.cpp index 84fe9005b003db65e1ae9072669c215a961556ab..03586cc6ff3d148a63af33d89b85d565e2198057 100644 --- a/paddle/gserver/layers/TensorLayer.cpp +++ b/paddle/gserver/layers/TensorLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "TensorLayer.h" #include "paddle/utils/Logging.h" @@ -72,7 +71,9 @@ void TensorLayer::forward(PassType passType) { MatrixPtr input1 = getInputValue(0); MatrixPtr input2 = getInputValue(1); MatrixPtr tmpMat = Matrix::create(input2->getHeight(), - input2->getWidth(), /* trans= */ false, input2->useGpu()); + input2->getWidth(), + /* trans= */ false, + input2->useGpu()); REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str()); for (size_t i = 0; i < getSize(); ++i) { MatrixPtr weights = weights_[i]->getW(); @@ -101,7 +102,9 @@ void TensorLayer::backward(const UpdateCallback& callback) { MatrixPtr input2 = getInputValue(1); MatrixPtr oGrad = getOutputGrad(); MatrixPtr tmpMat = Matrix::create(input1->getHeight(), - input1->getWidth(), /* trans= */ false, input1->useGpu()); + input1->getWidth(), + /* trans= */ false, + input1->useGpu()); /* trans(grad * e1) * e2 */ { REGISTER_TIMER_INFO("TensorGradMulTimer", getName().c_str()); diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h index 83b87b1307ac1faa5511b69aa89c6482cbfd9d44..9ac651de4d99a23a12394c674bda827e935749b9 100644 --- a/paddle/gserver/layers/TensorLayer.h +++ b/paddle/gserver/layers/TensorLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/TransLayer.cpp b/paddle/gserver/layers/TransLayer.cpp index f8827bec63a9bc0aa7391906af82d5053b9ccca3..53a24d4cc4633898cff1b56f5a377959a38f6354 100644 --- a/paddle/gserver/layers/TransLayer.cpp +++ b/paddle/gserver/layers/TransLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "TransLayer.h" namespace paddle { diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h index 867ccb4d1950cf6b9f5e6da01a11b0abfed14072..25b091f9f414ead5048cd65cfc16b67ae1387ad9 100644 --- a/paddle/gserver/layers/TransLayer.h +++ b/paddle/gserver/layers/TransLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp index 6e3f6bf2e496cf2e1a4bada5a9dc621024b08996..c883283f782352e674d0fcf0369e8491e31d60ff 100644 --- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp +++ b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Stat.h" #include "Projection.h" @@ -27,7 +26,8 @@ namespace paddle { class TransposedFullMatrixProjection : public Projection { public: TransposedFullMatrixProjection(const ProjectionConfig& config, - ParameterPtr parameter, bool useGPu); + ParameterPtr parameter, + bool useGPu); virtual void forward(); virtual void backward(const UpdateCallback& callback); diff --git a/paddle/gserver/layers/ValidationLayer.cpp b/paddle/gserver/layers/ValidationLayer.cpp index 48a7b54338fca36095d9cd4af49e09b7fb22dfdf..0fee4bd2463ac86dfcb5ecc0b5e75564d86971d2 100644 --- a/paddle/gserver/layers/ValidationLayer.cpp +++ b/paddle/gserver/layers/ValidationLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include @@ -68,8 +67,11 @@ void AucValidation::validationImp(MatrixPtr output, IVectorPtr label) { if (dynamic_cast(output.get())) { size_t height = output->getHeight(); size_t width = output->getWidth(); - Matrix::resizeOrCreate(cpuOutput_, height, width, - /* trans=*/false, /* useGpu=*/false); + Matrix::resizeOrCreate(cpuOutput_, + height, + width, + /* trans=*/false, + /* useGpu=*/false); cpuOutput_->copyFrom(*output); IVector::resizeOrCreate(cpuLabel_, height, false); cpuLabel_->copyFrom(*label); diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index ff2abf76973174ac2a437830b234f4c9937c08ed..0651d0b4733ea9c3f54a42169774217b65091aa6 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -20,6 +20,21 @@ add_unittest_without_exec(test_LayerGrad add_test(NAME test_LayerGrad COMMAND test_LayerGrad) +add_unittest_without_exec(test_ActivationGrad + test_ActivationGrad.cpp + LayerGradUtil.cpp + TestUtil.cpp) +add_test(NAME test_ActivationGrad + COMMAND test_ActivationGrad) +################# test_ConvTrans ####################### +add_unittest_without_exec(test_ConvTrans + test_ConvTrans.cpp + LayerGradUtil.cpp + TestUtil.cpp) + +add_test(NAME test_ConvTrans + COMMAND test_ConvTrans) + ################## test_Evaluator ####################### add_unittest(test_Evaluator test_Evaluator.cpp diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 552a6c5b41c7f896c52b2132578b136200967573..47575169172832cd3f95a53ed6e4dcb87a5b7a4b 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "LayerGradUtil.h" P_DECLARE_bool(thread_local_rand_use_global_seed); @@ -28,8 +27,13 @@ real getCostSum(LayerPtr& testLayer, MatrixPtr weights) { return Argument::sumCosts(outArgs); } -real getDiffAndPrint(real newCost1, real newCost2, real callbackCount, - char fill, string testLayerName, string name, real step, +real getDiffAndPrint(real newCost1, + real newCost2, + real callbackCount, + char fill, + string testLayerName, + string name, + real step, real delta) { EXPECT_FALSE(std::isnan(newCost1)); EXPECT_FALSE(std::isnan(newCost2)); @@ -49,7 +53,8 @@ real getDiffAndPrint(real newCost1, real newCost2, real callbackCount, return diff; } -void testState(LayerPtr testLayer, vector& dataLayers, +void testState(LayerPtr testLayer, + vector& dataLayers, vector& datas) { auto batchSize = datas[0].getBatchSize(); Argument data; @@ -82,8 +87,8 @@ void testState(LayerPtr testLayer, vector& dataLayers, data.value = datas[j].value->subMatrix(batchId, 1); } if (datas[j].ids) { - data.ids = IVector::create(datas[j].ids->getData() + batchId, 1, - FLAGS_use_gpu); + data.ids = IVector::create( + datas[j].ids->getData() + batchId, 1, FLAGS_use_gpu); } dataLayers[j]->setData(data); dataLayers[j]->forward(PASS_TEST); @@ -128,7 +133,8 @@ void testState(LayerPtr testLayer, vector& dataLayers, } } -void testBatchState(LayerPtr testLayer, vector& dataLayers, +void testBatchState(LayerPtr testLayer, + vector& dataLayers, vector& datas) { auto batchSize = datas[0].getBatchSize(); Argument data; @@ -192,8 +198,10 @@ void testBatchState(LayerPtr testLayer, vector& dataLayers, splitData.sequenceStartPositions = cpuSeqStartPos; for (size_t j = 0; j < datas.size(); ++j) { if (datas[j].value) { - Matrix::resizeOrCreate(splitData.value, splitBatchSize, - datas[j].value->getWidth(), false, + Matrix::resizeOrCreate(splitData.value, + splitBatchSize, + datas[j].value->getWidth(), + false, FLAGS_use_gpu); for (size_t seqId = 0; seqId < numSequences; ++seqId) { if (seqLens[seqId]) { @@ -268,8 +276,10 @@ void initWeight(MatrixPtr& weights) { weights->copyFrom(*tmpMat); } -void initBatchState(LayerPtr dataLayer, LayerPtr testLayer, - LayerStatePtr state, bool useGpu) { +void initBatchState(LayerPtr dataLayer, + LayerPtr testLayer, + LayerStatePtr state, + bool useGpu) { int sequenceNum = dataLayer->getOutput().getNumSequences(); MatrixPtr prevBatchOutput = Matrix::create(sequenceNum, testLayer->getSize(), false, useGpu); @@ -282,9 +292,13 @@ void initBatchState(LayerPtr dataLayer, LayerPtr testLayer, state->value.push_back(prevBatchState); } -void initDataLayer(TestConfig testConf, std::vector* dataLayers, - vector* datas, LayerMap* layerMap, - string testLayerName, size_t batchSize, bool trans, +void initDataLayer(TestConfig testConf, + std::vector* dataLayers, + vector* datas, + LayerMap* layerMap, + string testLayerName, + size_t batchSize, + bool trans, bool useGpu) { ICpuGpuVectorPtr sequenceStartPositions; ICpuGpuVectorPtr subSequenceStartPositions; @@ -328,13 +342,17 @@ void initDataLayer(TestConfig testConf, std::vector* dataLayers, break; case INPUT_SPARSE_NON_VALUE_DATA: data.value = makeRandomSparseMatrix( - batchSize, layer->getSize(), - /* withValue= */ false, useGpu, + batchSize, + layer->getSize(), + /* withValue= */ false, + useGpu, testConf.inputDefs[i].sparse.equalNnzPerSample); break; case INPUT_SPARSE_FLOAT_VALUE_DATA: - data.value = makeRandomSparseMatrix(batchSize, layer->getSize(), - /* withValue= */ true, useGpu); + data.value = makeRandomSparseMatrix(batchSize, + layer->getSize(), + /* 
withValue= */ true, + useGpu); break; case INPUT_DENSE_DIM_DATA: fillData(trans, layer->getSize(), numSequence); @@ -379,16 +397,21 @@ void initDataLayer(TestConfig testConf, std::vector* dataLayers, } } -void initTestLayer(TestConfig testConf, LayerMap* layerMap, - std::vector* parameters, LayerPtr* testLayer) { +void initTestLayer(TestConfig testConf, + LayerMap* layerMap, + std::vector* parameters, + LayerPtr* testLayer) { ParameterMap parameterMap; size_t index = 0; LayerConfig testConfig = testConf.layerConfig; CHECK_EQ(testConf.inputDefs.size(), size_t(testConf.layerConfig.inputs_size())); - auto initParameter = [&](string paraName, size_t paraSize, bool isStatic, - bool initialize, ParameterConfig paraConfig) { + auto initParameter = [&](string paraName, + size_t paraSize, + bool isStatic, + bool initialize, + ParameterConfig paraConfig) { paraConfig.set_name(paraName); paraConfig.set_size(paraSize); paraConfig.set_initial_std(1); @@ -431,8 +454,11 @@ void initTestLayer(TestConfig testConf, LayerMap* layerMap, if (testConf.biasSize) { testConfig.set_bias_parameter_name("bias"); ParameterConfig paraConfig; - initParameter(testConfig.bias_parameter_name(), testConf.biasSize, - testConf.staticBias, true, paraConfig); + initParameter(testConfig.bias_parameter_name(), + testConf.biasSize, + testConf.staticBias, + true, + paraConfig); } *testLayer = Layer::create(testConfig); @@ -441,9 +467,13 @@ void initTestLayer(TestConfig testConf, LayerMap* layerMap, (*testLayer)->setNeedGradient(true); } -void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, - const LayerStatePtr state, real cost, - real callbackCount, real* maxDiff, LayerPtr testLayer, +void testPerturbParameter(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, std::vector* parameters) { char fill = ' '; for (auto& parameter : *parameters) { @@ -481,9 +511,14 @@ void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, parameter->setValueUpdated(); newCost[k] = getCostSum(testLayer, weights); } - real diff = getDiffAndPrint(newCost[0], newCost[1], callbackCount, fill, - testLayer->getName(), parameter->getName(), - step, delta); + real diff = getDiffAndPrint(newCost[0], + newCost[1], + callbackCount, + fill, + testLayer->getName(), + parameter->getName(), + step, + delta); *maxDiff = std::max(*maxDiff, abs(diff)); // restore parameter parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara); @@ -492,9 +527,13 @@ void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, } } -void testPerturbInput(TestConfig testConf, const MatrixPtr weights, - const LayerStatePtr state, real cost, real callbackCount, - real* maxDiff, LayerPtr testLayer, +void testPerturbInput(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, std::vector dataLayers) { char fill = ' '; for (size_t index = 0; index < testConf.inputDefs.size(); index++) { @@ -539,9 +578,14 @@ void testPerturbInput(TestConfig testConf, const MatrixPtr weights, newCost[k] = getCostSum(testLayer, weights); } - real diff = getDiffAndPrint(newCost[0], newCost[1], callbackCount, fill, + real diff = getDiffAndPrint(newCost[0], + newCost[1], + callbackCount, + fill, testLayer->getName(), - dataLayers[index]->getName(), step, delta); + dataLayers[index]->getName(), + step, + delta); *maxDiff = std::max(*maxDiff, abs(diff)); // restore parameter 
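      // (testPerturbInput mirrors testPerturbParameter above: it perturbs
      // the input values instead of the weights, folds the relative cost
      // change into maxDiff, and then restores the saved copy so later
      // iterations start from unperturbed data.)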
outV->copyFrom(oldPara); @@ -549,9 +593,13 @@ void testPerturbInput(TestConfig testConf, const MatrixPtr weights, } } -void testLayerGradKernel(TestConfig testConf, string testLayerName, - size_t batchSize, bool trans, bool useGpu, - bool useWeight, float epsilon) { +void testLayerGradKernel(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight, + float epsilon) { #ifdef PADDLE_ONLY_CPU if (useGpu) return; #endif @@ -566,8 +614,14 @@ void testLayerGradKernel(TestConfig testConf, string testLayerName, std::vector dataLayers; LayerMap layerMap; vector datas; - initDataLayer(testConf, &dataLayers, &datas, &layerMap, testLayerName, - batchSize, trans, useGpu); + initDataLayer(testConf, + &dataLayers, + &datas, + &layerMap, + testLayerName, + batchSize, + trans, + useGpu); // test layer initialize std::vector parameters; LayerPtr testLayer; @@ -620,17 +674,28 @@ void testLayerGradKernel(TestConfig testConf, string testLayerName, ++callbackCount; } for (size_t i = 0; i < parameters.size(); ++i) { - EXPECT_EQ(parameters[i]->isStatic() ? 0 : callbackCount, - callbackFlags[i]); + EXPECT_EQ(parameters[i]->isStatic() ? 0 : callbackCount, callbackFlags[i]); } // Test whether the layer's forward calculation is stable // by adding perturbation to its parameters or its input layers real maxDiff = 0; - testPerturbParameter(testConf, weights, state, cost, callbackCount, &maxDiff, - testLayer, ¶meters); - testPerturbInput(testConf, weights, state, cost, callbackCount, &maxDiff, - testLayer, dataLayers); + testPerturbParameter(testConf, + weights, + state, + cost, + callbackCount, + &maxDiff, + testLayer, + ¶meters); + testPerturbInput(testConf, + weights, + state, + cost, + callbackCount, + &maxDiff, + testLayer, + dataLayers); EXPECT_LE(fabs(maxDiff), epsilon); if (testConf.testState) { @@ -641,10 +706,15 @@ void testLayerGradKernel(TestConfig testConf, string testLayerName, } } -void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, - bool trans, bool useGpu, bool useWeight, float epsilon) { - testLayerGradKernel(testConf, testLayerName, batchSize, trans, useGpu, - useWeight, epsilon); +void testLayerGrad(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight, + float epsilon) { + testLayerGradKernel( + testConf, testLayerName, batchSize, trans, useGpu, useWeight, epsilon); bool isStaticTest = false; LayerConfig testConfig = testConf.layerConfig; for (size_t i = 0; i < testConf.inputDefs.size(); i++) { @@ -662,19 +732,26 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, isStaticTest = true; } if (isStaticTest) { - testLayerGradKernel(testConf, testLayerName, batchSize, trans, useGpu, - useWeight, epsilon); + testLayerGradKernel( + testConf, testLayerName, batchSize, trans, useGpu, useWeight, epsilon); } } -void testProjectionGrad(ProjectionConfig conf, InputType inputType, - size_t parameterSize, size_t batchSize, bool useGpu, - bool testState) { +void testProjectionGrad(ProjectionConfig conf, + InputType inputType, + size_t parameterSize, + size_t batchSize, + bool useGpu, + bool testState, + int biasSize, + bool sharedBias) { TestConfig config; conf.set_name(conf.type()); config.layerConfig.set_type("mixed"); config.layerConfig.set_size(conf.output_size()); - config.biasSize = config.layerConfig.size(); + config.biasSize = biasSize == 0 ? 
config.layerConfig.size() : biasSize; + config.layerConfig.set_bias_size(config.biasSize); + config.layerConfig.set_shared_biases(sharedBias); config.inputDefs.push_back( {inputType, "layer_0", conf.input_size(), parameterSize}); *config.layerConfig.add_inputs()->mutable_proj_conf() = conf; @@ -682,8 +759,11 @@ void testProjectionGrad(ProjectionConfig conf, InputType inputType, testLayerGrad(config, "mixed", batchSize, false, useGpu); } -void testOperatorGrad(TestConfig& config, OperatorConfig& operatorConf, - size_t batchSize, bool useGpu, bool testState) { +void testOperatorGrad(TestConfig& config, + OperatorConfig& operatorConf, + size_t batchSize, + bool useGpu, + bool testState) { config.layerConfig.set_type("mixed"); operatorConf.set_output_size(config.layerConfig.size()); diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 1e608dc0620abd4fca5d7aa6a235daff13c41fb7..a061c7fc533ff2c639ceda4db6d89a33fd3f0435 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -72,7 +72,10 @@ struct InputDef { sparse = {""}; isStatic = false; } - InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn, + InputDef(InputType type, + string nameIn, + size_t dimIn, + size_t sizeIn, ParaSparse sparseIn) { inputType = type; name = nameIn; @@ -98,11 +101,18 @@ struct TestConfig { testBatchState(false) {} }; -real getCostSum(ParameterPtr& parameter, CpuVector& cpuPara, - LayerPtr& testLayer, MatrixPtr weights = nullptr); +real getCostSum(ParameterPtr& parameter, + CpuVector& cpuPara, + LayerPtr& testLayer, + MatrixPtr weights = nullptr); -real getDiffAndPrint(real newCost1, real newCost2, real callbackCount, - char fill, string testLayerName, string name, real step, +real getDiffAndPrint(real newCost1, + real newCost2, + real callbackCount, + char fill, + string testLayerName, + string name, + real step, real delta); /** @@ -113,7 +123,8 @@ real getDiffAndPrint(real newCost1, real newCost2, real callbackCount, * @param dataLayers[in/out] dataLayers * @param datas[in/out] data of dataLayers */ -void testState(LayerPtr testLayer, vector& dataLayers, +void testState(LayerPtr testLayer, + vector& dataLayers, vector& datas); /** @@ -124,7 +135,8 @@ void testState(LayerPtr testLayer, vector& dataLayers, * @param dataLayers[in/out] dataLayers * @param datas[in/out] data of dataLayers */ -void testBatchState(LayerPtr testLayer, vector& dataLayers, +void testBatchState(LayerPtr testLayer, + vector& dataLayers, vector& datas); /** @@ -144,8 +156,10 @@ double genPerturbation(const real* oldGrad, real* newGrad, size_t dim); void initWeight(MatrixPtr& weights); -void initBatchState(LayerPtr dataLayer, LayerPtr testLayer, - LayerStatePtr state, bool useGpu); +void initBatchState(LayerPtr dataLayer, + LayerPtr testLayer, + LayerStatePtr state, + bool useGpu); /** * @brief initialize the dataLayer by its inputType @@ -155,9 +169,13 @@ void initBatchState(LayerPtr dataLayer, LayerPtr testLayer, * datas[out] initialized data of dataLayers * layerMap[out] layerMap */ -void initDataLayer(TestConfig testConf, std::vector* dataLayers, - vector* datas, LayerMap* layerMap, - string testLayerName, size_t batchSize, bool trans, +void initDataLayer(TestConfig testConf, + std::vector* dataLayers, + vector* datas, + LayerMap* layerMap, + string testLayerName, + size_t batchSize, + bool trans, bool useGpu); /** @@ -168,8 +186,10 @@ void initDataLayer(TestConfig testConf, std::vector* dataLayers, * parameters[out] parameters of testLayer * 
testLayer[out] testLayer */ -void initTestLayer(TestConfig testConf, LayerMap* layerMap, - std::vector* parameters, LayerPtr* testLayer); +void initTestLayer(TestConfig testConf, + LayerMap* layerMap, + std::vector* parameters, + LayerPtr* testLayer); /** * @brief Test whether the layer's forward calculation is stable by adding @@ -184,9 +204,13 @@ void initTestLayer(TestConfig testConf, LayerMap* layerMap, * testLayer[in/out] testLayer * parameters[in/out] parameters of testLayer */ -void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, - const LayerStatePtr state, real cost, - real callbackCount, real* maxDiff, LayerPtr testLayer, +void testPerturbParameter(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, std::vector* parameters); /** @@ -202,24 +226,44 @@ void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, * testLayer[in/out] testLayer * dataLayers[in/out] dataLayers */ -void testPerturbInput(TestConfig testConf, const MatrixPtr weights, - const LayerStatePtr state, real cost, real callbackCount, - real* maxDiff, LayerPtr testLayer, +void testPerturbInput(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, std::vector dataLayers); -void testLayerGradKernel(TestConfig testConf, string testLayerName, - size_t batchSize, bool trans, bool useGpu, - bool useWeight = false, float epsilon = 0.02); +void testLayerGradKernel(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight = false, + float epsilon = 0.02); -void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, - bool trans, bool useGpu, bool useWeight = false, +void testLayerGrad(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight = false, float epsilon = 0.02); -void testProjectionGrad(ProjectionConfig conf, InputType inputType, - size_t parameterSize, size_t batchSize, bool useGpu, - bool testState = false); +void testProjectionGrad(ProjectionConfig conf, + InputType inputType, + size_t parameterSize, + size_t batchSize, + bool useGpu, + bool testState = false, + int biasSize = 0, + bool sharedBias = false); -void testOperatorGrad(TestConfig& config, OperatorConfig& operatorConf, - size_t batchSize, bool useGpu, bool testState = false); +void testOperatorGrad(TestConfig& config, + OperatorConfig& operatorConf, + size_t batchSize, + bool useGpu, + bool testState = false); } // namespace paddle diff --git a/paddle/gserver/tests/TestUtil.cpp b/paddle/gserver/tests/TestUtil.cpp index 97fbcc8176326357fdc406a9a04a4e3a937a2105..84d516683c18551765d707f26cc7003ba3432c7f 100644 --- a/paddle/gserver/tests/TestUtil.cpp +++ b/paddle/gserver/tests/TestUtil.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "TestUtil.h" #include "paddle/utils/CommandLineParser.h" @@ -30,8 +29,11 @@ std::string randStr(const int len) { return s; } -MatrixPtr makeRandomSparseMatrix(size_t height, size_t width, bool withValue, - bool useGpu, bool equalNnzPerSample) { +MatrixPtr makeRandomSparseMatrix(size_t height, + size_t width, + bool withValue, + bool useGpu, + bool equalNnzPerSample) { std::vector ids(height); std::vector indices(height + 1); indices[0] = 0; @@ -55,8 +57,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height, size_t width, bool withValue, for (size_t i = 0; i < data.size(); ++i) { data[i].col = uniformRandom(width); } - auto mat = Matrix::createSparseMatrix(height, width, data.size(), NO_VALUE, - SPARSE_CSR, false, useGpu); + auto mat = Matrix::createSparseMatrix( + height, width, data.size(), NO_VALUE, SPARSE_CSR, false, useGpu); if (useGpu) { std::dynamic_pointer_cast(mat)->copyFrom( ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT); @@ -93,7 +95,7 @@ void generateSequenceStartPositions(size_t batchSize, } void generateSequenceStartPositions(size_t batchSize, - ICpuGpuVectorPtr& sequenceStartPositions) { + ICpuGpuVectorPtr& sequenceStartPositions) { int numSeqs; if (FLAGS_fixed_seq_length != 0) { numSeqs = std::ceil((float)batchSize / (float)FLAGS_fixed_seq_length); @@ -101,7 +103,7 @@ void generateSequenceStartPositions(size_t batchSize, numSeqs = batchSize / 10 + 1; } sequenceStartPositions = - ICpuGpuVector::create(numSeqs + 1, /* useGpu= */false); + ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false); int* buf = sequenceStartPositions->getMutableData(false); int64_t pos = 0; int len = FLAGS_fixed_seq_length; @@ -109,7 +111,8 @@ void generateSequenceStartPositions(size_t batchSize, for (int i = 0; i < numSeqs; ++i) { if (FLAGS_fixed_seq_length == 0) { len = uniformRandom( - std::min(maxLen, batchSize - pos - numSeqs + i)) + 1; + std::min(maxLen, batchSize - pos - numSeqs + i)) + + 1; } buf[i] = pos; pos += len; @@ -118,7 +121,6 @@ void generateSequenceStartPositions(size_t batchSize, buf[numSeqs] = batchSize; } - void generateSubSequenceStartPositions( const ICpuGpuVectorPtr& sequenceStartPositions, ICpuGpuVectorPtr& subSequenceStartPositions) { @@ -148,7 +150,6 @@ void generateSubSequenceStartPositions( subBuf[j] = buf[numSeqs]; } - void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions, IVectorPtr& cpuSequenceDims) { /* generate sequences with 2 dims */ @@ -174,9 +175,8 @@ void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions, } } -void generateMDimSequenceData( - const ICpuGpuVectorPtr& sequenceStartPositions, - IVectorPtr& cpuSequenceDims) { +void generateMDimSequenceData(const ICpuGpuVectorPtr& sequenceStartPositions, + IVectorPtr& cpuSequenceDims) { /* generate sequences with 2 dims */ int numSeqs = sequenceStartPositions->getSize() - 1; int numDims = 2; diff --git a/paddle/gserver/tests/TestUtil.h b/paddle/gserver/tests/TestUtil.h index 6a75f92ffe2f640fddd45d610645274a941a61c3..000f8884e8681db8f4d2a2d6454791958b964f92 100644 --- a/paddle/gserver/tests/TestUtil.h +++ b/paddle/gserver/tests/TestUtil.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -28,8 +27,11 @@ inline bool approximatelyEqual(float a, float b, float epsilon) { return fabs(a - b) <= ((fabs(a) < fabs(b) ? 
fabs(b) : fabs(a)) * epsilon); } -MatrixPtr makeRandomSparseMatrix(size_t height, size_t width, bool withValue, - bool useGpu, bool equalNnzPerSample = false); +MatrixPtr makeRandomSparseMatrix(size_t height, + size_t width, + bool withValue, + bool useGpu, + bool equalNnzPerSample = false); /** * @brief generate sequenceStartPositions for INPUT_SEQUENCE_DATA, @@ -39,10 +41,10 @@ MatrixPtr makeRandomSparseMatrix(size_t height, size_t width, bool withValue, * sequenceStartPositions[out] generation output */ void generateSequenceStartPositions(size_t batchSize, - IVectorPtr& sequenceStartPositions); + IVectorPtr& sequenceStartPositions); void generateSequenceStartPositions(size_t batchSize, - ICpuGpuVectorPtr& sequenceStartPositions); + ICpuGpuVectorPtr& sequenceStartPositions); /** * @brief generate subSequenceStartPositions for INPUT_HASSUB_SEQUENCE_DATA @@ -51,9 +53,8 @@ void generateSequenceStartPositions(size_t batchSize, * @param sequenceStartPositions[in] input * subSequenceStartPositions[out] generation output */ -void generateSubSequenceStartPositions( - const IVectorPtr& sequenceStartPositions, - IVectorPtr& subSequenceStartPositions); +void generateSubSequenceStartPositions(const IVectorPtr& sequenceStartPositions, + IVectorPtr& subSequenceStartPositions); void generateSubSequenceStartPositions( const ICpuGpuVectorPtr& sequenceStartPositions, @@ -66,12 +67,10 @@ void generateSubSequenceStartPositions( * @param sequenceStartPositions[in] input * cpuSequenceDims[out] generation output */ -void generateMDimSequenceData( - const IVectorPtr& sequenceStartPositions, - IVectorPtr& cpuSequenceDims); -void generateMDimSequenceData( - const ICpuGpuVectorPtr& sequenceStartPositions, - IVectorPtr& cpuSequenceDims); +void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions, + IVectorPtr& cpuSequenceDims); +void generateMDimSequenceData(const ICpuGpuVectorPtr& sequenceStartPositions, + IVectorPtr& cpuSequenceDims); void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b); diff --git a/paddle/gserver/tests/__init__.py b/paddle/gserver/tests/__init__.py index 7f9e87eee6037666b86420fba194624859d356b3..c90af2ee000d46a032984ee23559e7e99b49ddad 100644 --- a/paddle/gserver/tests/__init__.py +++ b/paddle/gserver/tests/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - diff --git a/paddle/gserver/tests/img_conv_a.conf b/paddle/gserver/tests/img_conv_a.conf new file mode 100644 index 0000000000000000000000000000000000000000..940589ed9ac242d6a73a74c9be39fcaafe66b7be --- /dev/null +++ b/paddle/gserver/tests/img_conv_a.conf @@ -0,0 +1,39 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
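+#
+# A small convolution/concat network for the gserver tests; it appears to
+# pair with img_conv_b.conf below, which builds the same topology from
+# conv_projection inside mixed_layer so the two implementations can be
+# compared.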
+ +from paddle.trainer_config_helpers import * + +settings(batch_size=10) +data = data_layer(name ="input", size=8*16*16) +conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) +conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) + +concat = concat_layer(input=[conv1, conv2]) + +conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=True, + act=LinearActivation()) + +outputs(concat, conv) diff --git a/paddle/gserver/tests/img_conv_b.conf b/paddle/gserver/tests/img_conv_b.conf new file mode 100644 index 0000000000000000000000000000000000000000..8ca9c94541504d208b94f45bf71c8da440d18411 --- /dev/null +++ b/paddle/gserver/tests/img_conv_b.conf @@ -0,0 +1,32 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=10) +data = data_layer(name ="input", size=8*16*16) +proj1 = conv_projection(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=16, stride=1) +proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=16, stride=1) +concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation()) + +proj = conv_projection(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=16, stride=1) + +with mixed_layer(bias_attr=True, act=LinearActivation()) as conv: + conv += proj + +outputs(concat, conv) diff --git a/paddle/gserver/tests/pyDataProvider.py b/paddle/gserver/tests/pyDataProvider.py index c3155e7adea04d472cca2ed74ec31c22bc2d1586..91863b4175b1a58cb7d475732f293f32a3a6ed5a 100644 --- a/paddle/gserver/tests/pyDataProvider.py +++ b/paddle/gserver/tests/pyDataProvider.py @@ -16,72 +16,79 @@ import numpy import struct import traceback + def header_creator(): ret = "" - ret += struct.pack('i', 3) # slot num - ret += struct.pack('i', 1) # sequence flag - ret += struct.pack('i', 0) # slot0 dense type - ret += struct.pack('i', 3) # slot0 dim - ret += struct.pack('i', 1) # slot1 sparse non value type - ret += struct.pack('i', 7) # slot1 dim - ret += struct.pack('i', 3) # slot2 index type - ret += struct.pack('i', 2) # slot2 dim + ret += struct.pack('i', 3) # slot num + ret += struct.pack('i', 1) # sequence flag + ret += struct.pack('i', 0) # slot0 dense type + ret += struct.pack('i', 3) # slot0 dim + ret += struct.pack('i', 1) # slot1 sparse non value type + ret += struct.pack('i', 7) # slot1 dim + ret += struct.pack('i', 3) # slot2 index type + ret += struct.pack('i', 2) # slot2 dim return ret + def dense_value_creator(sample_num): ret = "" - ret += struct.pack('i', sample_num) # slot0 sample num - for i in range(sample_num): # slot0 value + ret += struct.pack('i', sample_num) # slot0 sample num + 
for i in range(sample_num): # slot0 value ret += struct.pack('f', 1.0) ret += struct.pack('f', 2.0) ret += struct.pack('f', 3.0) return ret + def sparse_value_creator(sample_num): ret = "" - ret += struct.pack('i', sample_num) # slot1 sample num - for i in range(sample_num): # slot1 index + ret += struct.pack('i', sample_num) # slot1 sample num + for i in range(sample_num): # slot1 index ret += struct.pack('i', i * 2) - ret += struct.pack('i', sample_num * 2) #slot1 length - for i in range(sample_num): # slot1 value + ret += struct.pack('i', sample_num * 2) #slot1 length + for i in range(sample_num): # slot1 value ret += struct.pack('i', 1) ret += struct.pack('i', 2) return ret + def index_value_creator(sample_num): ret = "" - ret += struct.pack('i', sample_num) # slot2 sample num - for i in range(sample_num): # slot2 value + ret += struct.pack('i', sample_num) # slot2 sample num + for i in range(sample_num): # slot2 value ret += struct.pack('i', 0) return ret + def sequenceStartPositions_creator(): ret = "" - ret += struct.pack('i', 2) # slot0 sequence num - ret += struct.pack('i', 0) # slot0 sequence value1 - ret += struct.pack('i', 1) # slot0 sequence value2 - ret += struct.pack('i', 1) # slot1 sequence num - ret += struct.pack('i', 0) # slot1 sequence value1 - ret += struct.pack('i', 2) # slot2 sequence num - ret += struct.pack('i', 0) # slot2 sequence value1 - ret += struct.pack('i', 1) # slot2 sequence value2 + ret += struct.pack('i', 2) # slot0 sequence num + ret += struct.pack('i', 0) # slot0 sequence value1 + ret += struct.pack('i', 1) # slot0 sequence value2 + ret += struct.pack('i', 1) # slot1 sequence num + ret += struct.pack('i', 0) # slot1 sequence value1 + ret += struct.pack('i', 2) # slot2 sequence num + ret += struct.pack('i', 0) # slot2 sequence value1 + ret += struct.pack('i', 1) # slot2 sequence value2 return ret + def subSequenceStartPositions_creator(): ret = "" - ret += struct.pack('i', 3) # slot0 subsequence num - ret += struct.pack('i', 0) # slot0 subsequence value1 - ret += struct.pack('i', 1) # slot0 subsequence value2 - ret += struct.pack('i', 2) # slot0 subsequence value3 - ret += struct.pack('i', 2) # slot1 subsequence num - ret += struct.pack('i', 0) # slot1 subsequence value1 - ret += struct.pack('i', 1) # slot1 subsequence value2 - ret += struct.pack('i', 3) # slot2 subsequence num - ret += struct.pack('i', 0) # slot2 subsequence value1 - ret += struct.pack('i', 1) # slot2 subsequence value2 - ret += struct.pack('i', 2) # slot2 subsequence value3 + ret += struct.pack('i', 3) # slot0 subsequence num + ret += struct.pack('i', 0) # slot0 subsequence value1 + ret += struct.pack('i', 1) # slot0 subsequence value2 + ret += struct.pack('i', 2) # slot0 subsequence value3 + ret += struct.pack('i', 2) # slot1 subsequence num + ret += struct.pack('i', 0) # slot1 subsequence value1 + ret += struct.pack('i', 1) # slot1 subsequence value2 + ret += struct.pack('i', 3) # slot2 subsequence num + ret += struct.pack('i', 0) # slot2 subsequence value1 + ret += struct.pack('i', 1) # slot2 subsequence value2 + ret += struct.pack('i', 2) # slot2 subsequence value3 return ret + class SimpleDataProvider: def __init__(self, *file_list): self.file_list = file_list @@ -93,17 +100,18 @@ class SimpleDataProvider: pass def getHeader(self): - return header_creator() + return header_creator() def getNextBatch(self, batch_size): ret = "" - ret += struct.pack('i', 2) # batch size - ret += dense_value_creator(2) # slot0 - ret += sparse_value_creator(2) # slot1 - ret += index_value_creator(2) 
# slot2 + ret += struct.pack('i', 2) # batch size + ret += dense_value_creator(2) # slot0 + ret += sparse_value_creator(2) # slot1 + ret += index_value_creator(2) # slot2 ret += sequenceStartPositions_creator() return ret + class SimpleNestDataProvider: def __init__(self, *file_list): self.file_list = file_list @@ -119,14 +127,15 @@ class SimpleNestDataProvider: def getNextBatch(self, batch_size): ret = "" - ret += struct.pack('i', 2) # batch size - ret += dense_value_creator(4) # slot0 - ret += sparse_value_creator(4) # slot1 - ret += index_value_creator(4) # slot2 + ret += struct.pack('i', 2) # batch size + ret += dense_value_creator(4) # slot0 + ret += sparse_value_creator(4) # slot1 + ret += index_value_creator(4) # slot2 ret += sequenceStartPositions_creator() ret += subSequenceStartPositions_creator() return ret + if __name__ == "__main__": # test code data_provider = SimpleDataProvider('./test_batch') diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py index 5c3b062309c51f3549f0dde1c6aed3be94619ef5..715ac08a42d05cec9c7f4b09a0447d44835d417d 100644 --- a/paddle/gserver/tests/rnn_data_provider.py +++ b/paddle/gserver/tests/rnn_data_provider.py @@ -14,23 +14,28 @@ from paddle.trainer.PyDataProvider2 import * +# Note that each config should has an independent provider +# in current design of PyDataProvider2. +####################################################### data = [ [[[1, 3, 2], [4, 5, 2]], 0], [[[0, 2], [2, 5], [0, 1, 2]], 1], ] -@provider(input_types=[integer_value_sub_sequence(10), - integer_value(3)], - should_shuffle=False) +# Used for sequence_nest_rnn.conf +@provider( + input_types=[integer_value_sub_sequence(10), integer_value(3)], + should_shuffle=False) def process_subseq(settings, file_name): for d in data: yield d -@provider(input_types=[integer_value_sequence(10), - integer_value(3)], - should_shuffle=False) +# Used for sequence_rnn.conf +@provider( + input_types=[integer_value_sequence(10), integer_value(3)], + should_shuffle=False) def process_seq(settings, file_name): for d in data: seq = [] @@ -38,28 +43,55 @@ def process_seq(settings, file_name): seq += subseq yield seq, d[1] + +# Used for sequence_nest_rnn_multi_input.conf +@provider( + input_types=[integer_value_sub_sequence(10), integer_value(3)], + should_shuffle=False) +def process_subseq2(settings, file_name): + for d in data: + yield d + + +# Used for sequence_rnn_multi_input.conf +@provider( + input_types=[integer_value_sequence(10), integer_value(3)], + should_shuffle=False) +def process_seq2(settings, file_name): + for d in data: + seq = [] + for subseq in d[0]: + seq += subseq + yield seq, d[1] + + +########################################################### data2 = [ - [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], - [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], + [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]], 0], + [[[0, 2], [2, 5], [0, 1, 2]], [[1, 5], [4], [2, 3, 6, 1]], 1], ] -@provider(input_types=[integer_value_sub_sequence(10), - integer_value_sub_sequence(10), - integer_value(2)], - should_shuffle=False) + +# Used for sequence_nest_rnn_multi_unequalength_inputs.conf +@provider( + input_types=[ + integer_value_sub_sequence(10), integer_value_sub_sequence(10), + integer_value(2) + ], + should_shuffle=False) def process_unequalength_subseq(settings, file_name): for d in data2: yield d -@provider(input_types=[integer_value_sequence(10), - integer_value_sequence(10), - integer_value(2)], - should_shuffle=False) +# Used for 
sequence_rnn_multi_unequalength_inputs.conf +@provider( + input_types=[ + integer_value_sequence(10), integer_value_sequence(10), integer_value(2) + ], + should_shuffle=False) def process_unequalength_seq(settings, file_name): for d in data2: - words1=reduce(lambda x,y: x+y, d[0]) - words2=reduce(lambda x,y: x+y, d[1]) + words1 = reduce(lambda x, y: x + y, d[0]) + words2 = reduce(lambda x, y: x + y, d[1]) yield words1, words2, d[2] - - diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py index b166e778d7a33f444b91d6b37c74352a72f4ac10..fab876fd30da0a80774d06028ae2321e12354d59 100644 --- a/paddle/gserver/tests/sequenceGen.py +++ b/paddle/gserver/tests/sequenceGen.py @@ -20,8 +20,9 @@ from paddle.trainer.PyDataProvider2 import * def hook(settings, dict_file, **kwargs): settings.word_dict = dict_file - settings.input_types = [integer_value_sequence(len(settings.word_dict)), - integer_value(3)] + settings.input_types = [ + integer_value_sequence(len(settings.word_dict)), integer_value(3) + ] settings.logger.info('dict len : %d' % (len(settings.word_dict))) @@ -32,16 +33,19 @@ def process(settings, file_name): label, comment = line.strip().split('\t') label = int(''.join(label.split())) words = comment.split() - word_slot = [settings.word_dict[w] for w in words if - w in settings.word_dict] + word_slot = [ + settings.word_dict[w] for w in words if w in settings.word_dict + ] yield word_slot, label ## for hierarchical sequence network def hook2(settings, dict_file, **kwargs): settings.word_dict = dict_file - settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), - integer_value_sequence(3)] + settings.input_types = [ + integer_value_sub_sequence(len(settings.word_dict)), + integer_value_sequence(3) + ] settings.logger.info('dict len : %d' % (len(settings.word_dict))) @@ -55,8 +59,10 @@ def process2(settings, file_name): label, comment = line.strip().split('\t') label = int(''.join(label.split())) words = comment.split() - word_slot = [settings.word_dict[w] for w in words if - w in settings.word_dict] + word_slot = [ + settings.word_dict[w] for w in words + if w in settings.word_dict + ] label_list.append(label) word_slot_list.append(word_slot) else: diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf index ac031b31280df297246c1ea2e279fc2c595bd8b7..087aa96ccb5a7fc2b6d4f5ce81de4e820580570a 100644 --- a/paddle/gserver/tests/sequence_layer_group.conf +++ b/paddle/gserver/tests/sequence_layer_group.conf @@ -21,15 +21,16 @@ dict_file = dict() for line_count, line in enumerate(open(dict_path, "r")): dict_file[line.strip()] = line_count -define_py_data_sources2(train_list='gserver/tests/Sequence/train.list', - test_list=None, - module='sequenceGen', - obj='process', - args={"dict_file":dict_file}) +define_py_data_sources2( + train_list='gserver/tests/Sequence/train.list', + test_list=None, + module='sequenceGen', + obj='process', + args={"dict_file": dict_file}) settings(batch_size=5) ######################## network configure ################################ -dict_dim = len(open(dict_path,'r').readlines()) +dict_dim = len(open(dict_path, 'r').readlines()) word_dim = 128 hidden_dim = 256 label_dim = 3 @@ -39,21 +40,24 @@ data = data_layer(name="word", size=dict_dim) emb = embedding_layer(input=data, size=word_dim) # (lstm_input + lstm) is equal to lstmemory -with mixed_layer(size=hidden_dim*4) as lstm_input: +with mixed_layer(size=hidden_dim * 4) as lstm_input: lstm_input += 
full_matrix_projection(input=emb) -lstm = lstmemory_group(input=lstm_input, - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) +lstm = lstmemory_group( + input=lstm_input, + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) lstm_last = last_seq(input=lstm) -with mixed_layer(size=label_dim, - act=SoftmaxActivation(), - bias_attr=True) as output: +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: output += full_matrix_projection(input=lstm_last) -outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf index 38c60b657b969f9fbcf46a00c542fa100da5a877..93a0f6da7905c0b00cf70296143ded2d4431e430 100644 --- a/paddle/gserver/tests/sequence_nest_layer_group.conf +++ b/paddle/gserver/tests/sequence_nest_layer_group.conf @@ -21,15 +21,16 @@ dict_file = dict() for line_count, line in enumerate(open(dict_path, "r")): dict_file[line.strip()] = line_count -define_py_data_sources2(train_list='gserver/tests/Sequence/train.list.nest', - test_list=None, - module='sequenceGen', - obj='process2', - args={"dict_file":dict_file}) +define_py_data_sources2( + train_list='gserver/tests/Sequence/train.list.nest', + test_list=None, + module='sequenceGen', + obj='process2', + args={"dict_file": dict_file}) settings(batch_size=2) ######################## network configure ################################ -dict_dim = len(open(dict_path,'r').readlines()) +dict_dim = len(open(dict_path, 'r').readlines()) word_dim = 128 hidden_dim = 256 label_dim = 3 @@ -38,37 +39,46 @@ data = data_layer(name="word", size=dict_dim) emb_group = embedding_layer(input=data, size=word_dim) + # (lstm_input + lstm) is equal to lstmemory def lstm_group(lstm_group_input): - with mixed_layer(size=hidden_dim*4) as group_input: - group_input += full_matrix_projection(input=lstm_group_input) + with mixed_layer(size=hidden_dim * 4) as group_input: + group_input += full_matrix_projection(input=lstm_group_input) - lstm_output = lstmemory_group(input=group_input, - name="lstm_group", - size=hidden_dim, - act=TanhActivation(), - gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + lstm_output = lstmemory_group( + input=group_input, + name="lstm_group", + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) return lstm_output -lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group), - step=lstm_group, - name="lstm_nest_group") + +lstm_nest_group = recurrent_group( + input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group") # hasSubseq ->(seqlastins) seq -lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE) +lstm_last = last_seq( + input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE) # seq ->(expand) hasSubseq -lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE) +lstm_expand = expand_layer( + input=lstm_last, + expand_as=emb_group, + 
expand_level=ExpandLevel.FROM_SEQUENCE) # hasSubseq ->(average) seq -lstm_average = pooling_layer(input=lstm_expand, - pooling_type=AvgPooling(), - agg_level=AggregateLevel.EACH_SEQUENCE) +lstm_average = pooling_layer( + input=lstm_expand, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.EACH_SEQUENCE) -with mixed_layer(size=label_dim, - act=SoftmaxActivation(), - bias_attr=True) as output: +with mixed_layer( + size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output: output += full_matrix_projection(input=lstm_average) -outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) +outputs( + classification_cost( + input=output, label=data_layer( + name="label", size=1))) diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf index e8222cef525a806a6201b7290f75138c94bd0aaf..0614958b4719ddb2098dc495c4a6c615f2628457 100644 --- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf +++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf @@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', test_list=None, module='rnn_data_provider', - obj='process_subseq') + obj='process_subseq2') settings(batch_size=2, learning_rate=0.01) diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf index 968621cab59be9296ae5ee962a3a359fff59e022..51881e21d971bbebeceeab1a7c4954e50e3a5e60 100644 --- a/paddle/gserver/tests/sequence_rnn_multi_input.conf +++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf @@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', test_list=None, module='rnn_data_provider', - obj='process_seq') + obj='process_seq2') settings(batch_size=2, learning_rate=0.01) diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e54c5109e71de1a41ec2bda2af4a19745acbbc83 --- /dev/null +++ b/paddle/gserver/tests/test_ActivationGrad.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "paddle/gserver/layers/DataLayer.h"
+#include "ModelConfig.pb.h"
+#include "paddle/trainer/Trainer.h"
+
+#include "TestUtil.h"
+#include "LayerGradUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+P_DECLARE_bool(use_gpu);
+P_DECLARE_bool(thread_local_rand_use_global_seed);
+
+void testActivation(const string& act) {
+  LOG(INFO) << "test activation: " << act;
+  size_t size = 10;
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("addto");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type(act);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  act + "_activation",
+                  100,
+                  /* trans= */ false,
+                  useGpu,
+                  /* useWeight */ true);
+  }
+}
+
+TEST(Activation, activation) {
+  auto types = ActivationFunction::getAllRegisteredTypes();
+  std::set<string> excluded{"sequence_softmax"};
+  for (auto type : types) {
+    if (excluded.count(type)) continue;
+    testActivation(type);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3efdfb428d14435fbfced6cfef3b7dadd8ff5a9
--- /dev/null
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -0,0 +1,246 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. 
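
testLayerGrad, used by the activation test above and throughout these files, is in essence a finite-difference gradient checker: it perturbs the layer's inputs (and, with useWeight, its parameters), reruns the forward pass, and compares the numeric slope against the analytic gradient produced by backward. A rough sketch of the idea for a scalar activation; check_activation_grad is illustrative, not part of the harness:

    import math

    def check_activation_grad(f, f_grad, x, h=1e-4, tol=1e-3):
        # central difference: (f(x+h) - f(x-h)) / 2h should match f_grad(x)
        numeric = (f(x + h) - f(x - h)) / (2.0 * h)
        return abs(numeric - f_grad(x)) < tol

    assert check_activation_grad(math.tanh,
                                 lambda x: 1.0 - math.tanh(x) ** 2,
                                 0.3)
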
*/ + +#include +#include +#include +#include "paddle/gserver/layers/DataLayer.h" +#include "ModelConfig.pb.h" +#include "paddle/trainer/Trainer.h" +#include "paddle/utils/GlobalConstants.h" +#include "paddle/gserver/layers/ExpandConvTransLayer.h" +#include "paddle/math/MathUtils.h" + +#include "TestUtil.h" +#include "LayerGradUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +P_DECLARE_bool(use_gpu); +P_DECLARE_int32(gpu_id); +P_DECLARE_double(checkgrad_eps); +P_DECLARE_bool(thread_local_rand_use_global_seed); +P_DECLARE_bool(prev_batch_state); + +// Test that the convTrans forward is the same as conv backward +TEST(Layer, convTransLayerFwd) { + // Setting up conv-trans layer + TestConfig configt; + configt.biasSize = 3; + configt.layerConfig.set_type("exconvt"); + configt.layerConfig.set_num_filters(3); + configt.layerConfig.set_partial_sum(1); + configt.layerConfig.set_shared_biases(true); + + configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(3 / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + configt.layerConfig.set_size(conv->img_size() * conv->img_size() * + configt.layerConfig.num_filters()); + configt.layerConfig.set_name("convTrans"); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false); + // test layer initialize + std::vector parameters; + LayerPtr convtLayer; + initTestLayer(configt, &layerMap, ¶meters, &convtLayer); + convtLayer->getBiasParameter()->zeroMem(); + convtLayer->forward(PASS_GC); + + // Setting up conv-layer config + TestConfig config; + config.biasSize = 16; + config.layerConfig.set_type("exconv"); + config.layerConfig.set_num_filters(16); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384}); + input = config.layerConfig.add_inputs(); + conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(3); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_x() * + config.layerConfig.num_filters()); + config.layerConfig.set_name("conv"); + + // data layer initialize + std::vector dataLayers2; + LayerMap layerMap2; + vector datas2; + initDataLayer( + config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false); + // test layer initialize + std::vector parameters2; + LayerPtr convLayer; + initTestLayer(config, &layerMap2, ¶meters2, &convLayer); + + // Sync convLayer and convtLayer parameter + convLayer->getBiasParameter()->zeroMem(); + convLayer->getParameters()[0] + ->getBuf(PARAMETER_VALUE) + 
->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE))); + + // Set convLayer outputGrad as convTransLayer input value + convLayer->forward(PASS_GC); + convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue())); + + vector callbackFlags(parameters2.size(), 0); + auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; }; + convLayer->backward(callback); + + // Check that the convLayer backward is the same as convTransLayer forward + checkMatrixEqual(convtLayer->getOutputValue(), + dataLayers2[0]->getOutputGrad()); +} + +// Do one forward pass of convTrans layer and check to see if its output +// matches the given result +void doOneConvtTest(size_t imgSize, + size_t output_x, + size_t stride, + size_t padding, + size_t filter_size, + MatrixPtr& result) { + TestConfig configt; + configt.biasSize = 1; + configt.layerConfig.set_type("exconvt"); + configt.layerConfig.set_num_filters(1); + configt.layerConfig.set_partial_sum(1); + configt.layerConfig.set_shared_biases(true); + + configt.inputDefs.push_back( + {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size}); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(filter_size); + conv->set_filter_size_y(filter_size); + conv->set_channels(1); + conv->set_padding(padding); + conv->set_padding_y(padding); + conv->set_stride(stride); + conv->set_stride_y(stride); + conv->set_groups(1); + conv->set_filter_channels(1); + conv->set_img_size(imgSize); + conv->set_output_x(output_x); + + configt.layerConfig.set_size(conv->img_size() * conv->img_size() * + configt.layerConfig.num_filters()); + configt.layerConfig.set_name("convTrans"); + + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false); + dataLayers[0]->getOutputValue()->zeroMem(); + dataLayers[0]->getOutputValue()->add(1.0); + + // test layer initialize + std::vector parameters; + LayerPtr convtLayer; + initTestLayer(configt, &layerMap, ¶meters, &convtLayer); + convtLayer->getBiasParameter()->zeroMem(); + convtLayer->getParameters()[0]->zeroMem(); + convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0); + convtLayer->forward(PASS_GC); + + checkMatrixEqual(convtLayer->getOutputValue(), result); +} + +TEST(Layer, convTransLayerFwd2) { + MatrixPtr result; + result = Matrix::create(1, 5 * 5, false, false); + result->zeroMem(); + result->add(1.0); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 1, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 5, + result); + + float resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + result->setData(resultData); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 4, + result); + + float resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + result->setData(resultData2); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 2, + /* padding */ 1, + /* filter_size */ 5, + result); + + float resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4, + 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1}; + result->setData(resultData3); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 2, + /* padding */ 0, + /* filter_size */ 3, + result); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + 
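
The expected matrices fed to doOneConvtTest above can be derived by hand: with an all-ones input and an all-ones filter, each pixel of the convTrans output simply counts how many filter windows cover it. A small numpy sketch that reproduces the hard-coded arrays; conv_trans_ones is an illustrative helper, not part of the test:

    import numpy as np

    def conv_trans_ones(img_size, output_x, stride, padding, filter_size):
        # Each of the output_x * output_x input pixels stamps an all-ones
        # filter window onto a padded canvas; an output pixel's value is the
        # number of windows covering it. Crop the padding off at the end.
        canvas = np.zeros((img_size + 2 * padding, img_size + 2 * padding))
        for i in range(output_x):
            for j in range(output_x):
                r, c = i * stride, j * stride
                canvas[r:r + filter_size, c:c + filter_size] += 1.0
        return canvas[padding:padding + img_size, padding:padding + img_size]

    # reproduces resultData2 above: corners 1, edges 2, centre 4
    print(conv_trans_ones(5, 2, 2, 1, 5))
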
FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp index 3a591a316b8bafccac9c59ff28e57b4e27f8377a..be639ea09380d02ed8251874bf690fc3596bddf2 100644 --- a/paddle/gserver/tests/test_Evaluator.cpp +++ b/paddle/gserver/tests/test_Evaluator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "ModelConfig.pb.h" @@ -48,8 +47,10 @@ struct TestConfig { TestConfig() : testAccumulate(true) {} }; -void testEvaluator(TestConfig testConf, string testEvaluatorName, - size_t batchSize, bool useGpu) { +void testEvaluator(TestConfig testConf, + string testEvaluatorName, + size_t batchSize, + bool useGpu) { #ifdef PADDLE_ONLY_CPU if (useGpu) return; #endif @@ -79,8 +80,10 @@ void testEvaluator(TestConfig testConf, string testEvaluatorName, data.ids->rand(dim); // now rand number can be 0 to inputDefs[i].dim. break; case INPUT_SPARSE_NON_VALUE_DATA: - data.value = makeRandomSparseMatrix(batchSize, dim, - /* withValue= */ false, useGpu); + data.value = makeRandomSparseMatrix(batchSize, + dim, + /* withValue= */ false, + useGpu); break; default: LOG(FATAL) << " unknown inputType "; @@ -116,8 +119,9 @@ void testEvaluator(TestConfig testConf, string testEvaluatorName, } } -void testEvaluatorAll(TestConfig testConf, string testEvaluatorName, - size_t batchSize) { +void testEvaluatorAll(TestConfig testConf, + string testEvaluatorName, + size_t batchSize) { testEvaluator(testConf, testEvaluatorName, batchSize, true); testEvaluator(testConf, testEvaluatorName, batchSize, false); } @@ -142,8 +146,8 @@ TEST(Evaluator, classification_error) { config.evaluatorConfig.set_classification_threshold(0.4); config.inputDefs.push_back({INPUT_DATA, "weight", 1}); // Not support GPU - testEvaluator(config, "classification_error_weight_multi_binary_label", 50, - false); + testEvaluator( + config, "classification_error_weight_multi_binary_label", 50, false); } TEST(Evaluator, sum) { @@ -211,8 +215,8 @@ TEST(Evaluator, precision_recall) { config.evaluatorConfig.set_classification_threshold(0.4); config.inputDefs.push_back({INPUT_DATA, "weight", 1}); // Not support GPU - testEvaluator(config, "precision_recall_weight_multi_binary_label", 100, - false); + testEvaluator( + config, "precision_recall_weight_multi_binary_label", 100, false); } TEST(Evaluator, ctc_error_evaluator) { diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index c5723f8574ab3d7a15bfe7c8db8a9d03951f08b1..374ae57dd3681f891cf3f5b698085f0b8fbc6cd7 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -13,14 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include #include -#include "paddle/gserver/layers/DataLayer.h" +#include #include "ModelConfig.pb.h" +#include "paddle/gserver/layers/DataLayer.h" #include "paddle/trainer/Trainer.h" +#include "paddle/math/MathUtils.h" -#include "TestUtil.h" #include "LayerGradUtil.h" +#include "TestUtil.h" using namespace paddle; // NOLINT using namespace std; // NOLINT @@ -68,8 +69,10 @@ TEST(Projection, context) { std::max(0, conf.context_start() + conf.context_length() - 1); for (auto useGpu : {false, true}) { testProjectionGrad( - conf, INPUT_SEQUENCE_DATA, - trainablePadding ? conf.input_size() * pad : 0, batchSize, + conf, + INPUT_SEQUENCE_DATA, + trainablePadding ? conf.input_size() * pad : 0, + batchSize, useGpu, contextStart + contextLength <= 1); // = testState } @@ -85,8 +88,11 @@ TEST(Projection, trans_fc) { conf.set_input_size(50); conf.set_output_size(20); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 1000, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 1000, + /* batchSize */ 100, + useGpu); } } @@ -96,8 +102,11 @@ TEST(Projection, fc) { conf.set_input_size(10); conf.set_output_size(20); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 200, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 200, + /* batchSize */ 100, + useGpu); } } @@ -107,8 +116,11 @@ TEST(Projection, dot_mul) { conf.set_input_size(20); conf.set_output_size(20); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 20, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 20, + /* batchSize */ 100, + useGpu); } } @@ -118,8 +130,11 @@ TEST(Projection, table) { conf.set_input_size(10); conf.set_output_size(20); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_LABEL, /* parameterSize */ 200, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_LABEL, + /* parameterSize */ 200, + /* batchSize */ 100, + useGpu); } } @@ -129,8 +144,95 @@ TEST(Projection, identity) { conf.set_input_size(10); conf.set_output_size(10); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 0, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 0, + /* batchSize */ 100, + useGpu); + } +} + +TEST(Projection, scaling) { + ProjectionConfig conf; + conf.set_type("scaling"); + conf.set_input_size(10); + conf.set_output_size(10); + for (auto useGpu : {false}) { + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 1, + /* batchSize */ 100, + useGpu); + } +} + +#ifndef PADDLE_ONLY_CPU +TEST(Projection, conv) { + const int NUM_FILTERS = 16; + const int FILTER_SIZE = 2; + const int FILTER_SIZE_Y = 3; + const int CHANNELS = 3; + const int IMAGE_SIZE = 16; + + ProjectionConfig conf; + conf.set_type("conv"); + conf.set_num_filters(NUM_FILTERS); + + ConvConfig* conv = conf.mutable_conv_conf(); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_channels(CHANNELS); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(IMAGE_SIZE); + int output_x = outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true); + int 
output_y = outputSize(conv->img_size(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true); + conv->set_output_x(output_x); + conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); + conf.set_output_size(output_x * output_y * NUM_FILTERS); + + testProjectionGrad( + conf, + INPUT_DATA, + /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y, + /* batchSize */ 100, + true, + false, + NUM_FILTERS, + true); +} +#endif + +TEST(Layer, BilinearInterpLayer) { + TestConfig config; + config.layerConfig.set_type("bilinear_interp"); + config.biasSize = 0; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); + + LayerInputConfig* input = config.layerConfig.add_inputs(); + BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf(); + bilinear->set_img_size_x(32); + bilinear->set_img_size_y(32); + bilinear->set_num_channels(4); + + for (auto useGpu : {false, true}) { + for (auto outSize : {32, 64}) { + bilinear->set_out_size_x(outSize); + bilinear->set_out_size_y(outSize); + testLayerGrad(config, "bilinear_interp", 10, false, useGpu); + } } } @@ -180,8 +282,13 @@ TEST(Layer, CRFLayer) { config.layerConfig.add_inputs(); // Not support GPU now - testLayerGrad(config, "crf", 100, /* trans */ false, /* useGpu */ false, - false /*useWeight*/, 0.03 /*epsilon*/); + testLayerGrad(config, + "crf", + 100, + /* trans */ false, + /* useGpu */ false, + false /*useWeight*/, + 0.03 /*epsilon*/); } TEST(Layer, CTCLayer) { @@ -254,14 +361,17 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { conv->set_groups(1); conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(16); - conv->set_output_x( - (2 * conv->padding() + conv->img_size() - conv->filter_size()) / - ((float)conv->stride()) + - 1.5); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); config.layerConfig.set_size(conv->output_x() * conv->output_x() * config.layerConfig.num_filters()); testLayerGrad(config, "conv", 100, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02); } TEST(Layer, convLayer) { @@ -272,6 +382,47 @@ TEST(Layer, convLayer) { #endif } +void testConvTransLayer(const string& type, bool trans, bool useGpu) { + TestConfig config; + config.biasSize = 3; + config.layerConfig.set_type(type); + config.layerConfig.set_num_filters(3); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 288}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(3); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(3 / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + + config.layerConfig.set_size(conv->img_size() * conv->img_size() * + config.layerConfig.num_filters()); + + testLayerGrad(config, "convTrans", 100, trans, useGpu); + // Use small batch_size and useWeight=true to test biasGrad + testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02); +} + +TEST(Layer, convTransLayer) { + for (auto useGpu : {false, 
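
These hunks replace hand-written size arithmetic with outputSize() from paddle/math/MathUtils.h. As far as the call sites in this diff show, the two conventions are floor-style division when caffeMode is true (conv and conv-trans) and ceil-style otherwise (pooling and block-expand below). A sketch of that arithmetic, under those assumptions:

    def output_size(image_size, filter_size, padding, stride, caffe_mode):
        if caffe_mode:
            # floor division, e.g. conv above: (16 - 2 + 0) // 2 + 1 == 8
            return (image_size - filter_size + 2 * padding) // stride + 1
        # ceil division, matching the old inline pooling formula
        return (image_size - filter_size + 2 * padding + stride - 1) // stride + 1

    assert output_size(16, 2, 0, 2, True) == 8
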
true}) { + testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu); + } +} + TEST(Layer, blockExpandLayer) { TestConfig config; config.biasSize = 0; @@ -289,16 +440,16 @@ TEST(Layer, blockExpandLayer) { blockExpand->set_block_y(32); blockExpand->set_stride_x(2); blockExpand->set_stride_y(2); - blockExpand->set_output_x( - 1 + - (2 * blockExpand->padding_x() + blockExpand->img_size_x() - - blockExpand->block_x() + blockExpand->stride_x() - 1) / - blockExpand->stride_x()); - blockExpand->set_output_y( - 1 + - (2 * blockExpand->padding_y() + blockExpand->img_size_y() - - blockExpand->block_y() + blockExpand->stride_y() - 1) / - blockExpand->stride_y()); + blockExpand->set_output_x(outputSize(blockExpand->img_size_x(), + blockExpand->block_x(), + blockExpand->padding_x(), + blockExpand->stride_x(), + /* caffeMode */ false)); + blockExpand->set_output_y(outputSize(blockExpand->img_size_y(), + blockExpand->block_y(), + blockExpand->padding_y(), + blockExpand->stride_y(), + /* caffeMode */ false)); config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() * blockExpand->channels()); @@ -307,6 +458,24 @@ TEST(Layer, blockExpandLayer) { } } +TEST(Layer, maxoutLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("maxout"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + MaxOutConfig* maxout = input->mutable_maxout_conf(); + + maxout->set_img_size_x(32); + maxout->set_img_size_y(32); + maxout->set_channels(4); + maxout->set_groups(2); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "maxout", 10, false, useGpu); + } +} void testFcLayer(string format, size_t nnz) { TestConfig config; config.biasSize = 4096; @@ -323,7 +492,11 @@ void testFcLayer(string format, size_t nnz) { << config.inputDefs[0].sparse.format; for (auto useGpu : {false, true}) { - testLayerGrad(config, "fc", 100, /* trans */ false, useGpu, + testLayerGrad(config, + "fc", + 100, + /* trans */ false, + useGpu, /* weight */ true); } } @@ -351,11 +524,19 @@ TEST(Layer, SelectiveFullyConnectedLayer) { {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)}); config.layerConfig.add_inputs(); - testLayerGrad(config, "selective_fc", 100, - /* trans= */ false, /* useGup= */ false, false); + testLayerGrad(config, + "selective_fc", + 100, + /* trans= */ false, + /* useGup= */ false, + false); #ifndef PADDLE_ONLY_CPU - testLayerGrad(config, "selective_fc", 100, - /* trans= */ false, /* useGup= */ true, false); + testLayerGrad(config, + "selective_fc", + 100, + /* trans= */ false, + /* useGup= */ true, + false); #endif } @@ -372,7 +553,10 @@ TEST(Layer, DataNormLayer) { for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) { config.layerConfig.set_data_norm_strategy(strategy); // The parameters are static, so not support GPU now - testLayerGrad(config, "data_norm", 200, /* trans */ false, + testLayerGrad(config, + "data_norm", + 200, + /* trans */ false, /* useGpu */ false); } } @@ -404,12 +588,12 @@ TEST(Layer, multi_cross) { config.layerConfig.add_inputs(); for (auto useGpu : {false, true}) { - testLayerGrad(config, "multi-class-cross-entropy", 100, /* trans */ false, - useGpu); + testLayerGrad( + config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu); } } -TEST(Layer, multi_binary_label) { +TEST(Layer, multi_binary_label_sparse_mat) { TestConfig config; config.layerConfig.set_type("multi_binary_label_cross_entropy"); config.biasSize = 0; @@ -419,9 
+603,32 @@ TEST(Layer, multi_binary_label) { config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); - // Not support GPU now - testLayerGrad(config, "multi_binary_label_cross_entropy", 100, - /* trans */ false, /* useGpu */ false); + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "multi_binary_label_cross_entropy", + 100, + /* trans */ false, + useGpu); + } +} + +TEST(layer, multi_binary_label_id) { + TestConfig config; + config.layerConfig.set_type("multi_binary_label_cross_entropy"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0}); + config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "multi_binary_label_cross_entropy", + 100, + /* trans */ false, + useGpu); + } } TEST(Layer, multi_cross_with_selfnorm) { @@ -436,7 +643,9 @@ TEST(Layer, multi_cross_with_selfnorm) { config.layerConfig.add_inputs(); // Not support GPU now - testLayerGrad(config, "multi_class_cross_entropy_with_selfnorm", 100, + testLayerGrad(config, + "multi_class_cross_entropy_with_selfnorm", + 100, /* trans */ false, /* useGpu */ false); } @@ -452,8 +661,11 @@ TEST(Layer, multi_cross_soft) { config.layerConfig.add_inputs(); for (auto useGpu : {false, true}) { - testLayerGrad(config, "soft_binary_class_cross_entropy", 100, - /* trans */ false, useGpu); + testLayerGrad(config, + "soft_binary_class_cross_entropy", + 100, + /* trans */ false, + useGpu); } } @@ -483,7 +695,10 @@ TEST(Layer, sparse_square_error) { config.layerConfig.add_inputs(); // "GpuSparseMatrix" as label is not supported - testLayerGrad(config, "square_error", 100, /* trans */ false, + testLayerGrad(config, + "square_error", + 100, + /* trans */ false, /* useGpu */ false); } @@ -498,7 +713,10 @@ TEST(Layer, sparse_float_square_error) { config.layerConfig.add_inputs(); // "GpuSparseMatrix" as label is not supported - testLayerGrad(config, "square_error", 100, /* trans */ false, + testLayerGrad(config, + "square_error", + 100, + /* trans */ false, /* useGpu */ false); } @@ -541,10 +759,14 @@ void testExpandLayer(string trans_type, bool hasSubseq) { config.inputDefs.push_back( {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA, - "layer_0", 10, 0}); + "layer_0", + 10, + 0}); config.inputDefs.push_back( - {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, "layer_1", - 10, 0}); + {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_1", + 10, + 0}); config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); config.layerConfig.set_trans_type(trans_type); @@ -568,8 +790,10 @@ void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) { config.biasSize = 0; config.inputDefs.push_back( - {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, "layer_0", - 10, 0}); + {hasSubseq ? 
INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_0", + 10, + 0}); config.layerConfig.add_inputs(); config.layerConfig.set_trans_type(trans_type); @@ -599,9 +823,11 @@ TEST(Layer, MaxLayer) { } TEST(Layer, SequenceLastInstanceLayer) { - testDegradeLayer(false, "seqlastins", + testDegradeLayer(false, + "seqlastins", "non-seq"); // seq seqlastins to non-seq - testDegradeLayer(true, "seqlastins", + testDegradeLayer(true, + "seqlastins", "non-seq"); // hasSubseq seqlastins to non-seq testDegradeLayer(true, "seqlastins", "seq"); // hasSubseq seqlastins to seq } @@ -786,7 +1012,8 @@ TEST(Layer, NormLayer) { } #endif -void setPoolConfig(TestConfig* config, PoolConfig* pool, +void setPoolConfig(TestConfig* config, + PoolConfig* pool, const string& poolType) { (*config).biasSize = 0; (*config).layerConfig.set_type("pool"); @@ -805,8 +1032,8 @@ void setPoolConfig(TestConfig* config, PoolConfig* pool, pool->set_stride(sw); pool->set_stride_y(sh); - int ow = (pool->img_size() - kw + 2 * pw + sw - 1) / sw + 1; - int oh = (pool->img_size_y() - kh + 2 * ph + sh - 1) / sh + 1; + int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); + int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); pool->set_output_x(ow); pool->set_output_y(oh); } @@ -862,6 +1089,34 @@ TEST(Layer, PoolLayer) { #endif } +void testSppLayer(const string& poolType, + const int pyramidHeight, + bool trans, + bool useGpu) { + TestConfig config; + config.layerConfig.set_type("spp"); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + SppConfig* sppConfig = input->mutable_spp_conf(); + sppConfig->set_pool_type(poolType); + sppConfig->set_pyramid_height(pyramidHeight); + sppConfig->set_channels(16); + sppConfig->set_img_size(10); + sppConfig->set_img_size_y(20); + int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1); + config.layerConfig.set_size(outputSize * sppConfig->channels()); + testLayerGrad(config, "spp", 100, trans, useGpu); +} + +TEST(Layer, SpatialPyramidPoolLayer) { + for (auto useGpu : {false, true}) { + for (auto pyramidHeight : {1, 2, 3}) { + testSppLayer("avg-projection", pyramidHeight, false, useGpu); + testSppLayer("max-projection", pyramidHeight, false, useGpu); + } + } +} + TEST(Layer, rankCostLayer) { TestConfig config; config.layerConfig.set_type("rank-cost"); @@ -879,6 +1134,19 @@ TEST(Layer, rankCostLayer) { } } +TEST(Layer, sumCostLayer) { + TestConfig config; + config.layerConfig.set_type("sum_cost"); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "sum_cost", 100, false, useGpu); + } +} + TEST(Layer, weightedRankCostLayer) { TestConfig config; config.layerConfig.set_type("rank-cost"); @@ -939,7 +1207,7 @@ TEST(Layer, LstmLayer) { TestConfig config; config.layerConfig.set_type("lstmemory"); config.layerConfig.set_size(4); - config.layerConfig.set_active_type("sigmoid"); + config.layerConfig.set_active_type("tanh"); config.layerConfig.set_active_state_type("sigmoid"); config.layerConfig.set_active_gate_type("sigmoid"); config.biasSize = 28; @@ -1046,7 +1314,8 @@ TEST(Layer, NCELayer) { for (auto isIdLabel : {false, true}) { config.inputDefs[1] = { - isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA, "label", + isIdLabel ? 
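
The spp size expression in testSppLayer follows from the pyramid structure: level l pools the image into 2^l x 2^l bins, so a pyramid of height h yields 1 + 4 + ... + 4^(h-1) = (4^h - 1)/3 bins per channel, which is exactly the (std::pow(4, h) - 1) / (4 - 1) computation above. Worked out in a few lines:

    def spp_output_size(pyramid_height, channels):
        # level l contributes (2 ** l) ** 2 == 4 ** l bins per channel
        bins = sum(4 ** l for l in range(pyramid_height))
        assert bins == (4 ** pyramid_height - 1) // 3
        return bins * channels

    # pyramid_height=3, channels=16 as in testSppLayer: 21 bins * 16 maps
    assert spp_output_size(3, 16) == 336
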
INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA, + "label", /* dim= */ numClasses, /* paraSize= */ 0}; @@ -1068,7 +1337,10 @@ TEST(Layer, NCELayer) { << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight << " withDist=" << withDist; // Not support GPU now - testLayerGrad(config, "nce", 100, /* trans= */ false, + testLayerGrad(config, + "nce", + 100, + /* trans= */ false, /* useGpu */ false); } } @@ -1146,7 +1418,8 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { config.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE); config.layerConfig.set_active_type("sigmoid"); config.biasSize = CHANNELS; - config.inputDefs.push_back({INPUT_DATA, "layer_0", + config.inputDefs.push_back({INPUT_DATA, + "layer_0", /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS, /* paraSize= */ CHANNELS}); @@ -1163,7 +1436,11 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { img_conf->set_channels(CHANNELS); img_conf->set_img_size(IMG_SIZE); - testLayerGrad(config, "batch_norm", 64, /* trans= */ trans, useGpu, + testLayerGrad(config, + "batch_norm", + 64, + /* trans= */ trans, + useGpu, /* useWeight */ true); } @@ -1198,12 +1475,13 @@ TEST(Operator, conv) { conv->set_groups(1); conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(IMAGE_SIZE); - int outputSize = - int(1.0 * (2 * conv->padding() + conv->img_size() - conv->filter_size()) / - conv->stride()) + - 1; - conv->set_output_x(outputSize); - config.layerConfig.set_size(outputSize * outputSize * + int output_x = outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true); + conv->set_output_x(output_x); + config.layerConfig.set_size(output_x * output_x * config.layerConfig.num_filters()); config.layerConfig.set_size(conv->output_x() * conv->output_x() * NUM_FILTERS); @@ -1211,8 +1489,10 @@ TEST(Operator, conv) { config.inputDefs.push_back( {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE * CHANNELS, 0}); config.inputDefs.push_back( - {INPUT_DATA, "layer_1", - FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS, 0}); + {INPUT_DATA, + "layer_1", + FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS, + 0}); config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); @@ -1226,12 +1506,17 @@ TEST(Layer, FeatureMapExpandLayer) { const int INPUT_SIZE = 100; config.layerConfig.set_size(INPUT_SIZE * CHANNELS); config.layerConfig.set_num_filters(CHANNELS); - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", - /* dim= */ INPUT_SIZE, /* paraSize= */ 0}); + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, + "layer_0", + /* dim= */ INPUT_SIZE, + /* paraSize= */ 0}); config.layerConfig.add_inputs(); for (auto useGpu : {false, true}) { - testLayerGrad(config, "featmap_expand", - /*batch_size*/ 100, /* trans= */ false, useGpu, + testLayerGrad(config, + "featmap_expand", + /*batch_size*/ 100, + /* trans= */ false, + useGpu, /* useWeight */ true); } } diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp index f45e40c8b6acb5ff7d3e16f9a6f9a5acba13e84e..913d6ed7511a0c3c7c0b40e1fbdb48a17b51b1b2 100644 --- a/paddle/gserver/tests/test_LinearChainCRF.cpp +++ b/paddle/gserver/tests/test_LinearChainCRF.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include #include #include "paddle/gserver/layers/LinearChainCRF.h" diff --git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp index 73b4d0b8b7110d4ab79809875e2481cd2b565a68..3fc099adbdb6cb562c4bfc419b777ef534bdfed7 100644 --- a/paddle/gserver/tests/test_MultinomialSampler.cpp +++ b/paddle/gserver/tests/test_MultinomialSampler.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include @@ -43,7 +42,7 @@ TEST(MultinomialSampler, gen) { int size = 1024 * 4; default_random_engine reng; - for (size_t iter=0; iter < 256; ++iter) { + for (size_t iter = 0; iter < 256; ++iter) { uniform_int_distribution rand(1, numGrids / size * 1.8); vector prob; int sum = 0; @@ -138,7 +137,6 @@ void benchmarkRandom() { LOG(INFO) << "sum1=" << sum1; } - int main(int argc, char** argv) { initMain(argc, argv); testing::InitGoogleTest(&argc, argv); diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp index b3ef53067301b4f7f50ba799a035a80fa1c39e65..1810bc31fc2ce00ed6d8fd588c0dfa9ce398cb45 100644 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ b/paddle/gserver/tests/test_NetworkCompare.cpp @@ -41,7 +41,8 @@ struct DataOut { std::vector paraGrads; }; -void initArgument(DataIn& data, const std::string& configPath, +void initArgument(DataIn& data, + const std::string& configPath, bool useGpu = FLAGS_use_gpu) { TrainerConfigHelper config(configPath); size_t batchSize = config.getOptConfig().batch_size(); @@ -122,9 +123,10 @@ void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) { } gradientMachine->backward(); for (size_t i = 0; i < in.outGrads.size(); i++) { - MatrixPtr value = - Matrix::create(outArgs[i].value->getHeight(), - outArgs[i].value->getWidth(), false, false); + MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(), + outArgs[i].value->getWidth(), + false, + false); value->copyFrom(*outArgs[i].value); out.outValues.push_back(value); } @@ -147,8 +149,12 @@ void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) { gradientMachine->finish(); } -void checkBuffer(real* A, const char* desA, real* B, const char* desB, - size_t len, size_t width = 1) { +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + size_t width = 1) { int nNum = 0; for (size_t i = 0; i < len; ++i) { real diff = fabs(A[i] - B[i]); @@ -168,8 +174,10 @@ void compareGradient(DataOut& outA, DataOut& outB) { << "------------------------------"; for (size_t i = 0; i < outA.outValues.size(); ++i) { LOG(INFO) << "OUTPUT VALUE: " << i; - checkBuffer(outA.outValues[i]->getData(), "network A output", - outB.outValues[i]->getData(), "network B output", + checkBuffer(outA.outValues[i]->getData(), + "network A output", + outB.outValues[i]->getData(), + "network B output", outA.outValues[i]->getElementCnt(), outA.outValues[i]->getWidth()); } @@ -180,8 +188,10 @@ void compareGradient(DataOut& outA, DataOut& outB) { << "------------------------------"; for (size_t i = 0; i < outA.paraGrads.size(); ++i) { LOG(INFO) << "PARAMETER GRADIENT: " << i; - checkBuffer(outA.paraGrads[i]->getData(), "Network A", - outB.paraGrads[i]->getData(), "Network B", + checkBuffer(outA.paraGrads[i]->getData(), + "Network A", + outB.paraGrads[i]->getData(), + "Network B", outA.paraGrads[i]->getSize()); } } @@ -236,8 +246,16 @@ 
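
checkBuffer in test_NetworkCompare walks two buffers element-wise and reports how many entries differ beyond a tolerance; that is how the two-config comparisons below decide that outputs and parameter gradients agree. A rough numpy analogue of that check, with illustrative names rather than the test's API:

    import numpy as np

    def check_buffer(a, des_a, b, des_b, atol=1e-5):
        # count element-wise mismatches beyond the tolerance
        bad = int((np.abs(a - b) > atol).sum())
        print('%s vs %s: %d / %d mismatches' % (des_a, des_b, bad, a.size))
        return bad == 0

    assert check_buffer(np.ones(4), 'network A', np.ones(4), 'network B')
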
TEST(Compare, img_pool) { compareNetwork(config_file_a, config_file_b); FLAGS_use_gpu = useGpu; } -#endif +TEST(Compare, img_conv) { + std::string config_file_a = "./gserver/tests/img_conv_a.conf"; + std::string config_file_b = "./gserver/tests/img_conv_b.conf"; + bool useGpu = FLAGS_use_gpu; + FLAGS_use_gpu = true; + compareNetwork(config_file_a, config_file_b); + FLAGS_use_gpu = useGpu; +} +#endif P_DEFINE_string(config_file_a, "", "config of one network to compare"); P_DEFINE_string(config_file_b, "", "config of another network to compare"); diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp index 68f7f43261c8353b6836416bea97dad4f817ba75..01070bc1cb3023bc0321f0a8e867b8abd7030e08 100644 --- a/paddle/gserver/tests/test_ProtoDataProvider.cpp +++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include @@ -41,7 +40,9 @@ const int kSpraseMatrixDim = 1024; using namespace paddle; // NOLINT -void prepareData(DataBatch* batch, const int* numPerSlotType, bool iid, +void prepareData(DataBatch* batch, + const int* numPerSlotType, + bool iid, bool useGpu) { batch->clear(); int64_t size = uniformRandom(100) + 10; @@ -137,7 +138,7 @@ inline int getSlotDim(const Argument& arg) { inline SlotDef::SlotType getSlotType(const Argument& arg) { if (arg.value) { - auto & m = *arg.value; + auto& m = *arg.value; auto& type = typeid(m); if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) { return SlotDef::VECTOR_DENSE; @@ -169,8 +170,12 @@ inline SlotDef::SlotType getSlotType(const Argument& arg) { return SlotDef::VECTOR_DENSE; } -void getColRow(const Argument& arg, int64_t pos, bool useGpu, int* colNum, - const int** rowCols, const real** rowValues) { +void getColRow(const Argument& arg, + int64_t pos, + bool useGpu, + int* colNum, + const int** rowCols, + const real** rowValues) { SlotDef::SlotType type = getSlotType(arg); GpuSparseMatrixPtr matGpu; CpuSparseMatrixPtr matCpu; @@ -190,8 +195,11 @@ void getColRow(const Argument& arg, int64_t pos, bool useGpu, int* colNum, } } -void makeSample(const vector& arguments, int64_t pos, - bool isBeginning, DataSample* sample, bool useGpu) { +void makeSample(const vector& arguments, + int64_t pos, + bool isBeginning, + DataSample* sample, + bool useGpu) { sample->set_is_beginning(isBeginning); int slotid = 0; for (auto& arg : arguments) { @@ -272,8 +280,7 @@ void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) { int64_t totalSeqs = batch.getNumSequences(); int64_t seq = 0; - ICpuGpuVectorPtr sequenceStartPositions = - arguments[0].sequenceStartPositions; + ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions; int64_t numWritten = 0; vector curProtoFiles = dataCompression ? 
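
getColRow, whose signature is reflowed above, pulls one row out of a sparse argument in CSR terms: the nonzero count, the column ids, and (for value data) the values. A scipy sketch of the same access pattern, assuming a csr_matrix stand-in for the test's sparse slot:

    import numpy as np
    from scipy.sparse import csr_matrix

    def get_col_row(mat, pos):
        # one CSR row: nonzero count, column indices, and values
        start, end = mat.indptr[pos], mat.indptr[pos + 1]
        return end - start, mat.indices[start:end], mat.data[start:end]

    m = csr_matrix(np.eye(3))
    col_num, row_cols, row_values = get_col_row(m, 1)
    assert col_num == 1 and row_cols[0] == 1
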
protoFilesCompressed : protoFiles; @@ -306,8 +313,11 @@ void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) { } // check that the sample at pos1 in args1 is same as the sample at pos2 in args2 -void checkSample(const vector& args1, int64_t pos1, - const vector& args2, int64_t pos2, bool useGpu) { +void checkSample(const vector& args1, + int64_t pos1, + const vector& args2, + int64_t pos2, + bool useGpu) { EXPECT_EQ(args1.size(), args2.size()); VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2; @@ -361,8 +371,11 @@ void checkSample(const vector& args1, int64_t pos1, } } -void testProtoDataProvider(int* numPerSlotType, bool iid, bool async, - bool useGpu, bool dataCompression, +void testProtoDataProvider(int* numPerSlotType, + bool iid, + bool async, + bool useGpu, + bool dataCompression, int numConstantSlots = 0) { mkDir(kTestDir); DataBatch data; @@ -377,7 +390,9 @@ void testProtoDataProvider(int* numPerSlotType, bool iid, bool async, for (int i = 0; i < numConstantSlots; ++i) { config.add_constant_slots(i + 11); - MatrixPtr w = Matrix::create(data.getSize(), 1, /* trans= */ false, + MatrixPtr w = Matrix::create(data.getSize(), + 1, + /* trans= */ false, /* useGpu= */ false); w->assign(config.constant_slots(i)); data.appendData(w); @@ -393,16 +408,14 @@ void testProtoDataProvider(int* numPerSlotType, bool iid, bool async, size_t seq1 = 0; vector& args1 = data.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions1 = - args1[0].sequenceStartPositions; + ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions; dataProvider->reset(); while (dataProvider->getNextBatch(batchSize, &batch) > 0) { CHECK_EQ(data.getNumStreams(), batch.getNumStreams()); vector& args2 = batch.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions2 = - args2[0].sequenceStartPositions; + ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions; for (auto& arg : args2) { EXPECT_EQ(iid, !arg.sequenceStartPositions); } @@ -494,8 +507,8 @@ TEST(ProtoDataProvider, test) { numSparseValueVectorSlots; numPerSlotType[SlotDef::INDEX] = numIdSlots; numPerSlotType[SlotDef::STRING] = numStrSlots; - testProtoDataProvider(numPerSlotType, iid, async, useGpu, - dataCompression); + testProtoDataProvider( + numPerSlotType, iid, async, useGpu, dataCompression); } // end for (int dataCompression : numTwoArray) } // end for (int useGpu : numTwoArray) } // end for (int async : numTwoArray) @@ -531,7 +544,9 @@ TEST(ProtoDataProvider, constant_slots) { numPerSlotType[SlotDef::INDEX] = 1; testProtoDataProvider(numPerSlotType, /* iid= */ true, - /* async= */ false, useGpu, dataCompression, + /* async= */ false, + useGpu, + dataCompression, numConstantSlots); } // end for (int dataCompression : numTwoArray) } // end for (int useGpu : numTwoArray) @@ -541,16 +556,17 @@ TEST(ProtoDataProvider, constant_slots) { } void checkSampleSequence(const vector& args1, - const vector& args2, int64_t offset, - int64_t numSeqs, bool useGpu) { + const vector& args2, + int64_t offset, + int64_t numSeqs, + bool useGpu) { // check slot num are equal EXPECT_EQ(args1.size(), args2.size()); for (size_t i = 0; i < args1.size(); i++) { auto type = getSlotType(args1[i]); // check for args2: sequenceStartPositions vs numSeqs // (1) size - EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), - (size_t)numSeqs + 1); + EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1); // (2) content auto checkArgContent = [&](const Argument& args, int numSeqs) { for (int j = 0; j <= numSeqs; j++) { @@ 
-579,8 +595,8 @@ void checkSampleSequence(const vector& args1, const real* rowValues1; // nullptr int totalLength = 0; for (int j = 0; j < numSeqs; j++) { - getColRow(args1[i], offset + j, useGpu, &colNum1, &rowCols1, - &rowValues1); + getColRow( + args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1); // (1) lengths EXPECT_EQ(totalLength, args2[i].sequenceStartPositions->getElement(j)); @@ -626,13 +642,16 @@ void checkSampleSequence(const vector& args1, } } -void testProtoSequenceDataProvider(int* numPerSlotType, bool async, +void testProtoSequenceDataProvider(int* numPerSlotType, + bool async, bool useGpu) { mkDir(kTestDir); DataBatch data; - prepareData(&data, numPerSlotType, - /* iid */ true, useGpu); + prepareData(&data, + numPerSlotType, + /* iid */ true, + useGpu); writeData(data, useGpu, /* dataCompression */ false); DataConfig config; @@ -649,8 +668,7 @@ void testProtoSequenceDataProvider(int* numPerSlotType, bool async, DataBatch batch; vector& args1 = data.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions1 = - args1[0].sequenceStartPositions; + ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions; dataProvider->reset(); @@ -658,8 +676,7 @@ void testProtoSequenceDataProvider(int* numPerSlotType, bool async, while (dataProvider->getNextBatch(batchSize, &batch) > 0) { CHECK_EQ(data.getNumStreams(), batch.getNumStreams()); vector& args2 = batch.getStreams(); - ICpuGpuVectorPtr sequenceStartPositions2 = - args2[0].sequenceStartPositions; + ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions; for (auto& arg : args1) { // args1 should not has sequence EXPECT_EQ(true, !arg.sequenceStartPositions); diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp index 6ad45e3a65a6276ea9fa5bf8b3878c943caf7cba..802f9aa4cb558f48fe55d7d7d5c882d25925bb32 100644 --- a/paddle/gserver/tests/test_PyDataProvider.cpp +++ b/paddle/gserver/tests/test_PyDataProvider.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include #include @@ -114,9 +113,10 @@ void simpleValueCheck(const vector& argumentList, bool useGpu) { // Dense real* data; if (useGpu) { - MatrixPtr cpuMatrixPtr = - Matrix::create(argumentList[0].value->getHeight(), - argumentList[0].value->getWidth(), 0, 0); + MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(), + argumentList[0].value->getWidth(), + 0, + 0); cpuMatrixPtr->copyFrom(*argumentList[0].value); data = cpuMatrixPtr->getData(); } else { diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index 6bf1e329251219fcbf68b95f2d80a3235cb7037f..24aa73910f254e636dfb88182552fe47c12c8543 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -31,14 +31,11 @@ extern void clearOnPoolFilledHook(); } // namespace unittest } // namespace paddle - const paddle::real epsilon = 1e-5; -static inline int64_t readDataBatch( - paddle::DataBatch* batch, - const std::string& funcName, - int64_t batchSize = 65535) { - +static inline int64_t readDataBatch(paddle::DataBatch* batch, + const std::string& funcName, + int64_t batchSize = 65535) { paddle::DataConfig config; config.set_type("py2"); config.set_files(FLAGS_train_list.c_str()); @@ -64,18 +61,19 @@ TEST(PyDataProvider2, dense_no_seq) { provider->setSkipShuffle(); // skip shuffle for unittest. paddle::DataBatch batch; - for (size_t pass=0; pass < 2; ++pass) { // read 2 passes + for (size_t pass = 0; pass < 2; ++pass) { // read 2 passes provider->reset(); int64_t num = provider->getNextBatchInternal(100, &batch); ASSERT_NE(num, 0); ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1); ASSERT_EQ((size_t)batch.getSize(), (size_t)100); // Check batch data. - for (size_t i=0; i < 100; ++i) { - for (size_t j=0; j < 200; ++j) { - paddle::real tmp = (paddle::real)((j-100.0) * (i+1) / 200.0); - ASSERT_NEAR(batch.getStreams()[0].value->getData()[i*200 + j], - tmp, epsilon);} + for (size_t i = 0; i < 100; ++i) { + for (size_t j = 0; j < 200; ++j) { + paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0); + ASSERT_NEAR( + batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon); + } } num = provider->getNextBatchInternal(100, &batch); @@ -83,12 +81,13 @@ TEST(PyDataProvider2, dense_no_seq) { ASSERT_EQ(batch.getStreams().size(), (size_t)1); ASSERT_EQ((size_t)batch.getSize(), (size_t)100); // Check batch data. - for (size_t i=0; i < 100; ++i) { + for (size_t i = 0; i < 100; ++i) { size_t ii = i + 100; - for (size_t j=0; j < 200; ++j) { - paddle::real tmp = (paddle::real)((j-100.0) * (ii+1) / 200.0); - ASSERT_NEAR(batch.getStreams()[0].value->getData()[i*200 + j], - tmp, epsilon);} + for (size_t j = 0; j < 200; ++j) { + paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0); + ASSERT_NEAR( + batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon); + } } num = provider->getNextBatchInternal(100, &batch); ASSERT_EQ(num, 0); @@ -106,11 +105,11 @@ TEST(PyDataProvider2, index_no_seq) { provider->setSkipShuffle(); // skip shuffle for unittest. 
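
The assertions that follow decode the dense stream produced by the Python side: sample i is a 200-wide vector whose j-th entry is (j - 100) * (i + 1) / 200, and exactly 200 samples arrive per pass. The matching provider, test_dense_no_seq, lives in test_PyDataProvider2.py (not shown in this hunk) and presumably looks along these lines:

    @provider(input_types=[dense_vector(200, seq_type=SequenceType.NO_SEQUENCE)])
    def test_dense_no_seq(setting, filename):
        for i in xrange(200):
            yield [(j - 100.0) * (i + 1) / 200.0 for j in xrange(200)]
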
paddle::DataBatch batch; - for (size_t pass=0; pass < 2; ++pass) { + for (size_t pass = 0; pass < 2; ++pass) { provider->reset(); int64_t num = provider->getNextBatchInternal(10000, &batch); CHECK_EQ(num, 200); - for (int i=0; i < 200; ++i) { + for (int i = 0; i < 200; ++i) { CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]); } } @@ -118,13 +117,14 @@ TEST(PyDataProvider2, index_no_seq) { TEST(PyDataProvider2, init_hook) { paddle::PyObjectPtr pickle = paddle::py::import("pickle"); - paddle::PyObjectPtr globals( - PyModule_GetDict(PyImport_AddModule("__main__"))); + paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__"))); PyDict_SetItemString(globals.get(), "pickle", pickle.get()); paddle::PyObjectPtr locals(PyDict_New()); paddle::PyObjectPtr mdl(PyRun_String( "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})", - Py_file_input, globals.get(), locals.get())); + Py_file_input, + globals.get(), + locals.get())); CHECK_PY(mdl) << "Error!"; paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps")); CHECK_PY(dps) << "Error!"; @@ -145,9 +145,9 @@ TEST(PyDataProvider2, init_hook) { ASSERT_EQ(num, 200); auto& mat = batch.getStreams()[0].value; ASSERT_EQ((size_t)mat->getWidth(), (size_t)20); - for (size_t i=0; i < 200; ++i) { - for (size_t j=0; j < 20; ++j) { - ASSERT_NEAR((paddle::real)j, mat->getData()[i*20 + j], epsilon); + for (size_t i = 0; i < 200; ++i) { + for (size_t j = 0; j < 20; ++j) { + ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon); } } } @@ -168,11 +168,11 @@ TEST(PyDataProvider2, sparse_no_value_no_seq) { auto csm = std::dynamic_pointer_cast( batch.getStreams()[0].value); CHECK(csm != nullptr); - for (int i=0; i < 200; ++i) { + for (int i = 0; i < 200; ++i) { CHECK_EQ(csm->getColNum(i), (size_t)10); int* cols = csm->getRowCols(i); - for (int j=0; j < 10; ++j) { - CHECK_EQ(cols[j], (i+1)*(j+1)); + for (int j = 0; j < 10; ++j) { + CHECK_EQ(cols[j], (i + 1) * (j + 1)); } } } @@ -183,13 +183,13 @@ TEST(PyDataProvider2, sparse_value_no_seq) { auto csm = std::dynamic_pointer_cast( batch.getStreams()[0].value); CHECK(csm != nullptr); - for (int i=0; i < 200; ++i) { + for (int i = 0; i < 200; ++i) { CHECK_EQ(csm->getColNum(i), (size_t)10); int* cols = csm->getRowCols(i); real* dat = csm->getRowValues(i); - for (int j=0; j < 10; ++j) { - EXPECT_EQ(cols[j], (i+1)*(j+1)); - EXPECT_EQ(dat[j], real(j)/real(i+1)); + for (int j = 0; j < 10; ++j) { + EXPECT_EQ(cols[j], (i + 1) * (j + 1)); + EXPECT_EQ(dat[j], real(j) / real(i + 1)); } } } @@ -198,10 +198,10 @@ TEST(PyDataProvider2, index_seq) { paddle::DataBatch batch; CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200); auto& arg = batch.getStreams()[0]; - CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 /2); + CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2); size_t tmp = 0; - for (size_t i=0; i < 200; ++i) { // CHECK DATA CORRECT - for (size_t j=0; j < i+1; ++j) { + for (size_t i = 0; i < 200; ++i) { // CHECK DATA CORRECT + for (size_t j = 0; j < i + 1; ++j) { ASSERT_EQ((size_t)arg.ids->getData()[tmp], j); ++tmp; } @@ -221,9 +221,9 @@ TEST(PyDataProvider2, index_sub_seq) { ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200); auto& arg = batch.getStreams()[0]; size_t tmp = 0; - for (size_t i=0; i < 200; ++i) { - for (size_t j=0; j < i+1; ++j) { - for (size_t k=0; k < j+1; ++k) { + for (size_t i = 0; i < 200; ++i) { + for (size_t j = 0; j < i + 1; ++j) { + for (size_t k = 0; k < j + 1; ++k) { CHECK_EQ((size_t)arg.ids->getData()[tmp++], k); } } @@ -236,14 +236,14 @@ 
TEST(PyDataProvider2, index_sub_seq) { ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0); size_t idx = 1; tmp = 0; - for (size_t i=0; i < 200; ++i) { - for (size_t j=0; j < i+1; ++j) { - tmp += j+1; + for (size_t i = 0; i < 200; ++i) { + for (size_t j = 0; j < i + 1; ++j) { + tmp += j + 1; ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx], - (size_t)tmp); + (size_t)tmp); ++idx; } - ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i+1], tmp); + ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp); } } @@ -264,7 +264,7 @@ TEST(PyDataProvider2, min_pool_size) { paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) { if (totalData > batchSize) { - CHECK_GE(poolSize, std::min(totalData-batchSize, minPoolSize)); + CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize)); } }); while (true) { @@ -287,7 +287,7 @@ TEST(PyDataProvider2, can_over_batch_size) { config.set_load_data_args(""); paddle::DataBatch batch; std::unique_ptr<paddle::DataProvider> provider( - paddle::DataProvider::create(config, false)); + paddle::DataProvider::create(config, false)); provider->reset(); constexpr size_t batchSize = 100; while (true) { @@ -313,7 +313,7 @@ TEST(PyDataProvider2, input_order) { *modelConfig.add_input_layer_names() = "input2"; paddle::DataBatch batch; std::unique_ptr<paddle::DataProvider> provider( - paddle::DataProvider::create(config, modelConfig, false)); + paddle::DataProvider::create(config, modelConfig, false)); provider->reset(); constexpr size_t batchSize = 100; while (true) { @@ -338,7 +338,7 @@ TEST(PyDataProvider2, test_check) { config.set_load_data_args(""); paddle::DataBatch batch; std::unique_ptr<paddle::DataProvider> provider( - paddle::DataProvider::create(config, false)); + paddle::DataProvider::create(config, false)); provider->reset(); while (true) { size_t realBatchSize = provider->getNextBatchInternal(100, &batch); @@ -346,13 +346,30 @@ TEST(PyDataProvider2, test_check) { break; } else { auto& ivec = batch.getStream(0).ids; - for (size_t i=0; i < ivec->getSize(); ++i) { + for (size_t i = 0; i < ivec->getSize(); ++i) { CHECK_LT(ivec->getData()[i], 10); } } } } +TEST(PyDataProvider2, multiThread) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_dense_no_seq"); + config.set_async_load_data(true); + + std::unique_ptr<paddle::DataProvider> provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + paddle::DataBatch batch; + provider->getNextBatch(100, &batch); + provider->reset(); + provider.reset(); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); paddle::initMain(argc, argv); diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py index 71c3335231e52132e6c7e9aaf0cb92d0db2e20df..7ca30198fb1d0e7384db2c28524c7898dcd27e50 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.py +++ b/paddle/gserver/tests/test_PyDataProvider2.py @@ -33,16 +33,19 @@ def test_init_hooker(setting, value, **kwargs): setting.value = value -@provider(input_types=[dense_vector(20, seq_type=SequenceType.NO_SEQUENCE)], - init_hook=test_init_hooker) +@provider( + input_types=[dense_vector( + 20, seq_type=SequenceType.NO_SEQUENCE)], + init_hook=test_init_hooker) def test_init_hook(setting, filename): for i in xrange(200): yield setting.value -@provider( - input_types=[ - sparse_binary_vector(30000, seq_type=SequenceType.NO_SEQUENCE)]) +@provider(input_types=[ +
sparse_binary_vector( + 30000, seq_type=SequenceType.NO_SEQUENCE) +]) def test_sparse_non_value_no_seq(setting, filename): for i in xrange(200): yield [(i + 1) * (j + 1) for j in xrange(10)] @@ -77,28 +80,28 @@ def test_min_pool_size(setting, filename): yield random.randint(0, 100 - 1) -@provider(input_types=[index_slot(100, seq_type=SequenceType.SEQUENCE)], - can_over_batch_size=False, - calc_batch_size=lambda x: len(x[0])) +@provider( + input_types=[index_slot( + 100, seq_type=SequenceType.SEQUENCE)], + can_over_batch_size=False, + calc_batch_size=lambda x: len(x[0])) def test_can_over_batch_size(setting, filename): for _ in xrange(1 << 10): seq_len = random.randint(0, 99) yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)] -@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)}) +@provider(input_types={'input1': index_slot(10), 'input2': index_slot(10)}) def test_input_order(setting, filename): for _ in xrange(1000): - yield { - 'input1': 0, - 'input2': 1 - } + yield {'input1': 0, 'input2': 1} -@provider(input_types=[index_slot(10)], - check=True, - check_fail_continue=True, - should_shuffle="123") # also test should shuffle +@provider( + input_types=[index_slot(10)], + check=True, + check_fail_continue=True, + should_shuffle="123") # also test should shuffle def test_check(settings, filename): yield_good_value = False @@ -108,4 +111,3 @@ def test_check(settings, filename): if i < 10: yield_good_value = True yield i - diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp index d104db3e5b32d5ae5c874f7ef3e5c51fea6366ec..80d713dac03a42b370d50ebb17d089e9be2f17ff 100644 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp @@ -45,12 +45,16 @@ public: auto p = const_cast(this); auto& params = p->getGradientMachine()->getParameters(); return std::accumulate( - params.begin(), params.end(), 0UL, + params.begin(), + params.end(), + 0UL, [](size_t a, const ParameterPtr& p) { return a + p->getSize(); }); } }; -void CalCost(const string& conf, const string& dir, real* cost, +void CalCost(const string& conf, + const string& dir, + real* cost, int num_passes) { auto config = std::make_shared(conf); TrainerForTest trainer; @@ -82,8 +86,8 @@ void CalCost(const string& conf, const string& dir, real* cost, int num = dataProvider->getNextBatch(batchSize, &dataBatch); if (num == 0) break; totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient); - sgdUpdate(learningRate, momentum, decayRate, &vecW, &vecGradient, - &vecMomentum); + sgdUpdate( + learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum); } cost[i] = totalCost; } @@ -119,7 +123,8 @@ TEST(RecurrentGradientMachine, HasSubSequence) { for (bool useGpu : {false, true}) { test("gserver/tests/sequence_layer_group.conf", "gserver/tests/sequence_nest_layer_group.conf", - 1e-5, useGpu); + 1e-5, + useGpu); } } @@ -127,7 +132,8 @@ TEST(RecurrentGradientMachine, rnn) { for (bool useGpu : {false, true}) { test("gserver/tests/sequence_rnn.conf", "gserver/tests/sequence_nest_rnn.conf", - 1e-6, useGpu); + 1e-6, + useGpu); } } @@ -135,16 +141,18 @@ TEST(RecurrentGradientMachine, rnn_multi_input) { for (bool useGpu : {false, true}) { test("gserver/tests/sequence_rnn_multi_input.conf", "gserver/tests/sequence_nest_rnn_multi_input.conf", - 1e-6, useGpu); + 1e-6, + useGpu); } } TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) { - for (bool useGpu : {false, true}) { - 
test("gserver/tests/sequence_rnn_multi_unequalength_inputs.conf", - "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf", - 1e-6, useGpu); - } + for (bool useGpu : {false, true}) { + test("gserver/tests/sequence_rnn_multi_unequalength_inputs.conf", + "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf", + 1e-6, + useGpu); + } } int main(int argc, char** argv) { diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 9b933b153d158bef565c0964232525ba99b8b3d4..0643cec38b3a5d96de64438c7342f827fde808a9 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -71,7 +71,9 @@ void checkError(const CpuVector& vector1, const CpuVector& vector2) { EXPECT_EQ(count, 0) << "There are " << count << " different element."; } -LayerPtr creatDataLayer(string name, size_t batchSize, int layerSize, +LayerPtr creatDataLayer(string name, + size_t batchSize, + int layerSize, bool useGpu) { LayerConfig dataConfig; dataConfig.set_name(name); @@ -96,7 +98,9 @@ LayerPtr creatDataLayer(string name, size_t batchSize, int layerSize, return layer; } -ParameterPtr creatParameter(string name, int pid, size_t paraSize, +ParameterPtr creatParameter(string name, + int pid, + size_t paraSize, bool useGpu) { ParameterConfig paraConfig; paraConfig.set_name(name); @@ -112,7 +116,9 @@ ParameterPtr creatParameter(string name, int pid, size_t paraSize, return parameter; } -ParameterPtr creatParameterBias(string name, int pid, size_t paraSize, +ParameterPtr creatParameterBias(string name, + int pid, + size_t paraSize, bool useGpu) { ParameterConfig paraConfig; paraConfig.set_name(name); @@ -127,8 +133,10 @@ ParameterPtr creatParameterBias(string name, int pid, size_t paraSize, return parameter; } -LayerPtr initRecurrentLayer(LayerConfig layerConfig, size_t batchSize, - int layerSize, bool useGpu) { +LayerPtr initRecurrentLayer(LayerConfig layerConfig, + size_t batchSize, + int layerSize, + bool useGpu) { FLAGS_use_gpu = useGpu; LayerMap layerMap; ParameterMap parameterMap; @@ -214,7 +222,7 @@ TEST(Layer, RecurrentLayer) { #define protected public #include "paddle/gserver/layers/LstmLayer.h" #include "paddle/gserver/layers/GatedRecurrentLayer.h" -template +template class TestRecurrentLayer { public: LayerConfig config_; @@ -227,25 +235,34 @@ public: LayerMap layerMap_; ParameterMap parameterMap_; TestRecurrentLayer(const LayerConfig& config, - bool useGpu, bool useBatch = false) - : config_(config), useGpu_(useGpu), useBatch_(useBatch) {} + bool useGpu, + bool useBatch = false) + : config_(config), useGpu_(useGpu), useBatch_(useBatch) {} void init(size_t batchSize) { FLAGS_use_gpu = useGpu_; testLayer_ = Layer::create(config_); if (typeid(T) == typeid(GatedRecurrentLayer)) { dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(), - batchSize, config_.size() * 3, useGpu_); + batchSize, + config_.size() * 3, + useGpu_); para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(), - 0, config_.size() * config_.size() * 3, useGpu_); - bias_ = creatParameterBias(config_.bias_parameter_name(), - 1, config_.size() * 3, useGpu_); + 0, + config_.size() * config_.size() * 3, + useGpu_); + bias_ = creatParameterBias( + config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_); } else if (typeid(T) == typeid(LstmLayer)) { dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(), - batchSize, config_.size() * 4, useGpu_); + batchSize, + config_.size() * 4, + 
useGpu_); para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(), - 0, config_.size() * config_.size() * 4, useGpu_); - bias_ = creatParameterBias(config_.bias_parameter_name(), - 1, config_.size() * 7, useGpu_); + 0, + config_.size() * config_.size() * 4, + useGpu_); + bias_ = creatParameterBias( + config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_); } layerMap_[dataLayer_->getName()] = dataLayer_; parameterMap_[para_->getName()] = para_; @@ -266,15 +283,17 @@ public: } }; -template<class T> -void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize, - bool cpuBatch, bool gpuBatch) { +template <class T> +void checkRecurrentLayer(LayerConfig layerConfig, + size_t batchSize, + bool cpuBatch, + bool gpuBatch) { TestRecurrentLayer<T> testCpu(layerConfig, false, cpuBatch); TestRecurrentLayer<T> testGpu(layerConfig, true, gpuBatch); testCpu.init(batchSize); testGpu.init(batchSize); - auto checkError = [](MatrixPtr cpu, MatrixPtr gpu, - int numSequences, const char* str) { + auto checkError = []( + MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) { CpuMatrix check(gpu->getHeight(), gpu->getWidth()); check.copyFrom(*gpu); int height = cpu->getHeight(); @@ -290,8 +309,8 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize, } } } - EXPECT_EQ(count, 0) << "[" << str << "]" << - "There are " << count << " different element."; + EXPECT_EQ(count, 0) << "[" << str << "]" + << "There are " << count << " different element."; }; T* cpuLayer = dynamic_cast<T*>(testCpu.testLayer_.get()); T* gpuLayer = dynamic_cast<T*>(testGpu.testLayer_.get()); @@ -312,8 +331,8 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize, testCpu.forward(); testGpu.forward(); - checkError(cpuLayer->getOutputValue(), - gpuLayer->getOutputValue(), 1, "outputValue"); + checkError( + cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue"); /* check backward */ cpuLayer->getOutputGrad()->randomizeUniform(); @@ -327,11 +346,15 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize, checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad"); // check weight grad int numSequences = cpuInput.getNumSequences(); - checkError(cpuLayer->weight_->getWGrad(), gpuLayer->weight_->getWGrad(), - numSequences, "weightGrad"); + checkError(cpuLayer->weight_->getWGrad(), + gpuLayer->weight_->getWGrad(), + numSequences, + "weightGrad"); // check bias grad - checkError(cpuLayer->bias_->getWGrad(), gpuLayer->bias_->getWGrad(), - numSequences, "biasGrad"); + checkError(cpuLayer->bias_->getWGrad(), + gpuLayer->bias_->getWGrad(), + numSequences, + "biasGrad"); } TEST(Layer, GatedRecurrentLayer) { @@ -357,7 +380,7 @@ TEST(Layer, GatedRecurrentLayer) { layerConfig.set_size(frameSize); layerConfig.set_reversed(reversed); checkRecurrentLayer<GatedRecurrentLayer>( - layerConfig, batchSize, cpuBatch, gpuBatch); + layerConfig, batchSize, cpuBatch, gpuBatch); } } } @@ -369,7 +392,7 @@ TEST(Layer, LstmLayer) { LayerConfig layerConfig; layerConfig.set_type("lstmemory"); layerConfig.set_active_type("relu"); - layerConfig.set_active_state_type("sigmoid"); + layerConfig.set_active_state_type("tanh"); layerConfig.set_active_gate_type("sigmoid"); layerConfig.add_inputs(); @@ -388,8 +411,8 @@ TEST(Layer, LstmLayer) { << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch; layerConfig.set_size(frameSize); layerConfig.set_reversed(reversed); - checkRecurrentLayer<LstmLayer> - (layerConfig, batchSize, cpuBatch, gpuBatch); + checkRecurrentLayer<LstmLayer>( + layerConfig, batchSize, cpuBatch, gpuBatch); } } } diff --git
a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp index 9a83217f1a8471e61c2938eff7185cfa585b6c7d..204b03332ff5bba3b9f3e5d98050942d6f0f390f 100644 --- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp +++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include @@ -53,7 +52,7 @@ int randint(int* data, size_t int_max, size_t size) { int this_int = 0; while (count < size) { - this_int = std::rand() % int_max; // NOLINT + this_int = std::rand() % int_max; // NOLINT if (tmp.find(this_int) == tmp.end()) { tmp[this_int] = 0; count += 1; @@ -71,8 +70,10 @@ int randint(int* data, size_t int_max, size_t size) { return 0; } -void calcOutput(ComData& comData, const string configFile, - const string configArgs, bool useGpu) { +void calcOutput(ComData& comData, + const string configFile, + const string configArgs, + bool useGpu) { FLAGS_config = configFile; FLAGS_config_args = configArgs; FLAGS_use_gpu = useGpu; @@ -95,8 +96,8 @@ void calcOutput(ComData& comData, const string configFile, vector& inArgs = dataBatch.getStreams(); trainer.getGradientMachine()->start(trainer.getConfig(), nullptr); - trainer.getGradientMachine()->forwardBackward(inArgs, &comData.outArgs, - PASS_TRAIN); + trainer.getGradientMachine()->forwardBackward( + inArgs, &comData.outArgs, PASS_TRAIN); trainer.getGradientMachine()->finish(); } @@ -108,8 +109,8 @@ void checkMatrix(real* A, real* B, size_t matSize) { #endif int diffNum = 0; for (size_t i = 0; i < matSize; ++i) { - if (std::isinf(A[i]) || std::isnan(A[i]) - || std::isinf(B[i]) || std::isnan(B[i])) { + if (std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) || + std::isnan(B[i])) { } else if (fabs(A[i] - B[i]) > err) { diffNum++; } @@ -117,8 +118,10 @@ void checkMatrix(real* A, real* B, size_t matSize) { EXPECT_EQ(0, diffNum); } -void checkTranspose(real* matrix, real* transpose, - size_t width, size_t matSize) { +void checkTranspose(real* matrix, + real* transpose, + size_t width, + size_t matSize) { #ifndef PADDLE_TYPE_DOUBLE real err = 1e-3; #else @@ -149,20 +152,20 @@ void compareOutput(ComData& fcData, ComData& selFcData) { // check cost LOG(INFO) << "Check cost"; CpuMatrix fcCost(outArgsFc[0].value->getHeight(), - outArgsFc[0].value->getWidth()); + outArgsFc[0].value->getWidth()); CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(), - outArgsSelfc[0].value->getWidth()); + outArgsSelfc[0].value->getWidth()); fcCost.copyFrom(*outArgsFc[0].value); selfcCost.copyFrom(*outArgsSelfc[0].value); checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt()); // check selective fc output and fc output - LOG(INFO) << "Compare output of SelectiveFullyConectedLayer " << - "with FullyConectedLayer"; + LOG(INFO) << "Compare output of SelectiveFullyConectedLayer " + << "with FullyConectedLayer"; CpuMatrix fcOut(outArgsFc[1].value->getHeight(), - outArgsFc[1].value->getWidth()); + outArgsFc[1].value->getWidth()); CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(), - outArgsSelfc[1].value->getWidth()); + outArgsSelfc[1].value->getWidth()); fcOut.copyFrom(*outArgsFc[1].value); selfcOut.copyFrom(*outArgsSelfc[1].value); @@ -189,32 +192,40 @@ void compareOutput(ComData& fcData, ComData& selFcData) { CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT)); CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT)); if 
(paramName == "rand_fc_param.bias") { - checkMatrix(paraValue1.getData(), - paraValue2.getData(), - paraValue1.getSize()); - checkMatrix(paraGrad1.getData(), - paraGrad2.getData(), - paraGrad1.getSize()); + checkMatrix( + paraValue1.getData(), paraValue2.getData(), paraValue1.getSize()); + checkMatrix( + paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize()); } else { - checkTranspose(paraValue1.getData(), paraValue2.getData(), - fcLayerWidth, paraValue1.getSize()); - checkTranspose(paraGrad1.getData(), paraGrad2.getData(), - fcLayerWidth, paraGrad1.getSize()); + checkTranspose(paraValue1.getData(), + paraValue2.getData(), + fcLayerWidth, + paraValue1.getSize()); + checkTranspose(paraGrad1.getData(), + paraGrad2.getData(), + fcLayerWidth, + paraGrad1.getSize()); } } } -void compareSparseMulOutput(real* fcOutput, real* selOutput, size_t nnz, - const std::shared_ptr > > &selCols) { +void compareSparseMulOutput( + real* fcOutput, + real* selOutput, + size_t nnz, + const std::shared_ptr>>& selCols) { #ifndef PADDLE_TYPE_DOUBLE real err = 1e-3; #else real err = 1e-10; #endif - size_t nnzCount = std::accumulate(selCols->begin(), selCols->end(), 0UL, - [](size_t a, const std::pair& arr){ - return a+arr.second; - }); + size_t nnzCount = + std::accumulate(selCols->begin(), + selCols->end(), + 0UL, + [](size_t a, const std::pair& arr) { + return a + arr.second; + }); EXPECT_EQ(nnz, nnzCount); size_t sampleNum = selCols->size(); @@ -225,18 +236,20 @@ void compareSparseMulOutput(real* fcOutput, real* selOutput, size_t nnz, size_t selIdx = (*selCols)[i].first[j]; if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) { diffNum++; - LOG(INFO) << count << " diff : " - << fcOutput[i * fcLayerWidth + selIdx] << "\t" - << selOutput[count]; - } + LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx] + << "\t" << selOutput[count]; + } count++; } } EXPECT_EQ(0, diffNum); } -LayerPtr creatDataLayer(string name, size_t batchSize, size_t layerSize, - std::vector& values, bool useGpu) { +LayerPtr creatDataLayer(string name, + size_t batchSize, + size_t layerSize, + std::vector& values, + bool useGpu) { LayerConfig dataConfig; dataConfig.set_name(name); dataConfig.set_type("data"); @@ -253,8 +266,8 @@ LayerPtr creatDataLayer(string name, size_t batchSize, size_t layerSize, return layer; } -ParameterPtr creatParameter(string name, int pid, size_t paraSize, - string paramFile, bool useGpu) { +ParameterPtr creatParameter( + string name, int pid, size_t paraSize, string paramFile, bool useGpu) { ParameterConfig paraConfig; paraConfig.set_name(name); paraConfig.set_size(paraSize); @@ -268,16 +281,19 @@ ParameterPtr creatParameter(string name, int pid, size_t paraSize, return parameter; } -LayerPtr initFcLayer(LayerPtr dataLayer, LayerConfig layerConfig, - int dataLayerSize, int fcLayerSize, - string paraName, string paraFile, bool useGpu) { +LayerPtr initFcLayer(LayerPtr dataLayer, + LayerConfig layerConfig, + int dataLayerSize, + int fcLayerSize, + string paraName, + string paraFile, + bool useGpu) { LayerMap layerMap; ParameterMap parameterMap; layerMap[dataLayer->getName()] = dataLayer; - ParameterPtr para = - creatParameter(paraName, 0, dataLayerSize * fcLayerSize, - paraFile, useGpu); + ParameterPtr para = creatParameter( + paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu); parameterMap[para->getName()] = para; layerConfig.add_inputs(); @@ -296,14 +312,13 @@ LayerPtr initFcLayer(LayerPtr dataLayer, LayerConfig layerConfig, #ifndef PADDLE_TYPE_DOUBLE // The 
parameter file used in fc.conf and selective_fc.conf is float TEST(Layer, SelectiveFcLayer_train_dense_mul) { - const string& fcConfig = - "gserver/tests/SelectiveFcTest/conf/fc.conf"; + const string& fcConfig = "gserver/tests/SelectiveFcTest/conf/fc.conf"; const string& fcConfigArgs = - "filelist=gserver/tests/SelectiveFcTest/dense_mul_list"; + "filelist=gserver/tests/SelectiveFcTest/dense_mul_list"; const string& selFcConfig = "gserver/tests/SelectiveFcTest/conf/selective_fc.conf"; const string& selConfigArgs = - "filelist=gserver/tests/SelectiveFcTest/dense_mul_list"; + "filelist=gserver/tests/SelectiveFcTest/dense_mul_list"; for (auto useGpu : {false, true}) { #ifdef PADDLE_ONLY_CPU @@ -323,7 +338,7 @@ TEST(Layer, SelectiveFcLayer_train_dense_mul) { } #endif // PADDLE_TYPE_DOUBLE -void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config, +void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config, bool useGpu) { FLAGS_use_gpu = useGpu; size_t batchSize = 100; @@ -332,21 +347,26 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config, for (size_t j = 0; j < batchSize * dataLayerSize; ++j) { values[j] = std::rand() / real(RAND_MAX); } - LayerPtr dataLayer = creatDataLayer( - "data", batchSize, dataLayerSize, values, useGpu); + LayerPtr dataLayer = + creatDataLayer("data", batchSize, dataLayerSize, values, useGpu); const string& selfcParaFile = - "gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose"; + "gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose"; const string& selfcParaName = "rand_fc_param.w.transpose"; std::shared_ptr<SelectiveFullyConnectedLayer> selfcLayer = - std::dynamic_pointer_cast<SelectiveFullyConnectedLayer>(initFcLayer( - dataLayer, config, dataLayerSize, fcLayerWidth, - selfcParaName, selfcParaFile, useGpu)); + std::dynamic_pointer_cast<SelectiveFullyConnectedLayer>( + initFcLayer(dataLayer, + config, + dataLayerSize, + fcLayerWidth, + selfcParaName, + selfcParaFile, + useGpu)); // create selected columns - std::shared_ptr<std::vector<std::pair<int*, size_t> > > selCols( - new std::vector<std::pair<int*, size_t> > (batchSize)); + std::shared_ptr<std::vector<std::pair<int*, size_t>>> selCols( + new std::vector<std::pair<int*, size_t>>(batchSize)); size_t maxNNZ = 30; srand((size_t)(time(NULL))); int total = 0; @@ -364,8 +384,9 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config, MatrixPtr outMatSelfc = selfcLayer->getOutputValue(); CpuSparseMatrixPtr cpuOutMatSelfc( - new CpuSparseMatrix(outMatSelfc->getHeight(), outMatSelfc->getWidth(), - outMatSelfc->getElementCnt())); + new CpuSparseMatrix(outMatSelfc->getHeight(), + outMatSelfc->getWidth(), + outMatSelfc->getElementCnt())); cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT); #ifndef PADDLE_ONLY_CPU if (useGpu) { @@ -376,7 +397,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config, size_t nnz = cpuOutMatSelfc->getElementCnt(); const string& fcParaFile = - "gserver/tests/SelectiveFcTest/model/rand_fc_param.w"; + "gserver/tests/SelectiveFcTest/model/rand_fc_param.w"; const string& fcParaName = "rand_fc_param.w"; LayerConfig fcLayerConfig; fcLayerConfig.set_name("fc_layer"); @@ -384,13 +405,18 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config, fcLayerConfig.set_active_type("linear"); fcLayerConfig.set_size(fcLayerWidth); - LayerPtr fcLayer = initFcLayer(dataLayer, fcLayerConfig, - dataLayerSize, fcLayerWidth, fcParaName, fcParaFile, useGpu); + LayerPtr fcLayer = initFcLayer(dataLayer, + fcLayerConfig, + dataLayerSize, + fcLayerWidth, + fcParaName, + fcParaFile, + useGpu); fcLayer->forward(PASS_TEST); MatrixPtr outMatFc = fcLayer->getOutputValue(); MatrixPtr cpuOutMatFc( - new CpuMatrix(outMatFc->getHeight(),
outMatFc->getWidth())); + new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth())); cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT); #ifndef PADDLE_ONLY_CPU if (useGpu) { @@ -401,7 +427,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config, compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols); for (size_t i = 0; i < batchSize; ++i) { - delete [](*selCols)[i].first; + delete[](*selCols)[i].first; } } diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index f7aa60380f23eeea91ee852480862f6b19caedec..cba8b37289b53b7d75c64a6a95c9e3900b193902 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -48,10 +47,10 @@ public: * @return Pointer to the allocated memory */ virtual void* alloc(size_t size) { - void* ptr; - CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); - CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; - return ptr; + void* ptr; + CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); + CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; + return ptr; } /** @@ -59,12 +58,12 @@ public: * @param ptr Pointer to be free. */ virtual void free(void* ptr) { - if (ptr) { ::free(ptr); } + if (ptr) { + ::free(ptr); + } } - virtual std::string getName() { - return "cpu_alloc"; - } + virtual std::string getName() { return "cpu_alloc"; } }; /** @@ -81,7 +80,7 @@ public: */ virtual void* alloc(size_t size) { void* ptr = hl_malloc_device(size); - CHECK(ptr)<< "Fail to allocate GPU memory " << size << " bytes"; + CHECK(ptr) << "Fail to allocate GPU memory " << size << " bytes"; return ptr; } @@ -95,9 +94,7 @@ public: } } - virtual std::string getName() { - return "gpu_alloc"; - } + virtual std::string getName() { return "gpu_alloc"; } }; /** @@ -128,9 +125,7 @@ public: } } - virtual std::string getName() { - return "cuda_host_alloc"; - } + virtual std::string getName() { return "cuda_host_alloc"; } }; } // namespace paddle diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index 8b888b1ee5e46ec5cac316d9f90095a7e314ae13..2f32b3fdd1a26c5b1bca43d0bd0ebb0896a012c4 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -625,7 +625,10 @@ void BaseMatrixT::squareDerivative(BaseMatrixT& b) { applyBinary(binary::SquareDerivative(), b); } -DEFINE_MATRIX_BINARY_OP(Tanh, b = 2.0 / (1.0 + exp(-2 * a)) - 1.0); +DEFINE_MATRIX_BINARY_OP(Tanh, + T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? 
EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); template<> void BaseMatrixT::tanh(BaseMatrixT& b) { applyBinary(binary::Tanh(), b); @@ -1446,8 +1449,10 @@ template<> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); - int numRows = b.height_; - int numCols = b.width_; + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(height_, numRows); + CHECK_EQ(width_, 1UL); aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); @@ -1458,20 +1463,71 @@ template<> template int BaseMatrixT::applyRow(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); - int numRows = b.height_; - int numCols = b.width_; + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(height_, numRows); + CHECK_EQ(width_, 1UL); aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, false_type(), true_type() /*aAsColVector*/); return 0; } +template<> +template +int BaseMatrixT::applyRow( + Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { + if (scaleDest != 0) { + applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b); + } else { + applyRow(agg, base::binary::second(), b); + if (scaleAgg != 1) { + mulScalar(scaleAgg); + } + } + return 0; +} + +template<> +template +int BaseMatrixT::applyRow(Agg agg, Op op, Saver sv, + BaseMatrixT& b, BaseMatrixT& c) { + MatrixOffset offset(0, 0, 0, 0, 0, 0); + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(height_, numRows); + CHECK_EQ(width_, 1UL); + CHECK_EQ(c.height_, numRows); + CHECK_EQ(c.width_, numCols); + aggregate(agg, op, sv, + b, c, numRows, numCols, offset, + false_type(), true_type() /*aAsColVector*/); + return 0; +} + +template<> +template +int BaseMatrixT::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg, + BaseMatrixT& b, BaseMatrixT& c) { + if (scaleDest != 0) { + applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c); + } else { + applyRow(agg, op, base::binary::second(), b, c); + if (scaleAgg != 1) { + mulScalar(scaleAgg); + } + } + return 0; +} + template<> template int BaseMatrixT::applyCol(Agg agg, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); - int numRows = b.height_; - int numCols = b.width_; + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(width_, numCols); + CHECK_EQ(height_, 1UL); aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows, numCols, offset, true_type() /*aAsRowVector*/, false_type()); @@ -1482,8 +1538,10 @@ template<> template int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { MatrixOffset offset(0, 0, 0, 0, 0, 0); - int numRows = b.height_; - int numCols = b.width_; + size_t numRows = b.height_; + size_t numCols = b.width_; + CHECK_EQ(width_, numCols); + CHECK_EQ(height_, 1UL); aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset, true_type() /*aAsRowVector*/, false_type()); @@ -1491,8 +1549,23 @@ int BaseMatrixT::applyCol(Agg agg, Saver sv, BaseMatrixT& b) { } template<> -void BaseMatrixT::sumRows(BaseMatrixT& b) { - applyRow(aggregate::sum(), b); +template +int BaseMatrixT::applyCol( + Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) { + if (scaleDest != 0) { + applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b); + } else { + applyCol(agg, base::binary::second(), b); + if (scaleAgg != 1) { + mulScalar(scaleAgg); + } + } + return 0; +} + +template<> +void BaseMatrixT::sumRows(BaseMatrixT& b, real 
scaleSum, real scaleDest) { + applyRow(aggregate::sum(), scaleDest, scaleSum, b); } template<> @@ -1521,18 +1594,22 @@ void BaseMatrixT::minCols(BaseMatrixT& b) { } template<> -void BaseMatrixT::sumCols(BaseMatrixT& b, real scale) { - applyCol(aggregate::sum(), base::binary::add2(1.0, scale), b); +void BaseMatrixT::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) { + applyCol(aggregate::sum(), scaleDest, scaleSum, b); } template<> -void BaseMatrixT::sumOfSquares(BaseMatrixT& b, BaseMatrixT& c) { - int numRows = b.height_; - int numCols = b.width_; - MatrixOffset offset(0, 0, 0, 0, 0, 0); - aggregate(aggregate::sum(), base::binary::squaredDiff(), base::binary::add(), - b, c, numRows, numCols, offset, false_type(), - true_type() /*aAsColVector*/); +void BaseMatrixT::sumOfSquaredDiffs( + BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { + applyRow(aggregate::sum(), base::binary::squaredDiff(), + scaleDest, scaleSum, b, c); +} + +template<> +void BaseMatrixT::sumOfProducts( + BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) { + applyRow(aggregate::sum(), base::binary::mul(), + scaleDest, scaleSum, b, c); } template class BaseMatrixT; diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h index 2dd2c2c7a9b985924d53cb3bf8840eb1e55eee3e..d41dcee682cce15e94d45dafeb12bb0dce19b221 100644 --- a/paddle/math/BaseMatrix.h +++ b/paddle/math/BaseMatrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include #include @@ -52,9 +51,14 @@ public: size_t cRow_; size_t dCol_; size_t dRow_; - MatrixOffset(size_t aCol = 0, size_t aRow = 0, size_t bCol = 0, - size_t bRow = 0, size_t cCol = 0, size_t cRow = 0, - size_t dCol = 0, size_t dRow = 0) + MatrixOffset(size_t aCol = 0, + size_t aRow = 0, + size_t bCol = 0, + size_t bRow = 0, + size_t cCol = 0, + size_t cRow = 0, + size_t dCol = 0, + size_t dRow = 0) : aCol_(aCol), aRow_(aRow), bCol_(bCol), @@ -65,7 +69,7 @@ public: dRow_(dRow) {} }; -template +template class BaseMatrixT { public: size_t height_, width_; @@ -97,8 +101,12 @@ public: trans_(mat.trans_), useGpu_(useGpu) {} - BaseMatrixT(size_t height, size_t width, size_t stride, T* data, bool trans, - bool use_gpu) + BaseMatrixT(size_t height, + size_t width, + size_t stride, + T* data, + bool trans, + bool use_gpu) : height_(height), width_(width), stride_(stride), @@ -167,12 +175,17 @@ public: * @endcode */ template - int applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset, bAsRowVector, bAsColVector); + int applyBinary(Op op, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + bAsRowVector, + bAsColVector); template - int applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols, - MatrixOffset& offset); + int applyBinary( + Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset); /** * ternary operator: element wise op(a, b, c). 
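A note on the BaseMatrixT reduction changes above: sumRows, sumCols, sumOfSquaredDiffs and sumOfProducts now take explicit scaleSum/scaleDest factors instead of overwriting the destination (the old sumCols hard-coded add2(1.0, scale)). A minimal dense sketch of the new contract, assuming row-major storage; rowSumScaled and its parameters are illustrative names, not part of this patch:

    #include <cstddef>
    // dst[i] = scaleDest * dst[i] + scaleSum * sum_j b[i*w + j],
    // i.e. what sumRows(b, scaleSum, scaleDest) now documents.
    void rowSumScaled(const float* b, float* dst, size_t h, size_t w,
                      float scaleSum, float scaleDest) {
      for (size_t i = 0; i < h; ++i) {
        float acc = 0.0f;
        for (size_t j = 0; j < w; ++j) acc += b[i * w + j];
        dst[i] = scaleDest * dst[i] + scaleSum * acc;
      }
    }

This also explains the scaleDest != 0 branch in the new applyRow/applyCol overloads: when scaleDest == 0 the destination may hold uninitialized data, so they assign through base::binary::second() and rescale afterwards, presumably to avoid reading dst at all in that case.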
@@ -212,13 +225,22 @@ public: * @endcode */ template - int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, int numRows, - int numCols, MatrixOffset& offset, cAsRowVector, + int applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + cAsRowVector, cAsColVector); template - int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, int numRows, - int numCols, MatrixOffset& offset); + int applyTernary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset); /** * quaternary operator: element wise op(a, b, c, d). @@ -247,8 +269,13 @@ public: * @endcode */ template - int applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, - int numRows, int numCols, MatrixOffset& offset); + int applyQuaternary(Op op, + BaseMatrixT& b, + BaseMatrixT& c, + BaseMatrixT& d, + int numRows, + int numCols, + MatrixOffset& offset); /** * a aggregate expression that apply each row(or column) of matrix b. @@ -266,10 +293,20 @@ public: * a[i] = sv(a[i], dst) * @endcode */ - template - int aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, int numRows, - int numCols, MatrixOffset& offset, aAsRowVector, aAsColVector); + int aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, + aAsColVector); /** * a aggregate expression that apply each row(or column) of matrix b and c. @@ -288,10 +325,20 @@ public: * a[i] = sv(a[i], dst) * @endcode */ - template - int aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c, - int numRows, int numCols, MatrixOffset& offset, aAsRowVector, + int aggregate(Agg agg, + Op op, + Saver sv, + BaseMatrixT& b, + BaseMatrixT& c, + int numRows, + int numCols, + MatrixOffset& offset, + aAsRowVector, aAsColVector); /** @@ -305,6 +352,27 @@ public: template int applyRow(Agg agg, BaseMatrixT& b); + /** + * a aggregate expression that apply each row of matrix b. + * + * @code + * for each row i & 0 <= j < b.width_, do: + * dst = agg(op(b[i*ldb + j], c[i*ldc + j]) + * this[i] = sv(this[i], dst) + * @endcode + */ + template + int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c); + + // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) + template + int applyRow(Agg agg, + Op op, + real scaleDest, + real scaleAgg, + BaseMatrixT& b, + BaseMatrixT& c); + /** * a aggregate expression that apply each row of matrix b. * @@ -317,6 +385,10 @@ public: template int applyRow(Agg agg, Saver sv, BaseMatrixT& b); + // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) + template + int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b); + /** * a aggregate expression that apply each column of matrix b. 
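The two-matrix applyRow overloads declared in these hunks generalize the same idea to dst_i = sv(dst_i, agg_j op(b_ij, c_ij)); sumOfSquaredDiffs in BaseMatrix.cu is the instantiation with agg = sum and op = squaredDiff. A reference loop under the same illustrative naming as the previous sketch:

    #include <cstddef>
    // dst[i] = scaleDest * dst[i] + scaleSum * sum_j (b[i][j] - c[i][j])^2
    void sumOfSquaredDiffsRef(const float* b, const float* c, float* dst,
                              size_t h, size_t w,
                              float scaleSum, float scaleDest) {
      for (size_t i = 0; i < h; ++i) {
        float acc = 0.0f;
        for (size_t j = 0; j < w; ++j) {
          const float d = b[i * w + j] - c[i * w + j];  // op = squaredDiff
          acc += d * d;                                 // agg = sum
        }
        dst[i] = scaleDest * dst[i] + scaleSum * acc;   // sv = add2(scaleDest, scaleSum)
      }
    }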
* @@ -340,6 +412,10 @@ public: template int applyCol(Agg agg, Saver sv, BaseMatrixT& b); + // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg) + template + int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b); + bool useGpu() const { return useGpu_; } const T* rowBuf(size_t row) const { return data_ + width_ * row; } @@ -639,8 +715,7 @@ public: * this = a*p1 + b*p2 + c*p3 * @endcode */ - void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, - T p3); + void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3); /** * @code @@ -650,9 +725,9 @@ public: */ void sgdUpdate(BaseMatrixT& b, // grad BaseMatrixT& c, // mom - T p1, // learningRate, - T p2, // momentum, - T p3); // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3); // decayRate /** * @code @@ -663,9 +738,9 @@ public: void sgdUpdate(BaseMatrixT& b, // grad, BaseMatrixT& c, // mom, BaseMatrixT& d, // lr, - T p1, // learningRate, - T p2, // momentum, - T p3); // decayRate + T p1, // learningRate, + T p2, // momentum, + T p3); // decayRate /// apply L1/L2 to *this* void applyL1(T learningRate, T decayRate); @@ -742,17 +817,21 @@ public: * this = b>c ? b : c * @endcode */ - void max(BaseMatrixT& b, BaseMatrixT& c); // NOLINT + void max(BaseMatrixT& b, BaseMatrixT& c); // NOLINT /** * @code * this[destCol] += (b>p1 == c>p1) ? 0 : 1) * @endcode */ - void binaryClassificationError(size_t destCol, BaseMatrixT& b, BaseMatrixT& c, + void binaryClassificationError(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, T p); - void binaryClassificationError2(size_t destCol, BaseMatrixT& b, - BaseMatrixT& c, T p); + void binaryClassificationError2(size_t destCol, + BaseMatrixT& b, + BaseMatrixT& c, + T p); /** * @code @@ -808,8 +887,8 @@ public: * this += sqr(p1*b + p2*c + p3*d) * @endcode */ - void addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, - T p2, T p3); + void addSquareSum( + BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3); /** * @code @@ -920,7 +999,9 @@ public: void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); /// calculate the sum of each row of the matrix b. - void sumRows(BaseMatrixT& b); + /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} + void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest); + /// calculate the maximum value of each row of the matrix b. void maxRows(BaseMatrixT& b); /// calculate the minimum value of each row of the matrix b. @@ -932,10 +1013,19 @@ public: void maxCols(BaseMatrixT& b); /// calculate the minimum value of each column of the matrix b. void minCols(BaseMatrixT& b); - void sumCols(BaseMatrixT& b, T scale); - /// calculate the sum of each row of (b - c)^2. - void sumOfSquares(BaseMatrixT& b, BaseMatrixT& c); + /// calculate the sum of each column of the matrix b. 
+ /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji} + void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest); + + /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2 + void sumOfSquaredDiffs(BaseMatrixT& b, + BaseMatrixT& c, + T scaleSum, + T scaleDest); + + /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij} + void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c, T scaleSum, T scaleDest); /** * @code @@ -950,9 +1040,7 @@ public: */ void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); - virtual bool isSparse() const { - return false; - } + virtual bool isSparse() const { return false; } }; typedef BaseMatrixT BaseMatrix; diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index db305812a7c036177022836d877661c8f83e999f..93b1bf46a10078b4ae83efdbf268f64b6da052dc 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -23,7 +23,7 @@ if(NOT WITH_GPU) add_library(paddle_math STATIC ${MATH_SOURCES}) else() - add_paddle_culib(paddle_math ${MATH_SOURCES}) + cuda_add_library(paddle_math ${MATH_SOURCES}) endif() diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp index 842efdbe3d77ec3443374f62df5c520252aa7ce4..ad3f8e64efd37c27c7f462dd7c8311577a05a391 100644 --- a/paddle/math/CpuSparseMatrix.cpp +++ b/paddle/math/CpuSparseMatrix.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "hl_gpu.h" #include "CpuSparseMatrix.h" #include "SparseMatrix.h" @@ -24,24 +23,35 @@ namespace paddle { const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH; -CpuSparseMatrix::CpuSparseMatrix(size_t height, size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, +CpuSparseMatrix::CpuSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, bool trans) : Matrix(NULL, height, width, trans, false) { resize(height, width, nnz, valueType, format); } -CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle, size_t height, - size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, +CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, bool trans) : Matrix(dataHandle, height, width, trans, false) { resize(height, width, nnz, valueType, format); } -CpuSparseMatrix::CpuSparseMatrix(real* data, int* rows, int* cols, - size_t height, size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, +CpuSparseMatrix::CpuSparseMatrix(real* data, + int* rows, + int* cols, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, bool trans) : Matrix(NULL, height, width, trans, false) { cols_ = cols; @@ -54,8 +64,11 @@ CpuSparseMatrix::CpuSparseMatrix(real* data, int* rows, int* cols, format_ = format; } -void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth, size_t newNnz, - SparseValueType valueType, SparseFormat format) { +void CpuSparseMatrix::resize(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType, + SparseFormat format) { CHECK_LE(newNnz, newHeight * newWidth); size_t newSize = 0; if (format == SPARSE_CSR) { @@ -110,23 +123,38 @@ void CpuSparseMatrix::sparseResize() { } void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) { - resize(newHeight, newWidth, newHeight * 
std::min(DEFAULT_AVG_WIDTH, newWidth), - valueType_, format_); + resize(newHeight, + newWidth, + newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth), + valueType_, + format_); } MatrixPtr CpuSparseMatrix::getTranspose() { if (!memoryHandle_ && !value_) { - MatrixPtr dest(new CpuSparseMatrix(height_, width_, elementCnt_, valueType_, - format_, true)); + MatrixPtr dest(new CpuSparseMatrix( + height_, width_, elementCnt_, valueType_, format_, true)); return dest; } else if (memoryHandle_) { MatrixPtr dest(new CpuSparseMatrix( - std::dynamic_pointer_cast(memoryHandle_), height_, - width_, elementCnt_, valueType_, format_, true)); + std::dynamic_pointer_cast(memoryHandle_), + height_, + width_, + elementCnt_, + valueType_, + format_, + true)); return dest; } else if (value_) { - MatrixPtr dest(new CpuSparseMatrix(value_, rows_, cols_, height_, width_, - elementCnt_, valueType_, format_, true)); + MatrixPtr dest(new CpuSparseMatrix(value_, + rows_, + cols_, + height_, + width_, + elementCnt_, + valueType_, + format_, + true)); return dest; } else { return NULL; @@ -140,7 +168,10 @@ void CpuSparseMatrix::mul(MatrixPtr a, MatrixPtr b, real scaleAB, real scaleT) { if (dynamic_cast(a.get()) && dynamic_cast(b.get())) { CpuMatrix::mul(dynamic_cast(a.get()), - dynamic_cast(b.get()), this, scaleAB, scaleT); + dynamic_cast(b.get()), + this, + scaleAB, + scaleT); } else { LOG(FATAL) << "not supported"; } @@ -243,7 +274,8 @@ void CpuSparseMatrix::randomizeUniform() { } } -void CpuSparseMatrix::copyFrom(std::vector& rows, std::vector& cols, +void CpuSparseMatrix::copyFrom(std::vector& rows, + std::vector& cols, std::vector& values) { size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size(); resize(height_, width_, size, valueType_, format_); @@ -302,11 +334,11 @@ MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) { } CHECK(width && height); if (!useGpu) { - return std::make_shared(height, width, 0, valueType_, - format_); + return std::make_shared( + height, width, 0, valueType_, format_); } else { - return std::make_shared(height, width, elementCnt_, - valueType_, format_); + return std::make_shared( + height, width, elementCnt_, valueType_, format_); } } @@ -315,13 +347,25 @@ MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) { CHECK_EQ(format_, SPARSE_CSR); if (valueType_ == NO_VALUE) { return std::make_shared( - nullptr, rows_ + startRow, cols_, numRows, width_, - rows_[startRow + numRows] - rows_[startRow], valueType_, format_, + nullptr, + rows_ + startRow, + cols_, + numRows, + width_, + rows_[startRow + numRows] - rows_[startRow], + valueType_, + format_, trans_); } else { return std::make_shared( - value_, rows_ + startRow, cols_, numRows, width_, - rows_[startRow + numRows] - rows_[startRow], valueType_, format_, + value_, + rows_ + startRow, + cols_, + numRows, + width_, + rows_[startRow + numRows] - rows_[startRow], + valueType_, + format_, trans_); } } @@ -404,14 +448,13 @@ void CpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { } } -void CpuSparseMatrix::setRow(size_t row, size_t colNum, - const unsigned int* cols, const real* values) { +void CpuSparseMatrix::setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { if (format_ == SPARSE_CSR) { CHECK_LT(row, height_); CHECK(NULL != cols); - for (size_t i = row; i < height_; i++) { - CHECK_EQ(rows_[i + 1], rows_[i]); - } if (0 == row) { rows_[row] = 0; } @@ -497,11 +540,23 @@ void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) { 
CHECK_EQ(size_t(elementCnt_), src.getElementCnt()); size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_; if (format_ == SPARSE_CSC) - hl_memcpy_from_csc_matrix(value_, valSize, rows_, elementCnt_, cols_, - width_ + 1, src.sMatrix_.get(), stream); + hl_memcpy_from_csc_matrix(value_, + valSize, + rows_, + elementCnt_, + cols_, + width_ + 1, + src.sMatrix_.get(), + stream); else - hl_memcpy_from_csr_matrix(value_, valSize, rows_, height_ + 1, cols_, - elementCnt_, src.sMatrix_.get(), stream); + hl_memcpy_from_csr_matrix(value_, + valSize, + rows_, + height_ + 1, + cols_, + elementCnt_, + src.sMatrix_.get(), + stream); } void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) { @@ -539,14 +594,16 @@ void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) { } } -void CpuSparseMatrix::copyRow(int offsets, size_t colNum, +void CpuSparseMatrix::copyRow(int offsets, + size_t colNum, const sparse_non_value_t* row) { for (size_t j = 0; j < colNum; j++) { cols_[offsets + j] = row[j].col; } } -void CpuSparseMatrix::copyRow(int offsets, size_t colNum, +void CpuSparseMatrix::copyRow(int offsets, + size_t colNum, const sparse_float_value_t* row) { for (size_t j = 0; j < colNum; j++) { cols_[offsets + j] = row[j].col; @@ -599,7 +656,8 @@ void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { if (format_ == SPARSE_CSR) { int* srcCols = src.getCols(); size_t numLessWidth = - std::count_if(srcCols, srcCols + src.getElementCnt(), + std::count_if(srcCols, + srcCols + src.getElementCnt(), [this](size_t n) { return n < this->width_; }); resize(height_, width_, numLessWidth, valueType_, format_); rows_[0] = 0; @@ -639,13 +697,15 @@ void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { void CpuSparseMatrix::zeroMem() { CHECK(valueType_ == FLOAT_VALUE); - memset(value_, 0, elementCnt_* sizeof(real)); + memset(value_, 0, elementCnt_ * sizeof(real)); } -template void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, +template void CpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, sparse_non_value_t* data); -template void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, +template void CpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, sparse_float_value_t* data); template void CpuSparseMatrix::copyFrom(int64_t* indices, @@ -676,7 +736,9 @@ void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { } size_t outsize = std::min(num, beam); - std::partial_sort(vec.begin(), vec.begin() + outsize, vec.end(), + std::partial_sort(vec.begin(), + vec.begin() + outsize, + vec.end(), [](const valuepair& a, const valuepair& b) { return a.first > b.first; }); diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h index fd3b5030bea7ac937d9cf828e29d3441446a65f6..861564555166da0bb70d500569dc0d4f89dd2fe5 100644 --- a/paddle/math/CpuSparseMatrix.h +++ b/paddle/math/CpuSparseMatrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
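Background for the rows_/cols_/value_ bookkeeping in the hunks above (note the hl_memcpy_from_csr_matrix call passes rows_ with length height_ + 1): in CSR form rows_ holds offsets and cols_/value_ hold one entry per nonzero; SPARSE_CSC swaps the roles of rows_ and cols_. A toy FLOAT_VALUE CSR instance, with illustrative variable names:

    // 2 x 4 matrix  [10  0 20  0]
    //               [ 0 30  0  0]  stored as CSR:
    int   rows[] = {0, 2, 3};           // entries of row i live in [rows[i], rows[i+1])
    int   cols[] = {0, 2, 1};           // column index of each stored entry
    float vals[] = {10.f, 20.f, 30.f};  // value_ array; absent for NO_VALUE matrices
    // so getColNum(i) == rows[i + 1] - rows[i], getRowCols(i) == cols + rows[i],
    // and getRowValues(i) == vals + rows[i].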
*/ - #pragma once #include #include "Matrix.h" @@ -21,24 +20,38 @@ namespace paddle { class CpuSparseMatrix : public Matrix { public: - CpuSparseMatrix(size_t height, size_t width, + CpuSparseMatrix(size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType = FLOAT_VALUE, - SparseFormat format = SPARSE_CSR, bool trans = false); - - CpuSparseMatrix(CpuMemHandlePtr memHandle, size_t height, size_t width, - size_t nnz, SparseValueType valueType, SparseFormat format, + SparseFormat format = SPARSE_CSR, + bool trans = false); + + CpuSparseMatrix(CpuMemHandlePtr memHandle, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, bool trans); - CpuSparseMatrix(real* data, int* rows, int* cols, size_t height, size_t width, - size_t nnz, SparseValueType valueType, SparseFormat format, + CpuSparseMatrix(real* data, + int* rows, + int* cols, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, bool trans); ~CpuSparseMatrix() {} - void resize(size_t newHeight, size_t newWidth, + void resize(size_t newHeight, + size_t newWidth, size_t newNnz, /* used to allocate space */ - SparseValueType valueType, SparseFormat format); + SparseValueType valueType, + SparseFormat format); void resize(size_t newHeight, size_t newWidth); MatrixPtr getTranspose(); @@ -75,8 +88,6 @@ public: } } - - real* getColumn(size_t i) const { if (format_ == SPARSE_CSC) { return value_ + cols_[i]; @@ -182,7 +193,7 @@ public: * getData is convenient to get value */ real* getData() { return getValue(); } - const real* getData() const { return getValue();} + const real* getData() const { return getValue(); } /** * @brief only set value_ of FLOAT_VALUE sparse matrix to zero @@ -220,7 +231,9 @@ public: void printOneRow(std::ostream& os, size_t idx) const; - void setRow(size_t row, size_t colNum, const unsigned int* cols, + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, const real* values); void randomizeUniform(); @@ -241,7 +254,8 @@ public: virtual MatrixPtr subMatrix(size_t startRow, size_t numRows); - void copyFrom(std::vector& rows, std::vector& cols, + void copyFrom(std::vector& rows, + std::vector& cols, std::vector& values); void copyFrom(const CpuMatrix& src); @@ -285,9 +299,7 @@ protected: // BaseMatrixT interface public: - bool isSparse() const { - return true; - } + bool isSparse() const { return true; } private: using Matrix::copyFrom; diff --git a/paddle/math/ExecViaCpu.h b/paddle/math/ExecViaCpu.h index 64e5b8312168499d4267937cdc7f0b872fa5ea37..67fb6c0cda6f46ddf4547b9ec9faaa8931c75eed 100644 --- a/paddle/math/ExecViaCpu.h +++ b/paddle/math/ExecViaCpu.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - /* execViaCpu is used to do operations on GpuMatirx and/or GpuIVector through cpu functions. 
It can automatically make a temporary CPU copy for the @@ -46,8 +45,10 @@ public: explicit CopyToCpu(Matrix& arg) : arg_(arg) { if (arg.useGpu()) { CHECK(!arg.isTransposed()) << "Not supported"; - copied_ = Matrix::create(arg.getHeight(), arg.getWidth(), - /* trans= */ false, /* useGpu= */ false); + copied_ = Matrix::create(arg.getHeight(), + arg.getWidth(), + /* trans= */ false, + /* useGpu= */ false); copied_->copyFrom(arg); } } @@ -69,8 +70,10 @@ public: explicit CopyToCpu(const Matrix& arg) : arg_(arg) { if (arg.useGpu()) { CHECK(!arg.isTransposed()) << "Not supported"; - copied_ = Matrix::create(arg.getHeight(), arg.getWidth(), - /* trans= */ false, /* useGpu= */ false); + copied_ = Matrix::create(arg.getHeight(), + arg.getWidth(), + /* trans= */ false, + /* useGpu= */ false); copied_->copyFrom(arg); } } @@ -165,7 +168,8 @@ class GpuFuncWrapper2 std::is_function::value, std::is_pointer::value && std::is_function::type>::value, - std::is_class::value, F> {}; + std::is_class::value, + F> {}; template class GpuFuncWrapper diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index da493379e3a37ecb8f4d8f9f333629b3e71d90a5..1217163beecf19c2af215e3d4c72db644cd74b51 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -12,173 +12,275 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "MathFunctions.h" #include "hl_matrix_ops.cuh" #include "hl_matrix_apply.cuh" namespace paddle { -template<> -void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, - const float alpha, const float* A, const int lda, - const float* B, const int ldb, - const float beta, float* C, const int ldc) { - cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const float alpha, + const float* A, + const int lda, + const float* B, + const int ldb, + const float beta, + float* C, + const int ldc) { + cblas_sgemm(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const double alpha, + const double* A, + const int lda, + const double* B, + const int ldb, + const double beta, + double* C, + const int ldc) { + cblas_dgemm(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); } -template<> -void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, - const double alpha, const double* A, const int lda, - const double* B, const int ldb, - const double beta, double* C, const int ldc) { - cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); +template <> +int getrf(const CBLAS_ORDER order, + const int M, + const int N, + float* A, + const int lda, + int* ipiv) { +#ifdef PADDLE_USE_ATLAS + return clapack_sgetrf(order, M, N, A, lda, ipiv); +#else + return LAPACKE_sgetrf(order, M, N, A, lda, ipiv); +#endif +} + +template <> +int getrf(const CBLAS_ORDER order, + const int M, + const int N, + double* A, + const int lda, + int* ipiv) { +#ifdef PADDLE_USE_ATLAS + return clapack_dgetrf(order, M, N, A, lda, 
ipiv); +#else + return LAPACKE_dgetrf(order, M, N, A, lda, ipiv); +#endif +} + +template <> +int getri(const CBLAS_ORDER order, + const int N, + float* A, + const int lda, + const int* ipiv) { +#ifdef PADDLE_USE_ATLAS + return clapack_sgetri(order, N, A, lda, ipiv); +#else + return LAPACKE_sgetri(order, N, A, lda, ipiv); +#endif +} + +template <> +int getri(const CBLAS_ORDER order, + const int N, + double* A, + const int lda, + const int* ipiv) { +#ifdef PADDLE_USE_ATLAS + return clapack_dgetri(order, N, A, lda, ipiv); +#else + return LAPACKE_dgetri(order, N, A, lda, ipiv); +#endif } -template<> +template <> void axpy(const int n, const float alpha, const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); } -template<> +template <> void axpy(const int n, const double alpha, const double* x, double* y) { cblas_daxpy(n, alpha, x, 1, y, 1); } -template<> +template <> float dotProduct(const int n, const float* x, const float* y) { return cblas_sdot(n, x, 1, y, 1); } -template<> +template <> double dotProduct(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } #ifdef PADDLE_USE_MKL -template<> +template <> void vExp(const int n, const float* a, float* r) { vsExp(n, a, r); } -template<> +template <> void vExp(const int n, const double* a, double* r) { vdExp(n, a, r); } -template<> +template <> void vPow(const int n, const float* a, const float b, float* r) { vsPowx(n, a, b, r); } -template<> +template <> void vPow(const int n, const double* a, const double b, double* r) { vdPowx(n, a, b, r); } -template<> +template <> void vLog(const int n, const float* a, float* r) { vsLn(n, a, r); } -template<> +template <> void vLog(const int n, const double* a, double* r) { vdLn(n, a, r); } -template<> +template <> void vAdd(const int n, const float* a, const float* b, float* r) { vsAdd(n, a, b, r); } -template<> +template <> void vAdd(const int n, const double* a, const double* b, double* r) { vdAdd(n, a, b, r); } -template<> +template <> void vInvSqrt(const int n, const float* a, float* r) { vsInvSqrt(n, a, r); } -template<> +template <> void vInvSqrt(const int n, const double* a, double* r) { vdInvSqrt(n, a, r); } -template<> +template <> void vLog1p(const int n, const float* a, float* r) { vsLog1p(n, a, r); } -template<> +template <> void vLog1p(const int n, const double* a, double* r) { vdLog1p(n, a, r); } -template<> +template <> void vTanh(const int n, const float* a, float* r) { vsTanh(n, a, r); } -template<> +template <> void vTanh(const int n, const double* a, double* r) { vdTanh(n, a, r); } #else DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); -template +template void vExp(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vExp(), const_cast(a), r, 1, n, n, n); + binary::vExp(), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); -template +template void vLog(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vLog(), const_cast(a), r, 1, n, n, n); + binary::vLog(), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a)); -template +template void vInvSqrt(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vInvSqrt(), const_cast(a), r, 1, n, n, n); + binary::vInvSqrt(), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a)); -template +template void vLog1p(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vLog1p(), const_cast(a), r, 1, n, n, n); + binary::vLog1p(), 
const_cast(a), r, 1, n, n, n); } -DEFINE_MATRIX_BINARY_OP(vTanh, b = 2.0 / (1.0 + std::exp(-2 * a)) - 1.0); -template +DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template void vTanh(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vTanh(), const_cast(a), r, 1, n, n, n); + binary::vTanh(), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); -template +template void vPow(const int n, const T* a, const T b, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vPow(b), const_cast(a), r, 1, n, n, n); + binary::vPow(b), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); -template +template void vAdd(const int n, const T* a, const T* b, T* r) { hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), - const_cast(a), const_cast(b), r, 1, n, n, n , n); + const_cast(a), + const_cast(b), + r, + 1, + n, + n, + n, + n); } template void vExp(const int n, const float* a, float* r); diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 43075977dc9cef1573cf6dd75d9ef577b07d337e..0741c456780e36c6b87dd44d89ffc601ac928f31 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -17,51 +17,78 @@ limitations under the License. */ #ifdef PADDLE_USE_MKL #include +#include #else extern "C" { #include } +#ifdef PADDLE_USE_ATLAS +extern "C" { +#include +} +#else +#include +#endif #endif #include namespace paddle { -template -void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, - const T alpha, const T* A, const int lda, - const T* B, const int ldb, - const T beta, T* C, const int ldc); - -template +template +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc); + +template +int getrf(const CBLAS_ORDER Order, + const int M, + const int N, + T* A, + const int lda, + int* ipiv); + +template +int getri( + const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv); + +template void axpy(const int n, const T alpha, const T* x, T* y); -template +template T dotProduct(const int n, const T* x, const T* y); -template +template void vExp(const int n, const T* a, T* r); -template +template void vPow(const int n, const T* a, const T b, T* r); -template +template void vLog(const int n, const T* a, T* r); -template +template void vAdd(const int n, const T* a, const T* b, T* r); -template +template void vInvSqrt(const int n, const T* a, T* r); -template +template void vLog1p(const int n, const T* a, T* r); -template +template void vTanh(const int n, const T* a, T* r); } // namespace paddle #endif // MATHFUNCTIONS_H_ - diff --git a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp index 5b78ab1b07bda0b28dd1688b6364ecf1882f4073..878e0b8723025e75f7838e981517f58a3dcb5424 100644 --- a/paddle/math/MathUtils.cpp +++ b/paddle/math/MathUtils.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "MathUtils.h" #include #include "paddle/utils/Logging.h" @@ -24,12 +23,8 @@ namespace paddle { * major is rows and minor is cols, according to * major value to initialize minor value" */ -void sparseRand(int* major, - int* minor, - int nnz, - int majorLen, - int minorMax, - bool useGpu) { +void sparseRand( + int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) { CHECK(size_t(nnz) > size_t(1)); int* cpuMajor; int* cpuMinor; @@ -62,7 +57,8 @@ void sparseRand(int* major, cpuMinor[j] = idx; used[idx] = 1; } - std::sort(cpuMinor + cpuMajor[i], cpuMinor + cpuMajor[i + 1], + std::sort(cpuMinor + cpuMajor[i], + cpuMinor + cpuMajor[i + 1], [](int a, int b) { return a < b; }); } /*memcpy result to gpu*/ @@ -72,5 +68,30 @@ void sparseRand(int* major, } } +int outputSize( + int imageSize, int filterSize, int padding, int stride, bool caffeMode) { + int outputSize; + if (!caffeMode) { + outputSize = + (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1; + } else { + outputSize = (imageSize - filterSize + 2 * padding) / stride + 1; + } + CHECK_GE(outputSize, 1); + return outputSize; +} + +int imageSize( + int outputSize, int filterSize, int padding, int stride, bool caffeMode) { + int imageSize; + if (!caffeMode) { + imageSize = + (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1; + } else { + imageSize = (outputSize - 1) * stride + filterSize - 2 * padding; + } + CHECK_GE(imageSize, 1); + return imageSize; +} } // namespace paddle diff --git a/paddle/math/MathUtils.h b/paddle/math/MathUtils.h index 83375022abbe268e22bbeb46e8e4b96a7198cb5f..907116c00281bfcf34c6652564f55a37c3f47a8c 100644 --- a/paddle/math/MathUtils.h +++ b/paddle/math/MathUtils.h @@ -41,7 +41,30 @@ namespace paddle { * * rows is [1, 3, 4, 0, 2, 4, 1, 2, 3, 4] */ -void sparseRand(int* major, int* minor, int nnz, int majorLen, int minorMax, - bool useGpu); +void sparseRand( + int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu); + +/** + * Calculate output size based on caffeMode_. + * - input(+padding): 0123456789 + * - imageSize(+padding) = 10; + * - filterSize = 3; + * - stride = 2; + * - caffeMode is true: + - output: (012), (234), (456), (678) + - outputSize = 4; + * - caffeMode is false: + * - output: (012), (234), (456), (678), (9) + * - outputSize = 5; + */ +int outputSize( + int imageSize, int filterSize, int padding, int stride, bool caffeMode); + +/** + * Calculate image size based on output size and caffeMode_. + * It is the reverse function of outputSize() + */ +int imageSize( + int outputSize, int filterSize, int padding, int stride, bool caffeMode); } // namespace paddle diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 78519ce7aa8742192eb15e5c4705572a7df5dbdc..b70b47a5fcc72edea8fa5a680c4af962ea0f4ae9 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -13,19 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "Matrix.h" +#include "MathFunctions.h" #include "SparseMatrix.h" #include "SparseRowMatrix.h" -#include "MathFunctions.h" -#include #include #include +#include -#include "paddle/utils/Logging.h" #include +#include "hl_cnn.h" #include "hl_gpu.h" #include "hl_table_apply.h" #include "hl_top_k.h" +#include "paddle/utils/Logging.h" #include "paddle/utils/ThreadLocal.h" @@ -39,63 +40,80 @@ inline real _square(real a) { return a * a; } inline real _safelog(real a) { return a > 0.0f ? 
std::log(a) : -40.0f; } -Matrix::Matrix(MemoryHandlePtr memHandle, size_t height, size_t width, - bool trans, bool use_gpu) +Matrix::Matrix(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans, + bool use_gpu) : BaseMatrix( - height, width, - memHandle ? (reinterpret_cast(memHandle->getBuf())) : nullptr, - trans, use_gpu) { + height, + width, + memHandle ? (reinterpret_cast(memHandle->getBuf())) : nullptr, + trans, + use_gpu) { elementCnt_ = width * height; memoryHandle_ = memHandle; } -Matrix::Matrix(real* data, size_t height, size_t width, bool trans, - bool use_gpu) +Matrix::Matrix( + real* data, size_t height, size_t width, bool trans, bool use_gpu) : BaseMatrix(height, width, data, trans, use_gpu) { elementCnt_ = width * height; } -Matrix::Matrix(real* data, size_t height, size_t width, size_t stride, - bool trans, bool use_gpu) +Matrix::Matrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool use_gpu) : BaseMatrix(height, width, stride, data, trans, use_gpu) { elementCnt_ = width * height; } -MatrixPtr Matrix::createSparseMatrix(real* data, int* row, int* col, - size_t height, size_t width, +MatrixPtr Matrix::createSparseMatrix(real* data, + int* row, + int* col, + size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType, /*value type*/ - SparseFormat format, bool trans, + SparseFormat format, + bool trans, bool useGpu) { if (useGpu) { - return std::make_shared(data, row, col, height, width, nnz, - valueType, format, trans); + return std::make_shared( + data, row, col, height, width, nnz, valueType, format, trans); } else { - return std::make_shared(data, row, col, height, width, nnz, - valueType, format, trans); + return std::make_shared( + data, row, col, height, width, nnz, valueType, format, trans); } } -MatrixPtr Matrix::createSparseMatrix(size_t height, size_t width, +MatrixPtr Matrix::createSparseMatrix(size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType, /*value type*/ - SparseFormat format, bool trans, + SparseFormat format, + bool trans, bool useGpu) { if (useGpu) { - return std::make_shared(height, width, nnz, valueType, - format, trans); + return std::make_shared( + height, width, nnz, valueType, format, trans); } else { - return std::make_shared(height, width, nnz, valueType, - format, trans); + return std::make_shared( + height, width, nnz, valueType, format, trans); } } -MatrixPtr Matrix::create(MemoryHandlePtr memHandle, size_t height, size_t width, +MatrixPtr Matrix::create(MemoryHandlePtr memHandle, + size_t height, + size_t width, bool trans) { if (auto gpuHandle = std::dynamic_pointer_cast(memHandle)) { return std::make_shared(gpuHandle, height, width, trans); } else if (auto cpuHandle = - std::dynamic_pointer_cast(memHandle)) { + std::dynamic_pointer_cast(memHandle)) { return std::make_shared(cpuHandle, height, width, trans); } else { LOG(FATAL) << "Wrong"; @@ -111,8 +129,8 @@ MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) { } } -MatrixPtr Matrix::create(real* data, size_t height, size_t width, bool trans, - bool useGpu) { +MatrixPtr Matrix::create( + real* data, size_t height, size_t width, bool trans, bool useGpu) { if (useGpu) { return std::make_shared(data, height, width, trans); } else { @@ -120,8 +138,12 @@ MatrixPtr Matrix::create(real* data, size_t height, size_t width, bool trans, } } -MatrixPtr Matrix::create(real* data, size_t height, size_t width, size_t stride, - bool trans, bool 
useGpu) { +MatrixPtr Matrix::create(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool useGpu) { if (useGpu) { return std::make_shared(data, height, width, stride, trans); } else { @@ -129,20 +151,23 @@ MatrixPtr Matrix::create(real* data, size_t height, size_t width, size_t stride, } } -MatrixPtr Matrix::createSparseMatrix(size_t height, size_t width, size_t nnz, - SparseValueType valueType, bool trans, +MatrixPtr Matrix::createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + bool trans, bool useGpu) { if (useGpu) { - return std::make_shared(height, width, nnz, valueType, - SPARSE_CSR, trans); + return std::make_shared( + height, width, nnz, valueType, SPARSE_CSR, trans); } else { - return std::make_shared(height, width, nnz, valueType, - SPARSE_CSR, trans); + return std::make_shared( + height, width, nnz, valueType, SPARSE_CSR, trans); } } -void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width, - bool trans, bool useGpu) { +void Matrix::resizeOrCreate( + MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) { if (!matrix) { matrix = Matrix::create(height, width, trans, useGpu); } else { @@ -151,14 +176,17 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width, } } -void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height, - size_t width, size_t nnz, +void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, + size_t height, + size_t width, + size_t nnz, SparseValueType valueType, - SparseFormat format, bool trans, + SparseFormat format, + bool trans, bool useGpu) { if (!matrix) { - matrix = Matrix::createSparseMatrix(height, width, nnz, valueType, format, - trans, useGpu); + matrix = Matrix::createSparseMatrix( + height, width, nnz, valueType, format, trans, useGpu); } else { CHECK(dynamic_cast(matrix.get()) || dynamic_cast(matrix.get())); @@ -175,7 +203,9 @@ void Matrix::reshape(size_t height, size_t width) { stride_ = width_; } -MatrixPtr Matrix::subMatrix(size_t startRow, size_t endRow, size_t startCol, +MatrixPtr Matrix::subMatrix(size_t startRow, + size_t endRow, + size_t startCol, size_t endCol) { CHECK_LE(startRow, endRow); CHECK_LE(endRow, getHeight()); @@ -183,13 +213,28 @@ MatrixPtr Matrix::subMatrix(size_t startRow, size_t endRow, size_t startCol, CHECK_LE(endCol, getWidth()); return Matrix::create(getData() + startRow * getStride() + startCol, - endRow - startRow, endCol - startCol, getStride(), - trans_, useGpu_); + endRow - startRow, + endCol - startCol, + getStride(), + trans_, + useGpu_); +} + +void Matrix::setDiag(real value) { + CHECK(data_ != NULL); + CHECK_EQ(height_, width_); + + zeroMem(); + BaseMatrix diag(height_, 1, stride_ + 1, data_, false, useGpu_); + diag.assign(value); } GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans) : Matrix(std::make_shared(height * width * sizeof(real)), - height, width, trans, true) {} + height, + width, + trans, + true) {} GpuMatrix::~GpuMatrix() {} @@ -202,6 +247,7 @@ void GpuMatrix::resetOne() { CHECK(data_ != NULL); one(); } + void GpuMatrix::resize(size_t newHeight, size_t newWidth) { size_t newSize = newHeight * newWidth; if (NULL == memoryHandle_.get() || @@ -231,7 +277,7 @@ real GpuMatrix::getSum() { void GpuMatrix::accumulateColSum(Matrix& src) { CHECK_EQ(getWidth(), src.getWidth()); CHECK_EQ(getHeight(), (size_t)1); - sumCols(src, 1.0); + sumCols(src, 1.0, 1.0); } real GpuMatrix::getAbsSum() { @@ -247,11 +293,11 @@ void GpuMatrix::copyFrom(const Matrix& src) { 
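// The Matrix::setDiag() added above fills the main diagonal by wrapping it in
// a height x 1 BaseMatrix view whose row stride is stride_ + 1, so a single
// assign(value) touches exactly the diagonal entries. A minimal free-standing
// illustration of the same indexing trick on a raw row-major buffer
// (setDiagonal is an illustrative helper, not part of this commit):
#include <cstddef>
void setDiagonal(float* data, size_t n, size_t stride, float value) {
  // Element (i, i) of a row-major n x n matrix with leading dimension
  // `stride` lives at data[i * stride + i], so consecutive diagonal
  // entries are exactly stride + 1 apart.
  for (size_t i = 0; i < n; ++i) {
    data[i * (stride + 1)] = value;
  }
}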
CHECK(elementCnt_ == src.getElementCnt()); if (typeid(src) == typeid(CpuMatrix)) { - hl_memcpy_host2device(data_, const_cast(src.getData()), - sizeof(real) * elementCnt_); + hl_memcpy_host2device( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); } else if (typeid(src) == typeid(GpuMatrix)) { - hl_memcpy_device2device(data_, const_cast(src.getData()), - sizeof(real) * elementCnt_); + hl_memcpy_device2device( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); } else { LOG(FATAL) << "Wrong"; } @@ -261,8 +307,10 @@ void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { CHECK(isContiguous()); CHECK(src.isContiguous()); CHECK(elementCnt_ == src.getElementCnt()); - hl_memcpy_async(this->getData(), const_cast(src.getData()), - sizeof(real) * elementCnt_, stream); + hl_memcpy_async(this->getData(), + const_cast(src.getData()), + sizeof(real) * elementCnt_, + stream); } void GpuMatrix::copyFrom(const real* hostSrc, size_t size) { @@ -313,7 +361,9 @@ MatrixPtr GpuMatrix::getTranspose() { if (memoryHandle_.get() != NULL) { MatrixPtr copy_T( new GpuMatrix(std::dynamic_pointer_cast(memoryHandle_), - height_, width_, true)); + height_, + width_, + true)); return copy_T; } else { MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true)); @@ -335,25 +385,62 @@ void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc); } +MatrixPtr GpuMatrix::getInverse() { + MatrixPtr matInv; + inverse(matInv, true); + return matInv; +} + +void GpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) { + CHECK_EQ(height_, width_); + + if (memAlloc) { + matInv = std::make_shared(height_, width_); + } else { + CHECK(matInv != NULL); + } + + real* data = getData(); + real* dataInv = matInv->getData(); + int lda = getStride(); + int ldc = matInv->getStride(); + + hl_matrix_inverse(data, dataInv, height_, lda, ldc); +} + void GpuMatrix::addBias(Matrix& b, real scale) { CHECK(b.getHeight() == 1) << "the Bias should be a vector"; BaseMatrix::addBias(b, scale); } +void GpuMatrix::addSharedBias(Matrix& b, real scale) { + CHECK(b.getHeight() == 1) << "the Bias should be a vector"; + CHECK_LE(b.getWidth(), getWidth()); + CHECK_EQ(getWidth() % b.getWidth(), 0UL); + hl_matrix_add_shared_bias( + getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale); +} + void GpuMatrix::collectBias(Matrix& a, real scale) { CHECK_EQ(getHeight(), (size_t)1); CHECK_EQ(width_, a.getWidth()); - GpuSparseMatrix* sMatPtr = dynamic_cast(&a); + GpuSparseMatrix* sMatPtr = dynamic_cast(&a); if (!sMatPtr) { - sumCols(a, scale); + sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); } else { real* data = getData(); hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get(); - hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), - width_, scale); + hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale); } } +void GpuMatrix::collectSharedBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + CHECK_EQ(a.getWidth() % getWidth(), 0UL); + hl_matrix_collect_shared_bias( + getData(), a.getData(), getWidth(), a.getHeight(), a.getWidth(), scale); +} + void GpuMatrix::sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode) { @@ -369,7 +456,9 @@ void GpuMatrix::sequenceAvgForward(Matrix& a, } /* this = scaleAB*(a*b) + scaleT*this */ -void GpuMatrix::mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, +void GpuMatrix::mul(const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, real scaleT) { 
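// The mul() overloads here all implement the contract stated in the comment
// just above: this = scaleAB * (a * b) + scaleT * this, dispatching to
// hl_matrix_mul or cblas_{s,d}gemm. A naive reference loop for the dense,
// non-transposed, row-major case (mulReference is illustrative only; the
// member functions additionally validate dimensions and strides):
void mulReference(const float* A, const float* B, float* C,
                  int M, int N, int K, float scaleAB, float scaleT) {
  for (int i = 0; i < M; ++i) {      // rows of A and C
    for (int j = 0; j < N; ++j) {    // columns of B and C
      float dot = 0.0f;
      for (int k = 0; k < K; ++k) {  // shared inner dimension
        dot += A[i * K + k] * B[k * N + j];
      }
      C[i * N + j] = scaleAB * dot + scaleT * C[i * N + j];
    }
  }
}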
CHECK(!isTransposed()) << "Not supported"; @@ -401,11 +490,24 @@ void GpuMatrix::mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T; hl_trans_op_t transb = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T; - hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN, dimK, - scaleAB, scaleT, lda, ldb, ldc); -} - -void GpuMatrix::mul(const GpuSparseMatrix& a, const GpuMatrix& b, real scaleAB, + hl_matrix_mul(A_d, + transa, + B_d, + transb, + C_d, + dimM, + dimN, + dimK, + scaleAB, + scaleT, + lda, + ldb, + ldc); +} + +void GpuMatrix::mul(const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, real scaleT) { CHECK(isContiguous()); CHECK(b.isContiguous()); @@ -423,11 +525,21 @@ void GpuMatrix::mul(const GpuSparseMatrix& a, const GpuMatrix& b, real scaleAB, hl_sparse_matrix_s A_d = a.sMatrix_.get(); real* B_d = b.data_; real* C_d = data_; - hl_matrix_csr_mul_dense(A_d, transA, B_d, HPPL_OP_N, C_d, height_, - width_, b.height_, scaleAB, scaleT); -} - -void GpuMatrix::mul(const GpuMatrix& a, const GpuSparseMatrix& b, real scaleAB, + hl_matrix_csr_mul_dense(A_d, + transA, + B_d, + HPPL_OP_N, + C_d, + height_, + width_, + b.height_, + scaleAB, + scaleT); +} + +void GpuMatrix::mul(const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, real scaleT) { CHECK(isContiguous()); CHECK(a.isContiguous()); @@ -445,11 +557,27 @@ void GpuMatrix::mul(const GpuMatrix& a, const GpuSparseMatrix& b, real scaleAB, << "Matrix dimensions are not equal"; } if (b.format_ == SPARSE_CSC) { - hl_matrix_dense_mul_csc(A_d, HPPL_OP_N, B_d, transB, C_d, height_, - width_, a.width_, scaleAB, scaleT); + hl_matrix_dense_mul_csc(A_d, + HPPL_OP_N, + B_d, + transB, + C_d, + height_, + width_, + a.width_, + scaleAB, + scaleT); } else { - hl_matrix_dense_mul_csr(A_d, HPPL_OP_N, B_d, transB, C_d, height_, - width_, a.width_, scaleAB, scaleT); + hl_matrix_dense_mul_csr(A_d, + HPPL_OP_N, + B_d, + transB, + C_d, + height_, + width_, + a.width_, + scaleAB, + scaleT); } } @@ -458,7 +586,9 @@ void GpuMatrix::mul(const MatrixPtr a, const MatrixPtr b) { mul(a, b, 1.0, 0.0); } -void GpuMatrix::mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, +void GpuMatrix::mul(const MatrixPtr a, + const MatrixPtr b, + real scaleAB, real scaleT) { GpuMatrixPtr a_ptr = std::dynamic_pointer_cast(a); GpuMatrixPtr b_ptr = std::dynamic_pointer_cast(b); @@ -511,8 +641,14 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) { size_t tableSize = table.getHeight(); int* index = ids.getData(); - hl_matrix_select_rows(a, stride_, table.getData(), table.stride_, - index, numSamples, tableSize, dim); + hl_matrix_select_rows(a, + stride_, + table.getData(), + table.stride_, + index, + numSamples, + tableSize, + dim); #endif } @@ -529,15 +665,21 @@ void GpuMatrix::addToRows(Matrix& table, IVector& ids) { size_t tableSize = table.getHeight(); int* index = ids.getData(); - hl_matrix_add_to_rows(table.getData(), table.stride_, a, stride_, - index, numSamples, tableSize, dim); + hl_matrix_add_to_rows(table.getData(), + table.stride_, + a, + stride_, + index, + numSamples, + tableSize, + dim); #endif } void GpuMatrix::colMerge(Matrix& src) { CHECK(src.height_ == height_); if (!trans_ && !src.trans_) { - sumRows(src); + sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0); } else { LOG(FATAL) << "Is not supported"; } @@ -547,7 +689,7 @@ void GpuMatrix::rowSum(Matrix& sum) { CHECK_EQ(sum.getHeight(), getHeight()); CHECK_EQ(sum.getWidth(), (size_t)1); - sum.sumRows(*this); + sum.sumRows(*this, 
/* scaleSum= */ 1, /* scaleDest= */ 0); } void GpuMatrix::rowMax(Matrix& max) { @@ -583,6 +725,46 @@ void GpuMatrix::colMax(Matrix& max) { max.maxCols(*this); } +void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "Is not supported"; +} + +void GpuMatrix::maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + const real* input = a.getData(); + real* output = getData(); + int* idForGpu = id.getData(); + + hl_maxout_forward( + input, output, idForGpu, batchSize, size, size / channels, groups); +} + +void GpuMatrix::maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + real* input = getData(); + const real* output = a.getData(); + const int* idForGpu = id.getData(); + + hl_maxout_backward( + input, output, idForGpu, batchSize, size, size / channels, groups); +} + /*calculate the error of classification */ void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { GpuMatrixPtr output_ptr = std::dynamic_pointer_cast(output); @@ -596,8 +778,8 @@ void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { real* recResult_d = data_; int* label_d = label_ptr->getData(); - hl_matrix_classification_error(output_d, label_d, recResult_d, - height_, output_ptr->width_); + hl_matrix_classification_error( + output_d, label_d, recResult_d, height_, output_ptr->width_); } /* copy -log(output[i * width + label]) to this->data[i] */ @@ -634,13 +816,15 @@ void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_); } -void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, +void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, real alpha) { LOG(FATAL) << "Not implemented"; } void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, real alpha) { + IVector& label, + real alpha) { LOG(FATAL) << "Not implemented"; } @@ -666,8 +850,7 @@ void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { real* outputData = output.getData(); auto starts = index.getData(); int numSequences = index.getSize() - 1; - hl_sequence_softmax_forward(inputData, outputData, - starts, numSequences); + hl_sequence_softmax_forward(inputData, outputData, starts, numSequences); } void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { @@ -681,8 +864,7 @@ void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { real* output_d = output.data_; real* sftmaxSum_d = sftmaxSum.data_; real* grad_d = data_; - hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, - width_); + hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_); } void GpuMatrix::softmaxBackward(Matrix& outputV) { @@ -709,7 +891,10 @@ void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { LOG(FATAL) << "not supported: GpuSparseMatrix as label"; } - BaseMatrix::sumOfSquares(output, label); + BaseMatrix::sumOfSquaredDiffs(output, + label, + /* scaleSum= */ 1, + /* scaleDest= */ 1); } void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) { @@ -733,7 +918,7 @@ void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
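// The maxout pair above reduces groups of feature channels to their
// element-wise maximum and records the winning offsets in `id` so that
// maxoutBackward can route gradients back. A simplified single-sample sketch,
// assuming the flattest possible layout in which the `groups` candidates for
// one output unit are contiguous (the hl_maxout_* kernels instead group whole
// channels, per their size / channels argument; maxoutRow is illustrative):
#include <cstddef>
void maxoutRow(const float* in, float* out, int* argmax,
               size_t outSize, size_t groups) {
  for (size_t u = 0; u < outSize; ++u) {
    size_t best = u * groups;
    for (size_t g = 1; g < groups; ++g) {
      if (in[u * groups + g] > in[best]) {
        best = u * groups + g;
      }
    }
    out[u] = in[best];                   // forward: keep only the max
    argmax[u] = static_cast<int>(best);  // remembered for the backward pass
  }
}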
} void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) { CHECK(output1.useGpu_ == true && output2.useGpu_ == true) - << "Matrix type are not equal"; + << "Matrix type are not equal"; size_t numSamples = getHeight(); size_t dim = output1.getWidth(); CHECK_EQ(getWidth(), 1UL); @@ -742,15 +927,18 @@ void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) { real* out = getData(); real* x = output1.getData(); real* y = output2.getData(); - hl_cossim(out, x, y, - dim, output1.getHeight(), output2.getHeight(), scale); -} -void GpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, - Matrix& prevGrad2, real scale) { + hl_cossim(out, x, y, dim, output1.getHeight(), output2.getHeight(), scale); +} +void GpuMatrix::cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale) { CHECK(output.useGpu_ == true && prevOut1.useGpu_ == true && prevOut2.useGpu_ == true && prevGrad1.useGpu_ == true && - prevGrad2.useGpu_ == true) << "Matrix type are not equal"; + prevGrad2.useGpu_ == true) + << "Matrix type are not equal"; CHECK_EQ(getWidth(), 1UL); CHECK_EQ(output.getWidth(), 1UL); @@ -770,9 +958,16 @@ void GpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, real* prevOutY = prevOut2.getData(); real* prevGradX = prevGrad1.getData(); real* prevGradY = prevGrad2.getData(); - hl_cossim_derivative(grad, out, prevOutX, prevOutY, - prevGradX, prevGradY, dim, - prevOut1.getHeight(), prevOut2.getHeight(), scale); + hl_cossim_derivative(grad, + out, + prevOutX, + prevOutY, + prevGradX, + prevGradY, + dim, + prevOut1.getHeight(), + prevOut2.getHeight(), + scale); } void GpuMatrix::randomizeUniform() { @@ -821,10 +1016,18 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { LOG(INFO) << "the diffCnt is " << diffCnt; } -void GpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW) { +void GpuMatrix::convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW) { CHECK(feature.useGpu_ == true) << "Matrix type are not equal"; CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), @@ -834,17 +1037,35 @@ void GpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, size_t elemCnt = outputH * outputW * blockH * blockW * channels; CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - hl_expand_feature2col(feature.getData(), channels, feaImgHeight, - feaImgWidth, blockH, blockW, strideH, strideW, - paddingH, paddingW, outputH, outputW, + hl_expand_feature2col(feature.getData(), + channels, + feaImgHeight, + feaImgWidth, + blockH, + blockW, + strideH, + strideW, + paddingH, + paddingW, + outputH, + outputW, getData()); } -void GpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, - int thisImgWidth, int channels, int blockH, - int blockW, int strideH, int strideW, int paddingH, - int paddingW, int outputH, int outputW, - real alpha, real beta) { +void GpuMatrix::convShrink(Matrix& expandFeat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha, + real beta) { CHECK(expandFeat.useGpu_ == true) << 
"Matrix type are not equal"; CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), getHeight() * getWidth()) @@ -853,18 +1074,35 @@ void GpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, size_t elemCnt = outputH * outputW * blockW * blockH * channels; CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) << "Matrix dimensions are not equal"; - hl_shrink_col2feature( - expandFeat.getData(), channels, thisImgHeight, thisImgWidth, blockH, - blockW, strideH, strideW, paddingH, paddingW, outputH, outputW, - getData(), alpha, beta); -} - -void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + hl_shrink_col2feature(expandFeat.getData(), + channels, + thisImgHeight, + thisImgWidth, + blockH, + blockW, + strideH, + strideW, + paddingH, + paddingW, + outputH, + outputW, + getData(), + alpha, + beta); +} + +void GpuMatrix::maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; real* inputData = inputMat.getData(); @@ -875,18 +1113,38 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputH * outputW * channels); - hl_maxpool_forward(frameNum, inputData, channels, height, width, - outputH, outputW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, data_); -} - -void GpuMatrix::maxPoolBackward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, Matrix& outGrad, Matrix& outV, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { + hl_maxpool_forward(frameNum, + inputData, + channels, + height, + width, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + data_, + getStride()); +} + +void GpuMatrix::maxPoolBackward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true && outV.useGpu_ == true) << "Matrix type are not equal"; @@ -904,19 +1162,39 @@ void GpuMatrix::maxPoolBackward(Matrix& inputMat, size_t imgSizeH, CHECK(outGrad.getHeight() == outV.getHeight() && outGrad.getWidth() == outV.getWidth()); - - hl_maxpool_backward(frameNum, inputData, outData, outDiff, channels, - height, width, outputH, outputW, sizeX, sizeY, - strideH, strideW, paddingH, paddingW, - scaleTargets, scaleOutput, data_); -} - -void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + hl_maxpool_backward(frameNum, + inputData, + outData, + outDiff, + channels, + height, + width, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, + outGrad.getStride()); +} + +void GpuMatrix::avgPoolForward(Matrix& inputMat, + size_t imgSizeH, + 
size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; real* inputData = inputMat.getData(); @@ -927,18 +1205,36 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t imgSizeH, CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputH * outputW * channels); - hl_avgpool_forward(frameNum, inputData, channels, height, width, - outputH, outputW, sizeX, sizeY, - strideH, strideW, - paddingH, paddingW, data_); -} - -void GpuMatrix::avgPoolBackward(Matrix& outGrad, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { + hl_avgpool_forward(frameNum, + inputData, + channels, + height, + width, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + data_, + getStride()); +} + +void GpuMatrix::avgPoolBackward(Matrix& outGrad, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal"; real* outDiff = outGrad.getData(); @@ -950,16 +1246,32 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, size_t imgSizeH, CHECK(height_ == outGrad.getHeight()); CHECK(outGrad.getWidth() == outputH * outputW * channels); - hl_avgpool_backward(frameNum, outDiff, channels, height, width, - outputH, outputW, sizeX, sizeY, - strideH, strideW, paddingH, paddingW, - scaleTargets, scaleOutput, - data_); -} - -void GpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, - size_t imgSizeW, Matrix& denoms, - size_t channels, size_t sizeX, float scale, + hl_avgpool_backward(frameNum, + outDiff, + channels, + height, + width, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, + outGrad.getStride()); +} + +void GpuMatrix::crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, float pow) { size_t num = input.getHeight(); size_t height = imgSizeH; @@ -969,14 +1281,27 @@ void GpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, CHECK(denoms.getHeight() == input.getHeight() && denoms.getWidth() == input.getWidth() && input.getHeight() == height_ && input.getWidth() == width_); - hl_CMRNorm_forward(num, input.getData(), denoms.getData(), data_, - channels, height, width, sizeX, scale, -pow); -} - -void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, - Matrix& preOutV, Matrix& localOutV, - size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, float scale, + hl_CMRNorm_forward(num, + input.getData(), + denoms.getData(), + data_, + channels, + height, + width, + sizeX, + scale, + -pow); +} + +void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + float scale, float pow) { size_t num = preOutV.getHeight(); size_t height = imgSizeH; @@ -989,9 +1314,17 @@ void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, CHECK(denoms.getHeight() == localGrad.getHeight() && denoms.getWidth() == 
localGrad.getWidth()); - hl_CMRNorm_backward(num, preOutV.getData(), denoms.getData(), - localOutV.getData(), localGrad.getData(), data_, - channels, height, width, sizeX, -pow, + hl_CMRNorm_backward(num, + preOutV.getData(), + denoms.getData(), + localOutV.getData(), + localGrad.getData(), + data_, + channels, + height, + width, + sizeX, + -pow, 2.0f * pow * scale); } @@ -1013,8 +1346,8 @@ void GpuMatrix::maxSequenceForward(Matrix& input, CHECK_EQ(numSequences, sequence.getSize() - 1); CHECK_EQ(numSequences * dim, index.getSize()); - hl_max_sequence_forward(inputData, starts, outData, maxIndex, - numSequences, dim); + hl_max_sequence_forward( + inputData, starts, outData, maxIndex, numSequences, dim); } void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, @@ -1037,10 +1370,13 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim); } -void GpuMatrix::contextProjectionForward(MatrixPtr input, MatrixPtr weight, +void GpuMatrix::contextProjectionForward(MatrixPtr input, + MatrixPtr weight, const IVector& sequence, - int contextLength, int contextStart, - size_t beginPad, bool isPadding) { + int contextLength, + int contextStart, + size_t beginPad, + bool isPadding) { CHECK(dynamic_cast(input.get())); CHECK(dynamic_cast(&sequence)); if (weight) CHECK(dynamic_cast(weight.get())); @@ -1054,9 +1390,16 @@ void GpuMatrix::contextProjectionForward(MatrixPtr input, MatrixPtr weight, real* inputData = input->getData(); const int* starts = sequence.getData(); - hl_context_projection_forward( - inputData, starts, isPadding ? weight->getData() : NULL, outData, - numSequences, inputDim, contextLength, contextStart, beginPad, isPadding); + hl_context_projection_forward(inputData, + starts, + isPadding ? 
weight->getData() : NULL, + outData, + numSequences, + inputDim, + contextLength, + contextStart, + beginPad, + isPadding); } void GpuMatrix::contextProjectionBackwardData(MatrixPtr inputGrad, @@ -1075,15 +1418,20 @@ void GpuMatrix::contextProjectionBackwardData(MatrixPtr inputGrad, real* inGrad = inputGrad->getData(); const int* starts = sequence.getData(); - hl_context_projection_backward_data(outGrad, starts, inGrad, - numSequences, inputDim, - contextLength, contextStart); + hl_context_projection_backward_data(outGrad, + starts, + inGrad, + numSequences, + inputDim, + contextLength, + contextStart); } void GpuMatrix::contextProjectionBackwardWeight(MatrixPtr weightGrad, const IVector& sequence, int contextLength, - int contextStart, int totalPad, + int contextStart, + int totalPad, size_t beginPad) { CHECK(dynamic_cast(weightGrad.get())); CHECK(dynamic_cast(&sequence)); @@ -1097,9 +1445,15 @@ void GpuMatrix::contextProjectionBackwardWeight(MatrixPtr weightGrad, real* wtGrad = weightGrad->getData(); const int* starts = sequence.getData(); - hl_context_projection_backward_weight( - outGrad, starts, wtGrad, numSequences, weightDim, totalPad, contextLength, - contextStart, beginPad); + hl_context_projection_backward_weight(outGrad, + starts, + wtGrad, + numSequences, + weightDim, + totalPad, + contextLength, + contextStart, + beginPad); } void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { @@ -1111,8 +1465,7 @@ void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { size_t numSamples = data.getHeight(); size_t partial_sum = numElements / (W.getHeight() * W.getWidth()); real* output = getData(); - hl_param_relu_forward(output, input, w, numElements, numSamples, - partial_sum); + hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum); } void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { @@ -1124,8 +1477,8 @@ void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t partial_sum = numElements / (this->getHeight() * this->getWidth()); - hl_param_relu_backward_w(wgrad, ograd, input, - numElements, numSamples, partial_sum); + hl_param_relu_backward_w( + wgrad, ograd, input, numElements, numSamples, partial_sum); } void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { @@ -1136,21 +1489,134 @@ void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t partial_sum = numElements / (W.getHeight() * W.getWidth()); - hl_param_relu_backward_diff(ograd, input, w, diff, - numElements, numSamples, partial_sum); + hl_param_relu_backward_diff( + ograd, input, w, diff, numElements, numSamples, partial_sum); } void GpuMatrix::addColumnVector(const Matrix& b) { BaseMatrix::addColVector(const_cast(b)); } +void GpuMatrix::bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&in)); + + const size_t outputW = getWidth(); + const size_t outputH = getHeight(); + const size_t inputW = in.getWidth(); + const size_t inputH = in.getHeight(); + + real* outData = getData(); + const real* inData = in.getData(); + + if (inImgH == outImgH && inImgW == outImgW) { + this->copyFrom(in); + } else { + hl_bilinear_forward(inData, + inImgH, + inImgW, + inputH, + inputW, + outData, + outImgH, + outImgW,
+ outputH, + outputW, + numChannels, + ratioH, + ratioW); + } +} + +void GpuMatrix::bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&out)); + + const size_t inputW = getWidth(); + const size_t inputH = getHeight(); + const size_t outputW = out.getWidth(); + const size_t outputH = out.getHeight(); + + real* inGrad = getData(); + const real* outGrad = out.getData(); + + if (outImgH == inImgH && outImgW == inImgW) { + this->add(const_cast(out)); + } else { + hl_bilinear_backward(inGrad, + inImgH, + inImgW, + inputH, + inputW, + outGrad, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); + } +} + +void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { + GpuMatrix* outputPtr = dynamic_cast(&output); + auto labelPtr = dynamic_cast(&label); + + CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; + CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; + CHECK(height_ == outputPtr->height_ && width_ == 1 && + outputPtr->width_ == labelPtr->getWidth() && + outputPtr->height_ == labelPtr->getHeight()) + << "Matrix dimensions are not equal"; + + real* output_d = outputPtr->data_; + real* entropy_d = data_; + hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); + hl_matrix_multi_binary_cross_entropy( + output_d, entropy_d, mat_d, height_, outputPtr->width_); +} + +void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { + GpuMatrix* outputPtr = dynamic_cast(&output); + auto labelPtr = dynamic_cast(&label); + + CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; + CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; + CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ && + outputPtr->width_ == labelPtr->getWidth() && + outputPtr->height_ == labelPtr->getHeight()) + << "Matrix dimensions are not equal"; + + real* output_d = outputPtr->data_; + real* grad_d = data_; + hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); + hl_matrix_multi_binary_cross_entropy_bp( + output_d, grad_d, mat_d, height_, width_); +} + /** * CpuMatrix */ CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans) : Matrix(std::make_shared(height * width * sizeof(real)), - height, width, trans, false) {} + height, + width, + trans, + false) {} CpuMatrix::~CpuMatrix() {} @@ -1172,8 +1638,8 @@ void CpuMatrix::copyFrom(const Matrix& src) { if (typeid(src) == typeid(GpuMatrix)) { CHECK(src.isContiguous()); CHECK(elementCnt_ == src.getElementCnt()); - hl_memcpy_device2host(data_, const_cast(src.getData()), - sizeof(real) * elementCnt_); + hl_memcpy_device2host( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); } else if (typeid(src) == typeid(CpuMatrix) || typeid(src) == typeid(SharedCpuMatrix)) { CHECK(src.isContiguous()); @@ -1238,8 +1704,10 @@ void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { CHECK(src.isContiguous()); CHECK(elementCnt_ == src.getElementCnt()); if (typeid(src) == typeid(GpuMatrix)) { - hl_memcpy_async(this->getData(), const_cast(src.getData()), - sizeof(real) * elementCnt_, stream); + hl_memcpy_async(this->getData(), + const_cast(src.getData()), + sizeof(real) * elementCnt_, + stream); } else if (typeid(src) == typeid(CpuMatrix)) { memcpy(data_, src.getData(), sizeof(real) * elementCnt_); } else { @@ -1341,7 +1809,7 @@ void CpuMatrix::accumulateColSum(Matrix& src) { 
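// bilinearForward/bilinearBackward above delegate to the hl_bilinear_*
// kernels, which map each output pixel (i, j) back to fractional source
// coordinates (ratioH * i, ratioW * j) and blend the four neighboring input
// pixels. A single-channel CPU sketch of the forward interpolation
// (bilinearResize is illustrative, not the CUDA kernel; edges are clamped):
#include <cstddef>
void bilinearResize(const float* in, size_t inH, size_t inW,
                    float* out, size_t outH, size_t outW,
                    float ratioH, float ratioW) {
  for (size_t i = 0; i < outH; ++i) {
    size_t y = static_cast<size_t>(ratioH * i);   // top source row
    size_t y1 = (y + 1 < inH) ? y + 1 : y;        // clamped bottom row
    float wy = ratioH * i - y;                    // vertical blend weight
    for (size_t j = 0; j < outW; ++j) {
      size_t x = static_cast<size_t>(ratioW * j);
      size_t x1 = (x + 1 < inW) ? x + 1 : x;
      float wx = ratioW * j - x;
      out[i * outW + j] =
          (1 - wy) * ((1 - wx) * in[y * inW + x] + wx * in[y * inW + x1]) +
          wy * ((1 - wx) * in[y1 * inW + x] + wx * in[y1 * inW + x1]);
    }
  }
}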
CHECK_EQ(getWidth(), src.getWidth()); CHECK_EQ(getHeight(), (size_t)1); - sumCols(src, 1.0); + sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1); } real CpuMatrix::getAbsSum() { @@ -1358,8 +1826,10 @@ real CpuMatrix::getAbsSum() { MatrixPtr CpuMatrix::getTranspose() { if (memoryHandle_.get() != NULL) { return std::make_shared( - std::dynamic_pointer_cast(memoryHandle_), height_, - width_, true); + std::dynamic_pointer_cast(memoryHandle_), + height_, + width_, + true); } else { MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true)); return copy_T; @@ -1384,10 +1854,58 @@ void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { } } -void CpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW) { +MatrixPtr CpuMatrix::getInverse() { + MatrixPtr matInv; + inverse(matInv, true); + return matInv; +} + +void CpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) { + CHECK_EQ(height_, width_); + + if (memAlloc) { + matInv = std::make_shared(height_, width_); + } else { + CHECK(matInv != NULL); + } + + CHECK_EQ(height_, matInv->getHeight()); + CHECK_EQ(width_, matInv->getWidth()); + matInv->copyFrom(*this); + + real* data = getData(); + real* dataInv = matInv->getData(); + int ldc = matInv->getStride(); + + if (height_ == 1) { + CHECK_NE(*data, 0); + *dataInv = 1.0 / (*data); + return; + } + + /* Compute the LU decomposition of the matrix */ + std::vector ipiv(height_); + CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor); + int info = getrf(order, height_, height_, dataInv, ldc, ipiv.data()); + CHECK_EQ(info, 0); + + /* Compute the inverse of the matrix given its LU decomposition */ + info = getri(order, height_, dataInv, ldc, ipiv.data()); + CHECK_EQ(info, 0); +} + +void CpuMatrix::convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW) { CHECK(feature.useGpu_ == false) << "Matrix type are not equal"; CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), @@ -1424,11 +1942,20 @@ void CpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, } } -void CpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, - int thisImgWidth, int channels, int blockH, - int blockW, int strideH, int strideW, int paddingH, - int paddingW, int outputH, int outputW, - real alpha, real beta) { +void CpuMatrix::convShrink(Matrix& expandFeat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha, + real beta) { CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal"; CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), getHeight() * getWidth()) @@ -1464,12 +1991,18 @@ void CpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, } } -void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { +void CpuMatrix::maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { real* inputData
= inputMat.getData(); real* outData = data_; size_t num = inputMat.getHeight(); @@ -1477,15 +2010,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t inHeight = imgSizeH; CHECK(inHeight * inWidth == inputMat.getWidth() / channels); CHECK_EQ(num, this->getHeight()); - CHECK_EQ(channels*outputH*outputW, this->getWidth()); + CHECK_EQ(channels * outputH * outputW, this->getWidth()); + size_t outStride = getStride(); /* initialize the data_ */ - for (size_t i = 0; i < height_ * width_; i++) { - outData[i] = -(real)FLT_MAX; + for (size_t i = 0; i < height_; i++) { + for (size_t j = 0; j < width_; j++) { + outData[i * outStride + j] = -(real)FLT_MAX; + } } /* pool max one by one */ - for (size_t n = 0; n < num; ++n) { // frame by frame + for (size_t n = 0; n < num; ++n) { // frame by frame + if (!isContiguous()) { + outData = data_ + n * outStride; + } for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { @@ -1510,12 +2049,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, } } -void CpuMatrix::maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, Matrix& outV, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { +void CpuMatrix::maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { size_t num = image.getHeight(); size_t channels = size_t(width_ / imgSizeH / imgSizeW); CHECK(image.getWidth() == imgSizeH * imgSizeW * channels); @@ -1527,7 +2075,16 @@ void CpuMatrix::maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, real* inData = image.getData(); real* otData = outV.getData(); real* otGrad = outGrad.getData(); + + size_t outStride = outV.getStride(); + real* origOutData = otData; + real* origOutGrad = otGrad; + for (size_t n = 0; n < num; ++n) { + if (!outV.isContiguous()) { + otData = origOutData + n * outStride; + otGrad = origOutGrad + n * outStride; + } for (size_t c = 0; c < channels; ++c) { for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { @@ -1556,11 +2113,18 @@ void CpuMatrix::maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, } } -void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { +void CpuMatrix::avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { // The main loop size_t num = input.getHeight(); size_t inHeight = imgSizeH; @@ -1571,6 +2135,9 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, real* inData = input.getData(); for (size_t n = 0; n < num; ++n) { + if (!isContiguous()) { + tgtData = data_ + n * getStride(); + } for (size_t c = 0; c < channels; ++c) { for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { @@ -1601,12 +2168,19 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t 
imgSizeH, size_t imgSizeW, } } -void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { +void CpuMatrix::avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { size_t num = input.getHeight(); size_t channels = input.getWidth() / outputH / outputW; CHECK(imgSizeH * imgSizeW * channels == getWidth()); @@ -1614,6 +2188,9 @@ void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, real* outData = getData(); for (size_t n = 0; n < num; ++n) { + if (!input.isContiguous()) { + inData = input.getData() + n * input.getStride(); + } for (size_t c = 0; c < channels; ++c) { for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { @@ -1642,9 +2219,13 @@ void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, } } -void CpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, - size_t imgSizeW, Matrix& denoms, - size_t channels, size_t sizeX, float scale, +void CpuMatrix::crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, float pow) { size_t num = input.getHeight(); size_t height = imgSizeH; @@ -1694,10 +2275,15 @@ void CpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, integralData = NULL; } -void CpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, - Matrix& preOutV, Matrix& localOutV, - size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t size, float scale, +void CpuMatrix::crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t size, + float scale, float pow) { LOG(FATAL) << "Not implemented"; @@ -1785,10 +2371,13 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, } } -void CpuMatrix::contextProjectionForward(MatrixPtr input, MatrixPtr weight, +void CpuMatrix::contextProjectionForward(MatrixPtr input, + MatrixPtr weight, const IVector& sequence, - int contextLength, int contextStart, - size_t beginPad, bool isPadding) { + int contextLength, + int contextStart, + size_t beginPad, + bool isPadding) { CHECK(dynamic_cast(input.get())); CHECK(dynamic_cast(&sequence)); if (weight) CHECK(dynamic_cast(weight.get())); @@ -1839,8 +2428,10 @@ void CpuMatrix::contextProjectionForward(MatrixPtr input, MatrixPtr weight, void CpuMatrix::contextProjectionBackward(MatrixPtr inputGrad, MatrixPtr weightGrad, const IVector& sequence, - int contextLength, int contextStart, - size_t beginPad, bool isPadding) { + int contextLength, + int contextStart, + size_t beginPad, + bool isPadding) { if (inputGrad) CHECK(dynamic_cast(inputGrad.get())); if (weightGrad) CHECK(dynamic_cast(weightGrad.get())); CHECK(dynamic_cast(&sequence)); @@ -1906,15 +2497,15 @@ inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { } } -inline void colVecAddTo(real* a, const real* b, size_t len, size_t aWidth, - size_t bWidth) { +inline void colVecAddTo( + real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) { for (unsigned int i = 0; i < len; ++i) { a[i * aWidth] += b[i * bWidth]; } } -inline void colVecAddTo(real* a, real* 
b, real c, size_t len, size_t aWidth, - size_t bWidth) { +inline void colVecAddTo( + real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { for (unsigned int i = 0; i < len; ++i) { a[i * aWidth] += b[i * bWidth] * c; } @@ -1947,12 +2538,30 @@ void CpuMatrix::addBias(Matrix& b, real scale) { } } +void CpuMatrix::addSharedBias(Matrix& b, real scale) { + CHECK_EQ(b.getHeight(), (size_t)1); + real* aData = getData(); + real* bData = b.getData(); + size_t numSamples = getHeight(); + size_t channel = b.getWidth(); + CHECK_EQ(getWidth() % channel, 0UL); + size_t dim = getWidth() / channel; + + for (size_t i = 0; i < numSamples; i++) { + for (size_t c = 0; c < channel; c++) { + for (size_t j = 0; j < dim; j++) { + aData[i * getStride() + c * dim + j] += scale * bData[c]; + } + } + } +} + void CpuMatrix::collectBias(Matrix& a, real scale) { CHECK_EQ(getHeight(), (size_t)1); CHECK_EQ(width_, a.getWidth()); CpuSparseMatrix* aptr = dynamic_cast(&a); if (!aptr) { - sumCols(a, scale); + sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); } else { size_t nnz = aptr->getElementCnt(); int* cols = aptr->getCols(); @@ -1964,6 +2573,23 @@ void CpuMatrix::collectBias(Matrix& a, real scale) { } } +void CpuMatrix::collectSharedBias(Matrix& a, real scale) { + CHECK_EQ(getHeight(), (size_t)1); + real* B = getData(); + real* A = a.getData(); + size_t numSamples = a.getHeight(); + size_t channel = getWidth(); + CHECK_EQ(a.getWidth() % channel, 0UL); + size_t dim = a.getWidth() / channel; + for (size_t i = 0; i < numSamples; i++) { + for (size_t c = 0; c < channel; c++) { + for (size_t j = 0; j < dim; j++) { + B[c] += scale * A[i * channel * dim + c * dim + j]; + } + } + } +} + void CpuMatrix::sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode) { @@ -1974,7 +2600,7 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, real* dst = getData(); real* src = a.getData(); const int* starts = startsPos.getData(); - MatrixPtr outMtx = Matrix::create(1, 1, false, false); + MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false); MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false); for (size_t i = 0; i < height; i++) { int sequenceLength = starts[i + 1] - starts[i]; @@ -1986,13 +2612,17 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, dataMtx->setData(src + starts[i] * width, sequenceLength, width); if (mode == 0) { // plain average - outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength); + outMtx->sumCols(*dataMtx, + (real)1 / (real)sequenceLength, + /* scaleDest= */ 1); } else if (mode == 1) { // sum instead of average - outMtx->sumCols(*dataMtx, (real)1); + outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1); } else if (mode == 2) { // divide by square root of sequenceLength - outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength)); + outMtx->sumCols(*dataMtx, + (real)1 / std::sqrt(sequenceLength), + /* scaleDest= */ 1); } else { LOG(FATAL) << "should not reach here"; } @@ -2000,27 +2630,37 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, } /* this = scaleAB*(a*b) + scaleT*this*/ -void CpuMatrix::mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, +void CpuMatrix::mul(const MatrixPtr a, + const MatrixPtr b, + real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; if (dynamic_cast(a.get()) && dynamic_cast(b.get())) { - mul(dynamic_cast(a.get()), dynamic_cast(b.get()), - scaleAB, scaleT); + mul(dynamic_cast(a.get()), + dynamic_cast(b.get()), + scaleAB, + scaleT); } else if (dynamic_cast(a.get()) && dynamic_cast(b.get())) { 
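// sequenceAvgForward above collapses each variable-length run of rows into a
// single row, with mode 0 = mean, mode 1 = plain sum, and mode 2 = sum scaled
// by 1 / sqrt(length). A compact sketch of that per-sequence reduction,
// assuming nonempty sequences (sequenceReduce is illustrative; the committed
// code reuses sumCols() on a row view instead of explicit loops):
#include <cmath>
#include <cstddef>
void sequenceReduce(const float* src, const int* starts, size_t numSeqs,
                    size_t width, int mode, float* dst) {
  for (size_t s = 0; s < numSeqs; ++s) {
    int len = starts[s + 1] - starts[s];  // rows in this sequence
    float scale = (mode == 0) ? 1.0f / len
                  : (mode == 1) ? 1.0f
                                : 1.0f / std::sqrt(static_cast<float>(len));
    for (size_t w = 0; w < width; ++w) {
      float sum = 0.0f;
      for (int r = starts[s]; r < starts[s + 1]; ++r) {
        sum += src[r * width + w];
      }
      dst[s * width + w] = scale * sum;
    }
  }
}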
mul(dynamic_cast(a.get()), - dynamic_cast(b.get()), scaleAB, scaleT); + dynamic_cast(b.get()), + scaleAB, + scaleT); } else if (dynamic_cast(a.get()) && dynamic_cast(b.get())) { mul(dynamic_cast(a.get()), - dynamic_cast(b.get()), scaleAB, scaleT); + dynamic_cast(b.get()), + scaleAB, + scaleT); } else { LOG(FATAL) << "Not supported"; } } -void CpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, +void CpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, real scaleT) { if (dynamic_cast(b)) { return mul(a, dynamic_cast(b), this, scaleAB, scaleT); @@ -2070,11 +2710,35 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int ldb = b->getStride(); int ldc = getStride(); #ifndef PADDLE_TYPE_DOUBLE - cblas_sgemm(CblasRowMajor, a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, - scaleT, C, ldc); + cblas_sgemm(CblasRowMajor, + a_trans, + b_trans, + M, + N, + K, + scaleAB, + A, + lda, + B, + ldb, + scaleT, + C, + ldc); #else - cblas_dgemm(CblasRowMajor, a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, - scaleT, C, ldc); + cblas_dgemm(CblasRowMajor, + a_trans, + b_trans, + M, + N, + K, + scaleAB, + A, + lda, + B, + ldb, + scaleT, + C, + ldc); // TODO(yuyang18): Is gemm defined other place? #endif @@ -2082,8 +2746,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { << " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1]; } -void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, - real scaleAB, real scaleT) { +void CpuMatrix::mul( + CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) { CHECK(!c->isTransposed()) << "Not supported"; CHECK_EQ(c->getValueType(), FLOAT_VALUE); @@ -2190,7 +2854,9 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, } } -void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, +void CpuMatrix::mul(CpuMatrix* a, + CpuSparseMatrix* b, + real scaleAB, real scaleT) { CHECK(!trans_) << "Not supported"; CHECK(!a->isTransposed()) << "Not supported"; @@ -2228,8 +2894,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, int start = b->getColStartIdx(j); int end = b->getColStartIdx(j + 1); for (int i = start; i < end; ++i) { - colVecAddTo(C + j, A + rows[i], B[i], height_, width_, - a->getWidth()); + colVecAddTo( + C + j, A + rows[i], B[i], height_, width_, a->getWidth()); } } } @@ -2251,8 +2917,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, int start = b->getColStartIdx(i); int end = b->getColStartIdx(i + 1); for (int j = start; j < end; ++j) { - colVecAddTo(C + rows[j], A + i, B[j], height_, width_, - a->getWidth()); + colVecAddTo( + C + rows[j], A + i, B[j], height_, width_, a->getWidth()); } } } @@ -2277,8 +2943,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, int start = b->getRowStartIdx(j); int end = b->getRowStartIdx(j + 1); for (int i = start; i < end; ++i) { - colVecAddTo(C + cols[i], A + j, B[i], height_, width_, - a->getWidth()); + colVecAddTo( + C + cols[i], A + j, B[i], height_, width_, a->getWidth()); } } } @@ -2300,8 +2966,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, int start = b->getRowStartIdx(i); int end = b->getRowStartIdx(i + 1); for (int j = start; j < end; ++j) { - colVecAddTo(C + i, A + cols[j], B[j], height_, width_, - a->getWidth()); + colVecAddTo( + C + i, A + cols[j], B[j], height_, width_, a->getWidth()); } } } @@ -2400,8 +3066,8 @@ void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) { static 
ThreadLocal> threadLocalColArray; template -void CpuMatrix::mul(CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, - real scaleT) { +void CpuMatrix::mul( + CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) { CHECK(!c->isTransposed()) << "Not supported"; CHECK(!b->isTransposed()) << "Not supported"; // TODO(yuyang18): Maybe bug implementation here. @@ -2504,18 +3170,26 @@ void CpuMatrix::mul(CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, // instantiation mul() called in SparseRowMatrix.cpp template void CpuMatrix::mul( - CpuSparseMatrix* a, CpuMatrix* b, SparseRowCpuMatrix* c, real scaleAB, + CpuSparseMatrix* a, + CpuMatrix* b, + SparseRowCpuMatrix* c, + real scaleAB, real scaleT); template void CpuMatrix::mul( - CpuSparseMatrix* a, CpuMatrix* b, SparseAutoGrowRowCpuMatrix* c, - real scaleAB, real scaleT); + CpuSparseMatrix* a, + CpuMatrix* b, + SparseAutoGrowRowCpuMatrix* c, + real scaleAB, + real scaleT); template void CpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, CacheRowCpuMatrix* c, real scaleAB, real scaleT); -void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, +void SharedCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; CHECK(!b->isTransposed()) << "Not supported"; @@ -2555,8 +3229,8 @@ void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, for (int k = 0; k < blockNum_; ++k) { blockSeq.push_back(k); } - std::shuffle(blockSeq.begin(), blockSeq.end(), - ThreadLocalRandomEngine::get()); + std::shuffle( + blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get()); } std::vector& localBufRows = *localBufRows_; int* cols = a->getCols(); @@ -2594,8 +3268,8 @@ void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, localBufRows.push_back(i); size_t bufPos = localBufRows.size() - 1; for (int j = start; j < end; ++j) { - vecAddTo(localC + bufPos * width, B + cols[j] * width, value[j], - width); + vecAddTo( + localC + bufPos * width, B + cols[j] * width, value[j], width); } } } @@ -2679,7 +3353,7 @@ void CpuMatrix::rowSum(Matrix& sum) { CHECK_EQ(sum.getHeight(), getHeight()); CHECK_EQ(sum.getWidth(), (size_t)1); - sum.sumRows(*this); + sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); } void CpuMatrix::rowMaxId(IVector& maxIds) { @@ -2731,7 +3405,9 @@ void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { } std::partial_sort( - vec.begin(), vec.begin() + beam, vec.end(), + vec.begin(), + vec.begin() + beam, + vec.end(), [](const std::pair& l, const std::pair& r) { return l.first > r.first; }); @@ -2748,6 +3424,101 @@ void CpuMatrix::colMax(Matrix& max) { max.maxCols(*this); } +void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + CHECK(isContiguous()); + CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; + size_t numSamples = getWidth(); + size_t beam = maxVal.getHeight(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getWidth(), numSamples); + + real* a = getData(); + int* s = maxIds.getData(); + real* t = maxVal.getData(); + size_t dim = getHeight(); + for (size_t i = 0; i < numSamples; i++) { + std::vector> vec; + for (size_t j = 0; j < dim; j++) { + vec.push_back(std::pair(a[i + j * numSamples], j)); + } + + std::partial_sort( + vec.begin(), + vec.begin() + beam, + vec.end(), + [](const std::pair& l, const std::pair& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < beam; j++) { + t[i + j * numSamples] = vec[j].first; + s[i + 
j * numSamples] = vec[j].second; + } + } +} + +void CpuMatrix::maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + const real* input = a.getData(); + int* idForCpu = id.getData(); + + MatrixPtr maxInMat, maxOutMat; + Matrix::resizeOrCreate(maxInMat, groups, size, false, false); + Matrix::resizeOrCreate(maxOutMat, 1, size, false, false); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size; + IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false); + + for (size_t i = 0; i < channels; ++i) { + size_t newFeatLen = i * featLen; + for (size_t j = 0; j < groups; ++j) { + maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen) + ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen, + featLen); + } + } + maxInMat->colMax(*tmpId, *maxOutMat); + this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat); + } +} + +void CpuMatrix::maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + size_t newFeatLen = groups * featLen; + real* inputG = getData(); + const real* outG = a.getData(); + int* idForCpu = id.getData(); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size; + int* idData = idForCpu + newIndex; + + for (size_t i = 0; i < size; ++i) { + int gradIdx = + idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen; + (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i]; + } + } +} + void CpuMatrix::rowNormalizeL1(Matrix& out) { CHECK(!out.useGpu()); @@ -2844,7 +3615,8 @@ void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) { but we define the scalar function here for sanity check deletion of the function does not affect anything neverthelss */ -void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, +void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, real alpha) { CHECK(dynamic_cast(&output)); CHECK(dynamic_cast(&label)); @@ -2875,7 +3647,8 @@ void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, but we define the scalar function here for sanity check deletion of the function does not affect anything neverthelss */ -void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output, IVector& label, +void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output, + IVector& label, real alpha) { CHECK(dynamic_cast(&output)); CHECK(dynamic_cast(&label)); @@ -2956,10 +3729,16 @@ void CpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { CHECK_EQ(output.getWidth(), 1UL); CHECK(isContiguous()); - MatrixPtr inTmp = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, false); - MatrixPtr outTmp = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, false); + MatrixPtr inTmp = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + false); + MatrixPtr outTmp = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + false); size_t numSequences = index.getSize() - 1; auto starts = index.getData(); for (size_t i = 0; i < numSequences; ++i) { @@ -3015,9 
+3794,12 @@ void CpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) { } } -void CpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, - Matrix& prevGrad2, real scale) { +void CpuMatrix::cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale) { CHECK(output.useGpu_ == false) << "Matrix type are not equal"; CHECK_EQ(getWidth(), 1UL); @@ -3047,8 +3829,11 @@ void CpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, CHECK_EQ(prevOut2.getHeight(), numSamples); CHECK_EQ(prevGrad2.getHeight(), numSamples); } - for (size_t i = 0; i < numSamples; ++i, prevOutX += dim, prevOutY += yInc, - prevGradX += dim, prevGradY += yInc) { + for (size_t i = 0; i < numSamples; ++i, + prevOutX += dim, + prevOutY += yInc, + prevGradX += dim, + prevGradY += yInc) { real squareSumX = 0; real squareSumY = 0; real xy = 0; @@ -3105,7 +3890,8 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { int* cols = labelptr->getCols(); for (size_t i = 0; i < numSamples; ++i) { for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); ++j) { + j < labelptr->getRowStartIdx(i + 1); + ++j) { cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]]; /* * explanation of above line: original codes are follows: @@ -3121,7 +3907,8 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { real sum1 = 0; real sum2 = 0; for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); ++j) { + j < labelptr->getRowStartIdx(i + 1); + ++j) { sum1 += values[j] * values[j]; sum2 += values[j] * out[i * dim + cols[j]]; /* @@ -3143,7 +3930,10 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { } } - BaseMatrix::sumOfSquares(output, label); + BaseMatrix::sumOfSquaredDiffs(output, + label, + /* scaleSum= */ 1, + /* scaleDest= */ 1); } /* calculate the error of outputV according to label */ @@ -3173,7 +3963,8 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { int* cols = labelptr->getCols(); for (size_t i = 0; i < numSamples; ++i) { for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); ++j) { + j < labelptr->getRowStartIdx(i + 1); + ++j) { grad[i * dim + cols[j]] -= 2.0; /* * explanation of above line: original codes are follows: @@ -3188,7 +3979,8 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { real* values = labelptr->getValue(); for (size_t i = 0; i < numSamples; ++i) { for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); ++j) { + j < labelptr->getRowStartIdx(i + 1); + ++j) { grad[i * dim + cols[j]] -= 2.0 * values[j]; /* * explanation of above line: original codes are follows: @@ -3229,9 +4021,7 @@ void CpuMatrix::tanh(Matrix& output) { size_t dim = getWidth(); CHECK_EQ(output.getHeight(), numSamples); CHECK_EQ(output.getWidth(), dim); - errno = 0; vTanh(numSamples * dim, getData(), output.getData()); - CHECK_EQ(errno, 0) << "vTanh error"; } void CpuMatrix::tanhDerivative(Matrix& output) { @@ -3253,10 +4043,8 @@ void CpuMatrix::softrelu(Matrix& output) { out[j] = x; } } - errno = 0; vExp(numSamples * dim, output.getData(), output.getData()); vLog1p(numSamples * dim, output.getData(), output.getData()); - CHECK_EQ(errno, 0) << "vExp+vLog1p error"; } void CpuMatrix::softreluDerivative(Matrix& output) { @@ -3271,9 +4059,7 @@ void CpuMatrix::softreluDerivative(Matrix& output) { MatrixPtr tmpMat = Matrix::create(numSamples, dim); real* tmp = 
tmpMat->getData(); - errno = 0; vExp(size, output.getData(), tmpMat->getData()); - CHECK_EQ(errno, 0) << "vExp error"; for (size_t i = 0; i < size; ++i) { grad[i] *= (1.0 - 1.0 / tmp[i]); @@ -3296,10 +4082,7 @@ void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) { out[i] = p2 * in[i]; } - // out = tanh(out) - errno = 0; vTanh(numSamples * dim, out, out); - CHECK_EQ(errno, 0) << "vTanh error"; // out = p1 * out for (size_t i = 0; i < numSamples * dim; ++i) { @@ -3472,8 +4255,8 @@ void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) { } } -void CpuMatrix::circularConvDerivative(Matrix& outG, Matrix& in0, Matrix& in1, - Matrix& inG0, Matrix& inG1) { +void CpuMatrix::circularConvDerivative( + Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) { size_t height = in0.getHeight(); size_t width0 = in0.getWidth(); size_t width1 = in1.getWidth(); @@ -3493,8 +4276,12 @@ void CpuMatrix::circularConvDerivative(Matrix& outG, Matrix& in0, Matrix& in1, real* inGV1 = inG1.getData(); int leftCtxLen = (width1 - 1) / 2; - for (size_t x = 0; x < height; ++x, outGV += width0, inV0 += width0, - inV1 += width1, inGV0 += width0, inGV1 += width1) { + for (size_t x = 0; x < height; ++x, + outGV += width0, + inV0 += width0, + inV1 += width1, + inGV0 += width0, + inGV1 += width1) { for (size_t j = 0; j < width1; ++j) { // iterate over width1 for (size_t i = 0; i < width0; ++i) { // such over all dimensions of outG @@ -3563,7 +4350,8 @@ void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { } /* calculate the classification error for multi binary label */ -void CpuMatrix::classificationErrorMulti(Matrix& output, Matrix& label, +void CpuMatrix::classificationErrorMulti(Matrix& output, + Matrix& label, real threshold) { CHECK(dynamic_cast(&output)); auto labelPtr = dynamic_cast(&label); @@ -3598,6 +4386,112 @@ void CpuMatrix::classificationErrorMulti(Matrix& output, Matrix& label, } } +void CpuMatrix::bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&in)); + + size_t outputW = getWidth(); + size_t batchSize = getHeight(); + size_t inputW = in.getWidth(); + size_t inputH = in.getHeight(); + size_t inPosOffset = inImgH * inImgW; + size_t outPosOffset = outImgH * outImgW; + (void)(inputH); + + real* outData = getData(); + const real* inData = in.getData(); + + if (inImgH == outImgH && inImgW == outImgW) { + this->copyFrom(in); + } else { + for (size_t k = 0; k < batchSize; ++k) { // loop for batches + for (size_t i = 0; i < outImgH; ++i) { // loop for images + size_t h = ratioH * i; + size_t hid = (h < inImgH - 1) ? 1 : 0; + real h1lambda = ratioH * i - h; + real h2lambda = 1 - h1lambda; + + for (size_t j = 0; j < outImgW; ++j) { + size_t w = ratioW * j; + size_t wid = (w < inImgW - 1) ? 
1 : 0; + real w1lambda = ratioW * j - w; + real w2lambda = 1 - w1lambda; + // calculate four position for bilinear interpolation + const real* inPos = &inData[k * inputW + h * inImgW + w]; + real* outPos = &outData[k * outputW + i * outImgW + j]; + for (size_t c = 0; c < numChannels; ++c) { // loop for channels + // bilinear interpolation + outPos[0] = + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) + + h1lambda * (w2lambda * inPos[hid * inImgW] + + w1lambda * inPos[hid * inImgW + wid]); + inPos += inPosOffset; + outPos += outPosOffset; + } + } + } + } + } +} + +void CpuMatrix::bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + CHECK(dynamic_cast(&out)); + + size_t inputW = getWidth(); + size_t inputH = getHeight(); + size_t outputW = out.getWidth(); + size_t batchSize = out.getHeight(); + size_t inPosOffset = inImgH * inImgW; + size_t outPosOffset = outImgH * outImgW; + (void)(inputH); + + real* inGrad = getData(); + const real* outGrad = out.getData(); + + if (inImgH == outImgH && inImgW == outImgW) { + this->add(const_cast(out)); + } else { + for (size_t k = 0; k < batchSize; ++k) { // loop for batches + for (size_t i = 0; i < outImgH; ++i) { // loop for images + size_t h = ratioH * i; + size_t hid = (h < inImgH - 1) ? 1 : 0; + real h1lambda = ratioH * i - h; + real h2lambda = 1 - h1lambda; + for (size_t j = 0; j < outImgW; ++j) { + size_t w = ratioW * j; + size_t wid = (w < inImgW - 1) ? 1 : 0; + real w1lambda = ratioW * j - w; + real w2lambda = 1 - w1lambda; + + real* inPos = &inGrad[k * inputW + h * inImgW + w]; + const real* outPos = &outGrad[k * outputW + i * outImgW + j]; + for (size_t c = 0; c < numChannels; ++c) { // loop for channels + inPos[0] += h2lambda * w2lambda * outPos[0]; + inPos[wid] += h2lambda * w1lambda * outPos[0]; + inPos[hid * inImgW] += h1lambda * w2lambda * outPos[0]; + inPos[hid * inImgW + wid] += h1lambda * w1lambda * outPos[0]; + inPos += inPosOffset; + outPos += outPosOffset; + } + } + } + } + } +} + //////////////////////////////////////////////////////////////// // functions executed via cpu // //////////////////////////////////////////////////////////////// diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 25104fe1c6d70afbf39ab47a17ce0bf21a121427..075dc845768d7dfa156d33d057a30b28628c099c 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
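
[review note] The bilinearForward/bilinearBackward hunks above blend four input pixels per output pixel, with ratioH/ratioW mapping an output coordinate back into the input image and hid/wid clamping at the bottom/right border. A minimal single-channel sketch of the weight computation, mirroring the conventions in the patch (the helper name is illustrative):

    #include <cstddef>

    float bilinearSample(const float* img, size_t inH, size_t inW,
                         float ratioH, float ratioW, size_t i, size_t j) {
      size_t h = static_cast<size_t>(ratioH * i);
      size_t w = static_cast<size_t>(ratioW * j);
      size_t hid = (h < inH - 1) ? 1 : 0;  // 0 reuses the edge row/column
      size_t wid = (w < inW - 1) ? 1 : 0;
      float h1lambda = ratioH * i - h, h2lambda = 1 - h1lambda;
      float w1lambda = ratioW * j - w, w2lambda = 1 - w1lambda;
      const float* p = img + h * inW + w;
      return h2lambda * (w2lambda * p[0] + w1lambda * p[wid]) +
             h1lambda * (w2lambda * p[hid * inW] + w1lambda * p[hid * inW + wid]);
    }

bilinearBackward scatters the output gradient back through the same four weights, accumulating with += instead of reading.
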
*/ - #pragma once #include @@ -77,12 +76,19 @@ typedef std::shared_ptr CpuSparseMatrixPtr; */ class Matrix : public BaseMatrix { protected: - Matrix(MemoryHandlePtr memHandle, size_t height, size_t width, bool trans, + Matrix(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans, bool use_gpu); Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu); - Matrix(real* data, size_t height, size_t width, size_t stride, bool trans, + Matrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, bool use_gpu); static ThreadLocal tmpMat_; @@ -94,38 +100,66 @@ public: public: virtual ~Matrix() {} - static MatrixPtr create(MemoryHandlePtr memHandle, size_t height, - size_t width, bool trans = false); - static MatrixPtr create(size_t height, size_t width, bool trans = false, + static MatrixPtr create(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans = false); + static MatrixPtr create(size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + static MatrixPtr create(real* data, + size_t height, + size_t width, + bool trans = false, bool useGpu = false); - static MatrixPtr create(real* data, size_t height, size_t width, - bool trans = false, bool useGpu = false); - static MatrixPtr create(real* data, size_t height, size_t width, - size_t stride, bool trans = false, + static MatrixPtr create(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false, bool useGpu = false); - static MatrixPtr createSparseMatrix(size_t height, size_t width, size_t nnz, + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, SparseValueType valueType = FLOAT_VALUE, - bool trans = false, bool useGpu = false); - static MatrixPtr createSparseMatrix(size_t height, size_t width, size_t nnz, + bool trans = false, + bool useGpu = false); + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, SparseValueType valueType = FLOAT_VALUE, SparseFormat foramt = SPARSE_CSR, - bool trans = false, bool useGpu = false); - - static MatrixPtr createSparseMatrix(real* data, int* row, int* col, - size_t height, size_t width, + bool trans = false, + bool useGpu = false); + + static MatrixPtr createSparseMatrix(real* data, + int* row, + int* col, + size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType, /*value type*/ - SparseFormat format, bool trans, + SparseFormat format, + bool trans, bool useGpu); static void resizeOrCreateSparseMatrix( - MatrixPtr& matrix, size_t height, size_t width, size_t nnz, - SparseValueType valueType = FLOAT_VALUE, SparseFormat foramt = SPARSE_CSR, - bool trans = false, bool useGpu = false); - - static void resizeOrCreate(MatrixPtr& a, size_t height, size_t width, - bool trans = false, bool useGpu = false); + MatrixPtr& matrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + SparseFormat foramt = SPARSE_CSR, + bool trans = false, + bool useGpu = false); + + static void resizeOrCreate(MatrixPtr& a, + size_t height, + size_t width, + bool trans = false, + bool useGpu = false); /** * @brief set the data buffer used to hold the matrix data. @@ -163,12 +197,12 @@ public: // if refactor sparse matrix virtual int* getRows() const { LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. + return nullptr; //! suppress warning for no return value. 
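
[review note] With the reflowed factory overloads above, the external-buffer variants are much easier to scan. For reference, wrapping caller-owned memory with an explicit row stride looks like the snippet below (hypothetical usage: the view does not take ownership of `buf`, so keeping it alive is the caller's contract):

    std::vector<real> buf(100 * 64);  // 100 rows with a row pitch of 64
    MatrixPtr m = Matrix::create(buf.data(),
                                 /* height= */ 100,
                                 /* width= */ 32,  // view only 32 of 64 columns
                                 /* stride= */ 64,
                                 /* trans= */ false,
                                 /* useGpu= */ false);
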
} virtual int* getCols() const { LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. + return nullptr; //! suppress warning for no return value. } virtual SparseFormat getFormat() const { @@ -178,7 +212,7 @@ public: virtual SparseValueType getValueType() const { LOG(FATAL) << "Not implemented"; - return NO_VALUE; //! suppress warning for no return value. + return NO_VALUE; //! suppress warning for no return value. } /** @@ -195,6 +229,8 @@ public: virtual void resetOne() { LOG(FATAL) << "Not implemented"; } + void setDiag(real value); + virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; } virtual void trimFrom(const CpuSparseMatrix& src) { @@ -206,7 +242,9 @@ public: LOG(FATAL) << "Not implemented"; } - MatrixPtr subMatrix(size_t startRow, size_t endRow, size_t startCol, + MatrixPtr subMatrix(size_t startRow, + size_t endRow, + size_t startCol, size_t endCol); MatrixPtr subRowMatrix(size_t startRow, size_t endRow) { @@ -219,8 +257,11 @@ public: virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) { CHECK_LE(startRow + numRows, getHeight()); - return Matrix::create(getData() + startRow * getWidth(), numRows, - getWidth(), trans_, useGpu_); + return Matrix::create(getData() + startRow * getWidth(), + numRows, + getWidth(), + trans_, + useGpu_); } virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) { CHECK_LE(startRow + numRows, getHeight()); @@ -265,7 +306,8 @@ public: * as this, otherwise the new matrix will have the specified size. * */ - virtual MatrixPtr clone(size_t height = 0, size_t width = 0, + virtual MatrixPtr clone(size_t height = 0, + size_t width = 0, bool useGpu = false) { LOG(FATAL) << "Not implemented"; return nullptr; @@ -303,9 +345,11 @@ public: /** * @note This should only be used for sparse matrix. */ - virtual void resize(size_t newHeight, size_t newWidth, + virtual void resize(size_t newHeight, + size_t newWidth, size_t newNnz, /* total item used to allocate space */ - SparseValueType valueType, SparseFormat format) = 0; + SparseValueType valueType, + SparseFormat format) = 0; /** * @brief This should only be used for sparse matrix. @@ -313,7 +357,9 @@ public: * Currently must be called for each row in order. * The matrix is not valid until setRow is called for the last row. */ - virtual void setRow(size_t row, size_t colNum, const unsigned int* cols, + virtual void setRow(size_t row, + size_t colNum, + const unsigned int* cols, const real* values) = 0; virtual MatrixPtr getTranspose() = 0; @@ -328,6 +374,21 @@ public: LOG(FATAL) << "Not implemented"; } + virtual MatrixPtr getInverse() { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + /** + * @brief inverse. + * + * if allocate matInv's memory outside, then set memAlloc as false; + * else set as true. + */ + virtual void inverse(MatrixPtr matInv, bool memAlloc) { + LOG(FATAL) << "Not implemented"; + } + public: /// Only set all variables to 0 or NULL but not free them. virtual void clear() { @@ -343,13 +404,38 @@ public: LOG(FATAL) << "Not implemented"; } + virtual void addSharedBias(Matrix& b, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void addBias(Matrix& b, real scale, bool sharedBias) { + if (!sharedBias) { + addBias(b, scale); + } else { + addSharedBias(b, scale); + } + } + /// add each sample from a to this. 
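
[review note] The memAlloc comment on the new inverse() hook above is terse; my reading of the contract is: pass memAlloc = false when matInv's storage was allocated by the caller, and true when inverse() should allocate it itself. Hypothetical caller-allocated usage (`n` and `mat` assumed to exist):

    MatrixPtr inv = Matrix::create(n, n, /* trans= */ false, /* useGpu= */ false);
    mat->inverse(inv, /* memAlloc= */ false);  // reuse caller-owned storage
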
virtual void collectBias(Matrix& a, real scale) { LOG(FATAL) << "Not implemented"; } - virtual void sequenceAvgForward(Matrix& a, const IVector& startsPos, - int mode) { + virtual void collectSharedBias(Matrix& a, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void collectBias(Matrix& a, real scale, bool sharedBias) { + if (!sharedBias) { + collectBias(a, scale); + } else { + collectSharedBias(a, scale); + } + } + + virtual void sequenceAvgForward(Matrix& a, + const IVector& startsPos, + int mode) { LOG(FATAL) << "Not implemented"; } @@ -358,7 +444,9 @@ public: * this = scaleAB*(a*b) + scaleT*this * @endcode */ - virtual void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, + virtual void mul(const MatrixPtr a, + const MatrixPtr b, + real scaleAB, real scaleT) { LOG(FATAL) << "Not implemented"; } @@ -375,7 +463,8 @@ public: * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 * @endcode */ - virtual void addByBitCode(size_t numClasses, const IVector& codes, + virtual void addByBitCode(size_t numClasses, + const IVector& codes, const Matrix& vec) { (void)numClasses; (void)codes; @@ -390,7 +479,8 @@ public: * where index is same as the index for addByBitCode * @endcode */ - virtual void addByBitCodeBackward(size_t numClasses, const IVector& codes, + virtual void addByBitCodeBackward(size_t numClasses, + const IVector& codes, Matrix& vec) { (void)numClasses; (void)codes; @@ -405,8 +495,10 @@ public: * where index is same as the index for addByBitCode * @endcode */ - virtual void mulByBitCode(size_t numClasses, const IVector& codes, - const Matrix& mat, const Matrix& input) { + virtual void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, + const Matrix& input) { (void)numClasses; (void)codes; (void)mat; @@ -422,7 +514,8 @@ public: * @endcode */ virtual void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, Matrix& mat, + const IVector& codes, + Matrix& mat, const Matrix& input) { (void)numClasses; (void)codes; @@ -440,7 +533,8 @@ public: */ virtual void mulByBitCodeBackwardError(size_t numClasses, const IVector& codes, - const Matrix& mat, Matrix& input) { + const Matrix& mat, + Matrix& input) { (void)numClasses; (void)codes; (void)mat; @@ -455,7 +549,9 @@ public: * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0 * @endcode */ - virtual void sumByBitCode(size_t numClasses, IVector& codes, Matrix& sum, + virtual void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, real scaleSum) { (void)numClasses; (void)codes; @@ -493,16 +589,44 @@ public: LOG(FATAL) << "Not implemeted"; } + /** + * set the max of each column of this to mat + */ virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } + /** + * @brief Get the top k elements of each column of this matrix. + * + * The row ids and values of these elements are stored in + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. + */ + virtual void colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } /** * @brief Get the top k elements of each row of this matrix. 
* * The column ids and values of these elements are stored in - * maxIds and max respectively. Note that the top k - * elements are not sorted. + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. */ virtual void rowMax(IVector& maxIds, Matrix& max) { LOG(FATAL) << "Not implemented"; @@ -569,7 +693,8 @@ public: } /// copy -log(output[label]) to this->data[i]. - virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, + virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, real alpha) { LOG(FATAL) << "Not implemented"; } @@ -595,13 +720,14 @@ public: LOG(FATAL) << "Not implemented"; } - virtual void circularConvDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, + virtual void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, Matrix& prevGrad2) { LOG(FATAL) << "Not implemented"; } - /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */ virtual void softmax(Matrix& output) { (void)output; @@ -662,9 +788,12 @@ public: LOG(FATAL) << "Not implemented"; } - virtual void cosSimDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, - Matrix& prevGrad2, real scale = 1.0f) { + virtual void cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale = 1.0f) { LOG(FATAL) << "Not implemented"; } @@ -716,10 +845,18 @@ public: * It will expand a feature matrix according to the * convolution filters */ - virtual void convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW) { + virtual void convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW) { LOG(FATAL) << "Not implemeted"; } @@ -728,11 +865,20 @@ public: * * Its function is to restore a expanded-matrix into a feature matrix */ - virtual void convShrink(Matrix& expandColMat, int thisImgHeight, - int thisImgWidth, int channels, int blockH, - int blockW, int strideH, int strideW, int paddingH, - int paddingW, int outputH, int outputW, - real alpha = 1.0f, real beta = 0.0f) { + virtual void convShrink(Matrix& expandColMat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha = 1.0f, + real beta = 0.0f) { LOG(FATAL) << "Not implemeted"; } @@ -740,54 +886,93 @@ public: * Pooling forward operation, pick out the largest element * in the sizeX of value */ - virtual void maxPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + virtual void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { LOG(FATAL) << "Not implemeted"; } /// Pooling backward operation. 
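
[review note] The new colMax(IVector&, Matrix&) overload documented above follows the same top-k recipe as the CpuMatrix implementation earlier in this patch: materialize (value, index) pairs per column and let std::partial_sort move the k largest to the front. A self-contained C++11 sketch of that idiom (the helper is illustrative and assumes k <= col.size()):

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    typedef std::pair<float, int> ValIdx;

    // returns the k largest entries of one column as (value, row index) pairs
    std::vector<ValIdx> topK(const std::vector<float>& col, size_t k) {
      std::vector<ValIdx> v;
      v.reserve(col.size());
      for (size_t j = 0; j < col.size(); ++j)
        v.push_back(ValIdx(col[j], static_cast<int>(j)));
      std::partial_sort(v.begin(), v.begin() + k, v.end(),
                        [](const ValIdx& l, const ValIdx& r) {
                          return l.first > r.first;
                        });
      v.resize(k);
      return v;
    }

std::partial_sort happens to leave the first k sorted in descending order, though the interface comment above deliberately promises no ordering.
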
- virtual void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, Matrix& outV, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { + virtual void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { LOG(FATAL) << "Not implemeted"; } /// Pooling forward operation, caculate the average of sizeX elements. - virtual void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + virtual void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { LOG(FATAL) << "Not implemeted"; } - virtual void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { + virtual void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { LOG(FATAL) << "Not implemeted"; } /// normalize-operation. - virtual void crossMapNormalFwd(Matrix& input, size_t imgSizeH, - size_t imgSizeW, Matrix& denoms, - size_t channels, size_t sizeX, float scale, + virtual void crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, float pow) { LOG(FATAL) << "Not implemeted"; } - virtual void crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, - Matrix& preOutV, Matrix& localOutV, - size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t size, float scale, + virtual void crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t size, + float scale, float pow) { LOG(FATAL) << "Not implemeted"; } @@ -800,20 +985,24 @@ public: * * output[i] is set to max_input[i]. 
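
[review note] A one-dimensional sketch of what the avgPoolBackward() declared above has to do per window, as I read the scaleTargets/scaleOutput convention: scaleTargets rescales the pre-existing input gradient, and each output-gradient element is spread uniformly over the input positions its (border-clipped) window covered, scaled by scaleOutput. Padding is omitted and the names are illustrative:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void avgPoolBackward1D(std::vector<float>& inGrad,
                           const std::vector<float>& outGrad,
                           size_t windowSize, size_t stride,
                           float scaleTargets, float scaleOutput) {
      for (float& g : inGrad) g *= scaleTargets;
      for (size_t p = 0; p < outGrad.size(); ++p) {
        size_t start = p * stride;
        size_t end = std::min(start + windowSize, inGrad.size());  // clip
        for (size_t x = start; x < end; ++x)
          inGrad[x] += scaleOutput * outGrad[p] / (end - start);
      }
    }
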
*/ - virtual void maxSequenceForward(Matrix& input, const IVector& sequence, + virtual void maxSequenceForward(Matrix& input, + const IVector& sequence, IVector& index) { LOG(FATAL) << "Not implemeted"; } - virtual void maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, + virtual void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, IVector& index) { LOG(FATAL) << "Not implemeted"; } - virtual void contextProjectionForward(MatrixPtr input, MatrixPtr weight, + virtual void contextProjectionForward(MatrixPtr input, + MatrixPtr weight, const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + int contextStart, + size_t beginPad, bool isPadding) { LOG(FATAL) << "Not implemeted"; } @@ -822,7 +1011,8 @@ public: MatrixPtr weightGrad, const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + int contextStart, + size_t beginPad, bool isPadding) { LOG(FATAL) << "Not implemeted"; } @@ -837,7 +1027,8 @@ public: virtual void contextProjectionBackwardWeight(MatrixPtr weightGrad, const IVector& sequence, int contextLength, - int contextStart, int totalPad, + int contextStart, + int totalPad, size_t beginPad) { LOG(FATAL) << "Not implemeted"; } @@ -916,7 +1107,8 @@ public: * / output->getWidth() * @endcode */ - virtual void classificationErrorMulti(Matrix& output, Matrix& label, + virtual void classificationErrorMulti(Matrix& output, + Matrix& label, real threshold) { LOG(FATAL) << "Not implemented"; } @@ -930,6 +1122,26 @@ public: virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { LOG(FATAL) << "Not implemented"; } + virtual void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + LOG(FATAL) << "Not implemented"; + } + virtual void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + LOG(FATAL) << "Not implemented"; + } }; inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { @@ -944,24 +1156,34 @@ public: GpuMatrix(size_t height, size_t width, bool trans = false); GpuMatrix(real* data, size_t height, size_t width, bool trans = false) : Matrix(data, height, width, trans, true) {} - GpuMatrix(real* data, size_t height, size_t width, size_t stride, + GpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, bool trans = false) : Matrix(data, height, width, stride, trans, true) {} - GpuMatrix(GpuMemHandlePtr dataHandle, size_t height, size_t width, + GpuMatrix(GpuMemHandlePtr dataHandle, + size_t height, + size_t width, bool trans = false) : Matrix(dataHandle, height, width, trans, true) {} ~GpuMatrix(); void zeroMem(); void resetOne(); + void setDiag(real value); void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, size_t newWidth, + void resize(size_t newHeight, + size_t newWidth, size_t newNnz, /* used to allocate space */ - SparseValueType valueType, SparseFormat format) { + SparseValueType valueType, + SparseFormat format) { LOG(FATAL) << "Only Support Sparse Matrix"; } - void setRow(size_t row, size_t colNum, const unsigned int* cols, + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, const real* values) { LOG(FATAL) << "Only Support Sparse Matrix"; } @@ -995,8 +1217,12 @@ public: MatrixPtr getTranspose(); void 
transpose(MatrixPtr matTrans, bool memAlloc); + MatrixPtr getInverse(); + void inverse(MatrixPtr matInv, bool memAlloc); + /// add b to each sample of this. void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); /** * @code @@ -1004,6 +1230,7 @@ public: * @endcode */ void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); @@ -1046,10 +1273,14 @@ public: void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); - void mul(const GpuSparseMatrix& a, const GpuMatrix& b, real scaleAB, + void mul(const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, real scaleT); - void mul(const GpuMatrix& a, const GpuSparseMatrix& b, real scaleAB, + void mul(const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, real scaleT); /** @@ -1085,12 +1316,17 @@ public: void rowMax(Matrix& max); void rowMax(IVector& maxIds, Matrix& max); void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& max); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); void oneHotCrossEntropy(Matrix& output, IVector& label); void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, IVector& label, + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, real alpha); void softmax(Matrix& output); @@ -1110,8 +1346,12 @@ public: void scaledTanh(Matrix& output, real p1, real p2); void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, Matrix& prevOut1, Matrix& prevOut2, - Matrix& prevGrad1, Matrix& prevGrad2, real scale); + void cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale); virtual void print(std::ostream& os) const; virtual void print(std::ostream& os, size_t height, size_t width) const; @@ -1125,72 +1365,159 @@ public: void classificationError(MatrixPtr output, IVectorPtr label); - void convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW); - - void convShrink(Matrix& expandColMat, int thisImgHeight, int thisImgWidth, - int channels, int blockH, int blochW, int strideH, - int strideW, int paddingH, int paddingWreal, - int outputH, int outputW, - real alpha = 1.0f, real beta = 0.0f); - - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW); - - void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, Matrix& outV, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW); - - void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW); - - void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - 
size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW); - - void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, - Matrix& denoms, size_t channels, size_t sizeX, - float scale, float pow); - - void crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, Matrix& preOutV, - Matrix& localOutV, size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, - float scale, float pow); - - void maxSequenceForward(Matrix& input, const IVector& sequence, + void convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW); + + void convShrink(Matrix& expandColMat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blochW, + int strideH, + int strideW, + int paddingH, + int paddingWreal, + int outputH, + int outputW, + real alpha = 1.0f, + real beta = 0.0f); + + void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, + float pow); + + void crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + float scale, + float pow); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, IVector& index); - void maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, + void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, IVector& index); - void contextProjectionForward(MatrixPtr input, MatrixPtr weight, - const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + void contextProjectionForward(MatrixPtr input, + MatrixPtr weight, + const IVector& sequence, + int contextLength, + int contextStart, + size_t beginPad, bool isPadding); void contextProjectionBackwardData(MatrixPtr inputGrad, const IVector& sequence, - int contextLength, int contextStart); + int contextLength, + int contextStart); void contextProjectionBackwardWeight(MatrixPtr weightGrad, const IVector& sequence, int contextLength, - int contextStart, int totalPad, + int contextStart, + int totalPad, size_t beginPad); + + void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t 
numChannels, + const real ratioH, + const real ratioW); + + void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); + + void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); }; class CpuMatrix : public Matrix { @@ -1198,11 +1525,16 @@ public: CpuMatrix(size_t height, size_t width, bool trans = false); CpuMatrix(real* data, size_t height, size_t width, bool trans = false) : Matrix(data, height, width, trans, false) {} - CpuMatrix(real* data, size_t height, size_t width, size_t stride, + CpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, bool trans = false) : Matrix(data, height, width, stride, trans, false) {} - CpuMatrix(CpuMemHandlePtr dataHandle, size_t height, size_t width, + CpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, bool trans = false) : Matrix(dataHandle, height, width, trans, false) {} @@ -1210,13 +1542,19 @@ public: void zeroMem(); void resetOne(); + void setDiag(real value); + void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, size_t newWidth, + void resize(size_t newHeight, + size_t newWidth, size_t newNnz, /* used to allocate space */ - SparseValueType valueType, SparseFormat format) { + SparseValueType valueType, + SparseFormat format) { LOG(FATAL) << "Only Support Sparse Matrix"; } - void setRow(size_t row, size_t colNum, const unsigned int* cols, + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, const real* values) { LOG(FATAL) << "Only Support Sparse Matrix"; } @@ -1229,6 +1567,9 @@ public: MatrixPtr getTranspose(); void transpose(MatrixPtr matTrans, bool memAlloc); + MatrixPtr getInverse(); + void inverse(MatrixPtr matInv, bool memAlloc); + void copyFrom(const Matrix& src); void copyFrom(const Matrix& src, hl_stream_t stream); @@ -1245,67 +1586,132 @@ public: MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - void convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blcokH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW); - - void convShrink(Matrix& expandFeat, int thisImgHeight, int thisImgWidth, - int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW, - real alpha = 1.0f, real beta = 0.0f); - - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW); - - void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, Matrix& outV, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW); - - void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW); - - void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW); - - void 
crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, - Matrix& denoms, size_t channels, size_t sizeX, - float scale, float pow); - - void crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, Matrix& preOutV, - Matrix& localOutV, size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, - float scale, float pow); - - void maxSequenceForward(Matrix& input, const IVector& sequence, + void convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blcokH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW); + + void convShrink(Matrix& expandFeat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha = 1.0f, + real beta = 0.0f); + + void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, + float pow); + + void crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + float scale, + float pow); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, IVector& index); - void maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, + void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, IVector& index); - void contextProjectionForward(MatrixPtr input, MatrixPtr weight, - const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + void contextProjectionForward(MatrixPtr input, + MatrixPtr weight, + const IVector& sequence, + int contextLength, + int contextStart, + size_t beginPad, bool isPadding); - void contextProjectionBackward(MatrixPtr inputGrad, MatrixPtr weightGrad, - const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + void contextProjectionBackward(MatrixPtr inputGrad, + MatrixPtr weightGrad, + const IVector& sequence, + int contextLength, + int contextStart, + size_t beginPad, bool isPadding); real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } @@ -1314,13 +1720,14 @@ public: public: /// add b to each sample of this. void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); /// add each sample of a to this. 
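
[review note] For readers new to this API: contextProjectionForward(), redeclared above, concatenates a sliding window of rows per sequence, so the output width is contextLength * input->getWidth(). With contextStart = -1 and contextLength = 3, a three-row sequence maps as out(r0) = [pad | in(r0) | in(r1)], out(r1) = [in(r0) | in(r1) | in(r2)], out(r2) = [in(r1) | in(r2) | pad], where the pad rows come from `weight` when isPadding is set. A shape-level sketch for one sequence, ignoring padding (everything here is illustrative, not the patch's code):

    #include <algorithm>
    #include <cstddef>

    // in: rows x width, out: rows x (contextLength * width)
    void contextProjectSketch(const float* in, float* out, int begin, int end,
                              int width, int contextLength, int contextStart) {
      for (int t = begin; t < end; ++t)
        for (int c = 0; c < contextLength; ++c) {
          int src = t + contextStart + c;
          float* dst = out + (size_t)t * contextLength * width + (size_t)c * width;
          if (src >= begin && src < end)  // inside the sequence
            std::copy(in + (size_t)src * width, in + (size_t)(src + 1) * width, dst);
          // out-of-range offsets would read the padding rows instead
        }
    }
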
void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - /** * @code * this.row[i] += table.row[ids[i]] @@ -1367,7 +1774,10 @@ public: void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); - static void mul(CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, + static void mul(CpuMatrix* a, + CpuMatrix* b, + CpuSparseMatrix* c, + real scaleAB, real scaleT); /** @@ -1377,8 +1787,8 @@ public: * Define B,C as template instead of virtual class for performance sake. */ template - static void mul(CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, - real scaleT); + static void mul( + CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT); virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); @@ -1395,18 +1805,25 @@ public: void rowMax(Matrix& max); void rowMax(IVector& maxIds, Matrix& maxVal); void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& maxVal); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); void rowNormalizeL1(Matrix& out); void oneHotCrossEntropy(Matrix& output, IVector& label); void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, IVector& label, + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, real alpha); void circularConv(Matrix& b, Matrix& c); - void circularConvDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, + void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, Matrix& prevGrad2); void softmax(Matrix& output); @@ -1427,8 +1844,12 @@ public: void scaledTanh(Matrix& output, real p1, real p2); void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, Matrix& prevOut1, Matrix& prevOut2, - Matrix& prevGrad1, Matrix& prevGrad2, real scale); + void cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale); void print(std::ostream& os) const; void print(std::ostream& os, size_t height, size_t width) const; @@ -1449,19 +1870,28 @@ public: void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); - void addByBitCodeBackward(size_t numClasses, const IVector& codes, + void addByBitCodeBackward(size_t numClasses, + const IVector& codes, Matrix& vec); - void mulByBitCode(size_t numClasses, const IVector& codes, const Matrix& mat, + void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, const Matrix& input); - void mulByBitCodeBackwardWeight(size_t numClasses, const IVector& codes, - Matrix& mat, const Matrix& input); + void mulByBitCodeBackwardWeight(size_t numClasses, + const IVector& codes, + Matrix& mat, + const Matrix& input); - void mulByBitCodeBackwardError(size_t numClasses, const IVector& codes, - const Matrix& mat, Matrix& input); + void mulByBitCodeBackwardError(size_t numClasses, + const IVector& codes, + const Matrix& mat, + Matrix& input); - void sumByBitCode(size_t numClasses, IVector& codes, Matrix& sum, + void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, real scaleSum); 
void subByBitCode(size_t numClasses_, IVector& codes); @@ -1469,6 +1899,24 @@ public: void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); void classificationErrorMulti(Matrix& output, Matrix& label, real threshold); + + void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); }; class SharedCpuMatrix : public CpuMatrix { @@ -1478,20 +1926,25 @@ public: : CpuMatrix(height, width, trans) { initShared(blockNum); } - SharedCpuMatrix(int blockNum, real* data, size_t height, size_t width, - bool trans = false) + SharedCpuMatrix( + int blockNum, real* data, size_t height, size_t width, bool trans = false) : CpuMatrix(data, height, width, trans) { initShared(blockNum); } - SharedCpuMatrix(int blockNum, CpuMemHandlePtr dataHandle, size_t height, - size_t width, bool trans = false) + SharedCpuMatrix(int blockNum, + CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) : CpuMatrix(dataHandle, height, width, trans) { initShared(blockNum); } - SharedCpuMatrix(CpuMemHandlePtr dataHandle, size_t height, - size_t width, bool trans = false) + SharedCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) : CpuMatrix(dataHandle, height, width, trans) { initBlock(1); } diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/math/MatrixBitCode.cpp index 8497c26e35404a4de970bc2d28b23ebf1090ae6c..ac5b10c7bd56bb34393ac8abb98900351afc2e41 100644 --- a/paddle/math/MatrixBitCode.cpp +++ b/paddle/math/MatrixBitCode.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Logging.h" #include "paddle/utils/Util.h" #include "Matrix.h" @@ -80,8 +79,8 @@ private: op(tmat(i, j), vec(0, index(i, j))) */ template -static void addByBitCodeT(Op op, CodeTable codeTable, const IVector& codes, - TMat& tmat, Mat& vec) { +static void addByBitCodeT( + Op op, CodeTable codeTable, const IVector& codes, TMat& tmat, Mat& vec) { CHECK(!vec.useGpu()); size_t numClasses = codeTable.size(); @@ -109,7 +108,8 @@ static void addByBitCodeT(Op op, CodeTable codeTable, const IVector& codes, /* For j < codeLength: this(i, j) += vec(0, index(i, j)) */ -void CpuMatrix::addByBitCode(size_t numClasses, const IVector& codes, +void CpuMatrix::addByBitCode(size_t numClasses, + const IVector& codes, const Matrix& vec) { auto op = [](real& t, real v) { t += v; }; addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec); @@ -118,7 +118,8 @@ void CpuMatrix::addByBitCode(size_t numClasses, const IVector& codes, /* For j < codeLength: vec(0, index(i, j)) += this(i, j) */ -void CpuMatrix::addByBitCodeBackward(size_t numClasses, const IVector& codes, +void CpuMatrix::addByBitCodeBackward(size_t numClasses, + const IVector& codes, Matrix& vec) { auto op = [](real t, real& v) { v += t; }; addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec); @@ -129,10 +130,18 @@ void CpuMatrix::addByBitCodeBackward(size_t numClasses, const IVector& codes, for j < codeLength: op(tmat(i, j), mat.row(index(i, j)), input.row(i)) */ -template -void mulByBitCodeT(Op op, CodeTable codeTable, IVec& codes, TMat& tmat, - WMat& weight, InMat& input) { +void mulByBitCodeT(Op op, + CodeTable codeTable, + IVec& codes, + TMat& tmat, + WMat& weight, + InMat& input) { CHECK(!tmat.useGpu() && !weight.useGpu() && !input.useGpu()); size_t numClasses = codeTable.size(); @@ -161,10 +170,12 @@ void mulByBitCodeT(Op op, CodeTable codeTable, IVec& codes, TMat& tmat, /* For j < codeLength: this(i, j) += */ -void CpuMatrix::mulByBitCode(size_t numClasses, const IVector& codes, - const Matrix& weight, const Matrix& input) { - auto op = [](real& t, const real* weightRow, const real* inputRow, - size_t inputDim) { +void CpuMatrix::mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& weight, + const Matrix& input) { + auto op = []( + real& t, const real* weightRow, const real* inputRow, size_t inputDim) { real sum = 0; for (size_t k = 0; k < inputDim; ++k) { sum += weightRow[k] * inputRow[k]; @@ -179,14 +190,15 @@ void CpuMatrix::mulByBitCode(size_t numClasses, const IVector& codes, weight.row(index(i, j)) += this(i, j) * input.row(i) */ void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, Matrix& weight, + const IVector& codes, + Matrix& weight, const Matrix& input) { - auto op = - [](const real t, real* weightRow, const real* inputRow, size_t inputDim) { - for (size_t k = 0; k < inputDim; ++k) { - weightRow[k] += t * inputRow[k]; - } - }; + auto op = []( + const real t, real* weightRow, const real* inputRow, size_t inputDim) { + for (size_t k = 0; k < inputDim; ++k) { + weightRow[k] += t * inputRow[k]; + } + }; mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); } @@ -196,20 +208,24 @@ void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses, */ void CpuMatrix::mulByBitCodeBackwardError(size_t numClasses, const IVector& codes, - const Matrix& weight, Matrix& input) { - auto op = - [](const real t, const real* weightRow, real* inputRow, size_t inputDim) { - for (size_t k = 0; k < inputDim; ++k) { - inputRow[k] += t * 
weightRow[k]; - } - }; + const Matrix& weight, + Matrix& input) { + auto op = []( + const real t, const real* weightRow, real* inputRow, size_t inputDim) { + for (size_t k = 0; k < inputDim; ++k) { + inputRow[k] += t * weightRow[k]; + } + }; mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); } template -void sumByBitCodeT(CodeTable codeTable, IVector& codes, const CpuMatrix& tmat, - Matrix& sum, real scaleSum) { +void sumByBitCodeT(CodeTable codeTable, + IVector& codes, + const CpuMatrix& tmat, + Matrix& sum, + real scaleSum) { size_t maxCodeLength = codeTable.getMaxCodeLength(); size_t numSamples = tmat.getHeight(); size_t oWidth = tmat.getWidth(); @@ -237,7 +253,9 @@ void sumByBitCodeT(CodeTable codeTable, IVector& codes, const CpuMatrix& tmat, /* For j < codeLength: sum(i, 0) = \sum_j bit(i, j) * this(i, j) */ -void CpuMatrix::sumByBitCode(size_t numClasses, IVector& codes, Matrix& sum, +void CpuMatrix::sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, real scaleSum) { sumByBitCodeT(SimpleCodeTable(numClasses), codes, *this, sum, scaleSum); } diff --git a/paddle/math/MemoryHandle.cpp b/paddle/math/MemoryHandle.cpp index 11f746df5c2fb32175ebace1fd7dac3a2934cf9d..9101957fc6c221bed4aa8e0c76b4c6735e50fd2d 100644 --- a/paddle/math/MemoryHandle.cpp +++ b/paddle/math/MemoryHandle.cpp @@ -21,8 +21,7 @@ namespace paddle { /** * Calculate the actual allocation size according to the required size. */ -MemoryHandle::MemoryHandle(size_t size) - : size_(size), buf_(nullptr) { +MemoryHandle::MemoryHandle(size_t size) : size_(size), buf_(nullptr) { if (size_ <= 256) { // Memory allocation in cuda is always aligned to at least 256 bytes. // In many cases it is 512 bytes. @@ -44,9 +43,7 @@ GpuMemoryHandle::GpuMemoryHandle(size_t size) : MemoryHandle(size) { buf_ = allocator_->alloc(allocSize_); } -GpuMemoryHandle::~GpuMemoryHandle() { - allocator_->free(buf_, allocSize_); -} +GpuMemoryHandle::~GpuMemoryHandle() { allocator_->free(buf_, allocSize_); } CpuMemoryHandle::CpuMemoryHandle(size_t size) : MemoryHandle(size) { CHECK(size != 0) << " allocate 0 bytes"; @@ -54,8 +51,6 @@ CpuMemoryHandle::CpuMemoryHandle(size_t size) : MemoryHandle(size) { buf_ = allocator_->alloc(allocSize_); } -CpuMemoryHandle::~CpuMemoryHandle() { - allocator_->free(buf_, allocSize_); -} +CpuMemoryHandle::~CpuMemoryHandle() { allocator_->free(buf_, allocSize_); } } // namespace paddle diff --git a/paddle/math/MemoryHandle.h b/paddle/math/MemoryHandle.h index 809fba2d0a8963ba60f5abaa2d2daf415c2d985d..f12635d5d4b6ff7204d4d3e8d6f07d438c0ce1e8 100644 --- a/paddle/math/MemoryHandle.h +++ b/paddle/math/MemoryHandle.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -32,9 +31,9 @@ public: protected: PoolAllocator* allocator_; - size_t size_; // the requested size - size_t allocSize_; // the allocated size - int deviceId_; // the device id of memory if gpu memory + size_t size_; // the requested size + size_t allocSize_; // the allocated size + int deviceId_; // the device id of memory if gpu memory void* buf_; }; diff --git a/paddle/math/PoolAllocator.cpp b/paddle/math/PoolAllocator.cpp index 3a03496eb190ba6792708d9bcffd77cd0e45d4fc..2c150949dd4eca08824401685beecc19142cbd76 100644 --- a/paddle/math/PoolAllocator.cpp +++ b/paddle/math/PoolAllocator.cpp @@ -12,21 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PoolAllocator.h" namespace paddle { PoolAllocator::PoolAllocator(Allocator* allocator, - size_t sizeLimit, const std::string& name) + size_t sizeLimit, + const std::string& name) : allocator_(allocator), sizeLimit_(sizeLimit), poolMemorySize_(0), name_(name) {} -PoolAllocator::~PoolAllocator() { - freeAll(); -} +PoolAllocator::~PoolAllocator() { freeAll(); } void* PoolAllocator::alloc(size_t size) { if (sizeLimit_ > 0) { diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h index aca8ffb0ab42e10d76dc9fbaad657a8afab316e9..5d33b453127a5aaa355ba8c569baf1eefe931c96 100644 --- a/paddle/math/PoolAllocator.h +++ b/paddle/math/PoolAllocator.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/math/SIMDFunctions.cpp index 6147bed3d81112d57f03d23bbb6f5c2f327d4dc1..1fb156f29bbb586b6251f961bb4fd5f4d5da0737 100644 --- a/paddle/math/SIMDFunctions.cpp +++ b/paddle/math/SIMDFunctions.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - - #include "SIMDFunctions.h" #include #include @@ -85,7 +83,9 @@ static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { return; } -static void col_max_sse(float* result, const float* data, int dim, +static void col_max_sse(float* result, + const float* data, + int dim, int numSamples) { // first sample, direct copy for (int d = 0; d < dim; ++d) { @@ -195,7 +195,9 @@ static void batch_addto_avx(float* a, const float* b[], int batch, size_t len) { return; } -static void col_max_avx(float* result, const float* data, int dim, +static void col_max_avx(float* result, + const float* data, + int dim, int numSamples) { // first sample, direct copy for (int d = 0; d < dim; ++d) { @@ -289,8 +291,8 @@ static void decayL1_avx(float* dst, float* src, float lambda, size_t sz) { } } -static void decayL1_avx(float* dst, float* src, float* lr, float lambda, - size_t sz) { +static void decayL1_avx( + float* dst, float* src, float* lr, float lambda, size_t sz) { int64_t i; int64_t size = sz; float src_val; @@ -379,8 +381,8 @@ void colMaxImpl(float* result, const float* data, int dim, int numSamples) { void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) { decayL1_avx(dst, src, lambda, len); } -void decayL1AvxImpl(float* dst, float* src, float* lr, float lambda, - size_t len) { +void decayL1AvxImpl( + float* dst, float* src, float* lr, float lambda, size_t len) { decayL1_avx(dst, src, lr, lambda, len); } diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h index 2b984d5f96a620a95752231749a8b8b74f47d010..ac82f109104d7c21f346f909984306de105c0fd4 100644 --- a/paddle/math/SIMDFunctions.h +++ b/paddle/math/SIMDFunctions.h @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - - #pragma once #include #include @@ -123,8 +121,8 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len); void colMaxImpl(float* result, const float* data, int dim, int numSamples); #ifdef __AVX__ void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len); -void decayL1AvxImpl(float* dst, float* src, float* lr, float lambda, - size_t len); +void decayL1AvxImpl( + float* dst, float* src, float* lr, float lambda, size_t len); #endif } // namespace internal @@ -153,8 +151,8 @@ inline void decayL1(float* dst, float* src, float lambda, size_t len) { } template <> -inline void decayL1(float* dst, float* src, float* lr, float lambda, - size_t len) { +inline void decayL1( + float* dst, float* src, float* lr, float lambda, size_t len) { #ifdef __AVX__ internal::decayL1AvxImpl(dst, src, lr, lambda, len); #else diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp index 67ac0488623075729996aa603bd0e89c7ce98d9f..2b0bff9535d5a9ba4a47def4d6f964c799325535 100644 --- a/paddle/math/SparseMatrix.cpp +++ b/paddle/math/SparseMatrix.cpp @@ -22,18 +22,25 @@ limitations under the License. 
*/ namespace paddle { -GpuSparseMatrix::GpuSparseMatrix(size_t height, size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, +GpuSparseMatrix::GpuSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, bool trans) : Matrix(NULL, height, width, trans, true) { resize(height, width, nnz, valueType, format); } GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle, - hl_sparse_matrix_s_ptr sMatrix, size_t height, - size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, - bool trans, MemoryHandlePtr sMemoryHandle) + hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + MemoryHandlePtr sMemoryHandle) : Matrix(dataHandle, height, width, trans, true) { CHECK(dataHandle && sMatrix) << "Invalid argument pointer"; @@ -67,10 +74,14 @@ GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle, sparseResizeCSC(); } -GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, size_t height, - size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, - bool trans, MemoryHandlePtr sMemoryHandle) +GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + MemoryHandlePtr sMemoryHandle) : Matrix(NULL, height, width, trans, true) { CHECK(sMatrix) << "Invalid argument pointer"; sMatrix_ = sMatrix; @@ -80,9 +91,14 @@ GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, size_t height, valueType_ = valueType; } -GpuSparseMatrix::GpuSparseMatrix(real* value, int* rows, int* cols, - size_t height, size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, +GpuSparseMatrix::GpuSparseMatrix(real* value, + int* rows, + int* cols, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, bool trans) : Matrix(NULL, height, width, trans, true) { size_t size = 0; @@ -118,9 +134,15 @@ GpuSparseMatrix::GpuSparseMatrix(real* value, int* rows, int* cols, /* construct hl_sparse_matrix_s */ hl_sparse_matrix_s tmp; hl_construct_sparse_matrix( - &tmp, value, rows, cols, HL_SPARSE_CSR, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, height_, - width_, elementCnt_); + &tmp, + value, + rows, + cols, + HL_SPARSE_CSR, + valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, + elementCnt_); hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; } @@ -143,9 +165,15 @@ GpuSparseMatrix::GpuSparseMatrix(real* value, int* rows, int* cols, /* construct hl_sparse_matrix_s */ hl_sparse_matrix_s tmp; hl_construct_sparse_matrix( - &tmp, value, rows, cols, HL_SPARSE_CSC, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, height_, - width_, elementCnt_); + &tmp, + value, + rows, + cols, + HL_SPARSE_CSC, + valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, + elementCnt_); hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; } @@ -171,8 +199,13 @@ void GpuSparseMatrix::sparseResizeCSR() { /* construct hl_sparse_matrix_s */ hl_sparse_matrix_s tmp; hl_construct_sparse_matrix( - &tmp, data_, memoryHandle_->getSize(), HL_SPARSE_CSR, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, height_, width_, + &tmp, + data_, + memoryHandle_->getSize(), + HL_SPARSE_CSR, + valueType_ == NO_VALUE ? 
HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, elementCnt_); hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; @@ -197,16 +230,24 @@ void GpuSparseMatrix::sparseResizeCSC() { /* construct hl_sparse_matrix_s */ hl_sparse_matrix_s tmp; hl_construct_sparse_matrix( - &tmp, memoryHandle_->getBuf(), memoryHandle_->getSize(), HL_SPARSE_CSC, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, height_, width_, + &tmp, + memoryHandle_->getBuf(), + memoryHandle_->getSize(), + HL_SPARSE_CSC, + valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, elementCnt_); hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; } } -void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth, size_t newNnz, - SparseValueType valueType, SparseFormat format) { +void GpuSparseMatrix::resize(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType, + SparseFormat format) { if (format == SPARSE_CSR) { resizeCSR(newHeight, newWidth, newNnz, valueType); } else { @@ -214,8 +255,10 @@ void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth, size_t newNnz, } } -void GpuSparseMatrix::resizeCSR(size_t newHeight, size_t newWidth, - size_t newNnz, SparseValueType valueType) { +void GpuSparseMatrix::resizeCSR(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType) { size_t newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int); if (NO_VALUE != valueType) { newSize += newNnz * sizeof(real); @@ -266,8 +309,10 @@ void GpuSparseMatrix::resizeCSR(size_t newHeight, size_t newWidth, } } -void GpuSparseMatrix::resizeCSC(size_t newHeight, size_t newWidth, - size_t newNnz, SparseValueType valueType) { +void GpuSparseMatrix::resizeCSC(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType) { size_t newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int); if (NO_VALUE != valueType) { newSize += newNnz * sizeof(real); @@ -327,24 +372,37 @@ MatrixPtr GpuSparseMatrix::getTranspose() { CHECK(memoryHandle_.get() || sMatrix_) << "not supported"; if (memoryHandle_.get()) { MatrixPtr copy_T(new GpuSparseMatrix( - std::dynamic_pointer_cast(memoryHandle_), sMatrix_, - height_, width_, elementCnt_, valueType_, format_, true, + std::dynamic_pointer_cast(memoryHandle_), + sMatrix_, + height_, + width_, + elementCnt_, + valueType_, + format_, + true, sMemoryHandle_)); return copy_T; } else { - MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_, height_, width_, elementCnt_, - valueType_, format_, true, + MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_, + height_, + width_, + elementCnt_, + valueType_, + format_, + true, sMemoryHandle_)); return copy_T; } } -void GpuSparseMatrix::copyRow(int offsets, size_t colNum, +void GpuSparseMatrix::copyRow(int offsets, + size_t colNum, const sparse_non_value_t* row) { memcpy(cols_ + offsets, row, sizeof(int) * colNum); } -void GpuSparseMatrix::copyRow(int offsets, size_t colNum, +void GpuSparseMatrix::copyRow(int offsets, + size_t colNum, const sparse_float_value_t* row) { for (size_t j = 0; j < colNum; j++) { cols_[offsets + j] = row[j].col; @@ -368,7 +426,9 @@ void GpuSparseMatrix::copyFrom(const Matrix& src) { } template -void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data, +void GpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, + T* data, hl_stream_t stream) { CHECK_EQ(format_, SPARSE_CSR); size_t nnz = 0; @@ -377,7 +437,9 @@ void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data, nnz += 
indices[id + 1] - indices[id]; } - resize(height_, width_, nnz, + resize(height_, + width_, + nnz, sizeof(T) == sizeof(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE, format_); @@ -399,8 +461,10 @@ void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data, hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, stream); } -void GpuSparseMatrix::setRow(size_t row, size_t colNum, - const unsigned int* cols, const real* values) { +void GpuSparseMatrix::setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { CHECK_EQ(format_, SPARSE_CSR); if (NO_VALUE == valueType_) { CHECK_LT(row, height_); @@ -427,8 +491,8 @@ void GpuSparseMatrix::setRow(size_t row, size_t colNum, sMatrix_->rows = height_; sMatrix_->cols = width_; sMatrix_->nnz = elementCnt_; - hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, - HPPL_STREAM_DEFAULT); + hl_memcpy_csr_matrix( + sMatrix_.get(), value_, rows_, cols_, HPPL_STREAM_DEFAULT); } } @@ -438,8 +502,8 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { CHECK_EQ(format_, SPARSE_CSC); int nnz = sMatrix_->nnz; if (memAlloc) { - matTrans = std::make_shared(width_, height_, nnz, - valueType_, format_, false); + matTrans = std::make_shared( + width_, height_, nnz, valueType_, format_, false); } else { CHECK(matTrans != nullptr); } @@ -449,9 +513,14 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { CpuIVector cols_full(nnz); CpuVector value(nnz); hl_stream_t stream = HPPL_STREAM_1; - hl_memcpy_from_csc_matrix(value.getData(), nnz, rows.getData(), nnz, - cols.getData(), width_ + 1, - sMatrix_.get(), stream); + hl_memcpy_from_csc_matrix(value.getData(), + nnz, + rows.getData(), + nnz, + cols.getData(), + width_ + 1, + sMatrix_.get(), + stream); hl_stream_synchronize(stream); @@ -465,12 +534,14 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { /*sort row index and column index by the ascending order*/ for (int i = 0; i < nnz; i++) { - dataVec.emplace_back(rows.getData()[i], cols_full.getData()[i], - value.getData()[i]); + dataVec.emplace_back( + rows.getData()[i], cols_full.getData()[i], value.getData()[i]); } - std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) { - return a.row < b.row || (a.row == b.row && a.col < b.col); - }); + std::sort(dataVec.begin(), + dataVec.end(), + [](Element a, Element b) { + return a.row < b.row || (a.row == b.row && a.col < b.col); + }); /*get sorted data, row index, and col index, put them in the right place*/ cols.resize(height_ + 1); @@ -494,13 +565,18 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { /*copy back from cpu*/ GpuSparseMatrixPtr dest = std::dynamic_pointer_cast(matTrans); - hl_memcpy_csc_matrix((dest->sMatrix_).get(), value.getData(), - rows.getData(), cols.getData(), stream); + hl_memcpy_csc_matrix((dest->sMatrix_).get(), + value.getData(), + rows.getData(), + cols.getData(), + stream); hl_stream_synchronize(stream); } -void GpuSparseMatrix::mul(const GpuMatrixPtr a, const GpuMatrixPtr b, - real scaleAB, real scaleT) { +void GpuSparseMatrix::mul(const GpuMatrixPtr a, + const GpuMatrixPtr b, + real scaleAB, + real scaleT) { CHECK(a->useGpu_ && b->useGpu_) << "type not match"; CHECK(!trans_) << "trans not supported"; real* A_d = a->getData(); @@ -527,11 +603,13 @@ void GpuSparseMatrix::mul(const GpuMatrixPtr a, const GpuMatrixPtr b, int dimM = height_; int dimN = width_; int dimK = !b->trans_ ? 
b->getHeight() : b->getWidth(); - hl_sparse_matrix_mul(A_d, a_trans, B_d, b_trans, C_d, dimM, - dimN, dimK, scaleAB, scaleT); + hl_sparse_matrix_mul( + A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT); } -void GpuSparseMatrix::mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, +void GpuSparseMatrix::mul(const MatrixPtr a, + const MatrixPtr b, + real scaleAB, real scaleT) { if (std::dynamic_pointer_cast(a) && std::dynamic_pointer_cast(b)) { @@ -559,9 +637,14 @@ void GpuSparseMatrix::print(std::ostream& os) const { IVectorPtr cols = IVector::create(width_ + 1, false); VectorPtr value = Vector::create(nnz, false); hl_stream_t stream = HPPL_STREAM_DEFAULT; - hl_memcpy_from_csc_matrix( - value->getData(), value->getSize(), rows->getData(), rows->getSize(), - cols->getData(), cols->getSize(), sMatrix_.get(), stream); + hl_memcpy_from_csc_matrix(value->getData(), + value->getSize(), + rows->getData(), + rows->getSize(), + cols->getData(), + cols->getSize(), + sMatrix_.get(), + stream); hl_stream_synchronize(stream); printBuf(os, cols->getData(), width_ + 1, "col idx"); @@ -574,11 +657,10 @@ void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) { trans_ = src.trans_; size_t nnz = src.getElementCnt(); - resize(src.getHeight(), src.getWidth(), nnz, valueType_, - src.getFormat()); + resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat()); // if have different value type, only copy rows and cols SparseValueType vType = - valueType_ != src.getValueType() ? NO_VALUE : valueType_; + valueType_ != src.getValueType() ? NO_VALUE : valueType_; sMatrix_->format = HL_SPARSE_CSR; sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; @@ -588,7 +670,9 @@ void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) { hl_memcpy_csr_matrix(sMatrix_.get(), vType == NO_VALUE ? NULL : src.getValue(), - src.getRows(), src.getCols(), stream); + src.getRows(), + src.getCols(), + stream); // restore type of sMatrix_ sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; @@ -598,12 +682,11 @@ void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) { trans_ = src.trans_; size_t nnz = src.getElementCnt(); - resize(src.getHeight(), src.getWidth(), nnz, valueType_, - src.getFormat()); + resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat()); // if have different value type, only copy rows and cols SparseValueType vType = - valueType_ != src.getValueType() ? NO_VALUE : valueType_; + valueType_ != src.getValueType() ? NO_VALUE : valueType_; sMatrix_->format = HL_SPARSE_CSC; sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; @@ -613,7 +696,9 @@ void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) { hl_memcpy_csc_matrix(sMatrix_.get(), vType == NO_VALUE ? NULL : src.getValue(), - src.getRows(), src.getCols(), stream); + src.getRows(), + src.getCols(), + stream); // restore type of sMatrix_ sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; @@ -622,23 +707,24 @@ void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) { void GpuSparseMatrix::copyFrom(GpuSparseMatrix& src, hl_stream_t stream) { CHECK(trans_ == src.trans_); CHECK(format_ == src.getFormat()); - resize(src.getHeight(), src.getWidth(), elementCnt_, valueType_, + resize(src.getHeight(), + src.getWidth(), + elementCnt_, + valueType_, src.getFormat()); size_t rowSize = format_ == SPARSE_CSC ? 
elementCnt_ : height_ + 1; size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_; if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) { - hl_memcpy_async(getValue(), src.getValue(), - sizeof(real) * elementCnt_, stream); + hl_memcpy_async( + getValue(), src.getValue(), sizeof(real) * elementCnt_, stream); } CHECK(getRows()); CHECK(src.getRows()); - hl_memcpy_async(getRows(), src.getRows(), - sizeof(int) * rowSize, stream); - hl_memcpy_async(getCols(), src.getCols(), - sizeof(int) * colSize, stream); + hl_memcpy_async(getRows(), src.getRows(), sizeof(int) * rowSize, stream); + hl_memcpy_async(getCols(), src.getCols(), sizeof(int) * colSize, stream); } void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) { @@ -652,7 +738,8 @@ void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) { void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) { trans_ = src.trans_; int* srcCols = src.getCols(); - size_t nnz = std::count_if(srcCols, srcCols + src.getElementCnt(), + size_t nnz = std::count_if(srcCols, + srcCols + src.getElementCnt(), [this](size_t n) { return n < this->width_; }); resize(height_, width_, nnz, valueType_, format_); @@ -678,9 +765,11 @@ void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) { sMatrix_->cols = width_; sMatrix_->nnz = nnz; - hl_memcpy_csr_matrix( - sMatrix_.get(), valueType_ == NO_VALUE ? NULL : value_, rows_, cols_, - /*default stream = */ HPPL_STREAM_DEFAULT); + hl_memcpy_csr_matrix(sMatrix_.get(), + valueType_ == NO_VALUE ? NULL : value_, + rows_, + cols_, + /*default stream = */ HPPL_STREAM_DEFAULT); } void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) { @@ -703,9 +792,11 @@ void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) { sMatrix_->cols = width_; sMatrix_->nnz = nnz; - hl_memcpy_csc_matrix( - sMatrix_.get(), valueType_ == NO_VALUE ? NULL : value_, rows_, cols_, - /*default stream = */ HPPL_STREAM_DEFAULT); + hl_memcpy_csc_matrix(sMatrix_.get(), + valueType_ == NO_VALUE ? NULL : value_, + rows_, + cols_, + /*default stream = */ HPPL_STREAM_DEFAULT); } void GpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { @@ -766,10 +857,12 @@ void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { #endif } -template void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, +template void GpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, sparse_non_value_t* data, hl_stream_t stream); -template void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, +template void GpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, sparse_float_value_t* data, hl_stream_t stream); } // namespace paddle diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h index 4b9a03302bf531a08b889a4b15d36fc8e71458dd..175ef54b858b7f8f31f45796d733af81a9d67066 100644 --- a/paddle/math/SparseMatrix.h +++ b/paddle/math/SparseMatrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include #include "Matrix.h" @@ -35,25 +34,41 @@ public: SparseFormat format_; public: - GpuSparseMatrix(size_t height, size_t width, + GpuSparseMatrix(size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType = FLOAT_VALUE, - SparseFormat format_ = SPARSE_CSR, bool trans = false); + SparseFormat format_ = SPARSE_CSR, + bool trans = false); - GpuSparseMatrix(GpuMemHandlePtr dataHandle, hl_sparse_matrix_s_ptr sMatrix, - size_t height, size_t width, + GpuSparseMatrix(GpuMemHandlePtr dataHandle, + hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType = FLOAT_VALUE, - SparseFormat format_ = SPARSE_CSR, bool trans = false, + SparseFormat format_ = SPARSE_CSR, + bool trans = false, MemoryHandlePtr sMemoryHandle = NULL); - GpuSparseMatrix(real* value, int* rows, int* cols, size_t height, - size_t width, size_t nnz, SparseValueType valueType, - SparseFormat format, bool trans); - - GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, size_t height, size_t width, - size_t nnz, SparseValueType valueType, SparseFormat format, - bool trans, MemoryHandlePtr sMemoryHandle); + GpuSparseMatrix(real* value, + int* rows, + int* cols, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans); + + GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + MemoryHandlePtr sMemoryHandle); protected: struct Element { @@ -67,9 +82,11 @@ protected: public: ~GpuSparseMatrix() {} - void resize(size_t newHeight, size_t newWidth, + void resize(size_t newHeight, + size_t newWidth, size_t newNnz, /* used to allocate space */ - SparseValueType valueType, SparseFormat format); + SparseValueType valueType, + SparseFormat format); void resize(size_t newHeight, size_t newWidth); @@ -77,13 +94,19 @@ public: void sparseResizeCSC(); - void resizeCSR(size_t newHeight, size_t newWidth, size_t newNnz, + void resizeCSR(size_t newHeight, + size_t newWidth, + size_t newNnz, SparseValueType valueType); - void resizeCSC(size_t newHeight, size_t newWidth, size_t newNnz, + void resizeCSC(size_t newHeight, + size_t newWidth, + size_t newNnz, SparseValueType valueType); - void mul(const GpuMatrixPtr a, const GpuMatrixPtr b, real scaleAB, + void mul(const GpuMatrixPtr a, + const GpuMatrixPtr b, + real scaleAB, real scaleT); /// B = A , B.trans = !A.trans MatrixPtr getTranspose(); @@ -104,7 +127,9 @@ public: template void copyFrom(int64_t* ids, int64_t* indices, T* data, hl_stream_t stream); - void setRow(size_t row, size_t colNum, const unsigned int* cols, + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, const real* values); SparseValueType getValueType() const; SparseFormat getFormat() const { return format_; } @@ -173,7 +198,7 @@ public: * getData is convenient to get value */ real* getData() { return getValue(); } - const real* getData() const { return getValue();} + const real* getData() const { return getValue(); } /** * @brief Get top k value of each row in sparse matrix. 
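  // For orientation, the resize/resizeCSR/resizeCSC declarations above size
  // three parallel arrays. As a concrete picture (illustrative values, not
  // taken from the diff): a 3x4 CSR matrix with nnz = 4 and FLOAT_VALUE keeps
  //
  //   rows_  : height + 1 ints of row offsets, e.g. {0, 1, 3, 4}
  //   cols_  : nnz ints, the column of each nonzero, e.g. {2, 0, 3, 1}
  //   value_ : nnz reals, the nonzero values themselves
  //
  // which matches the byte count computed in resizeCSR:
  //   newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int)
  //           + (NO_VALUE != valueType ? newNnz * sizeof(real) : 0);
  // CSC swaps the roles: width + 1 column offsets in cols_, row ids in rows_.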
@@ -204,9 +229,7 @@ public: // BaseMatrixT interface public: - bool isSparse() const { - return true; - } + bool isSparse() const { return true; } private: using Matrix::mul; diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp index 0b5de252258a9691d8c44193693fc60463bd0e62..eefaf4b71f4f027d00405bd4b158adc66a902ef7 100644 --- a/paddle/math/SparseRowMatrix.cpp +++ b/paddle/math/SparseRowMatrix.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "SparseRowMatrix.h" #include "CpuSparseMatrix.h" @@ -26,7 +25,8 @@ limitations under the License. */ #include "paddle/utils/Util.h" #include "paddle/utils/Thread.h" -P_DEFINE_bool(allow_inefficient_sparse_update, false, +P_DEFINE_bool(allow_inefficient_sparse_update, + false, "Whether to allow inefficient sparse update"); namespace paddle { @@ -45,7 +45,9 @@ void SparseRowCpuMatrix::init(size_t height, size_t width) { globalIndices_ = indexDictHandle_->globalIndices.data(); } -void SparseRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, +void SparseRowCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, real scaleT) { CpuMatrix::mul(a, b, this, scaleAB, scaleT); } @@ -55,24 +57,25 @@ void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) { } void SparseRowCpuMatrix::zeroMem() { - apply( - [](real* buf, size_t len) { - memset(buf, 0, sizeof(real) * len); - }); + apply([](real* buf, size_t len) { memset(buf, 0, sizeof(real) * len); }); clearRows(); } void SparseRowCpuMatrix::applyL1Decay(real learningRate, real decayRate) { apply([=](real* buf, size_t len) { - CpuVector value(0, nullptr); - value.subVecFrom(buf, 0, len); - value.applyL1(learningRate, decayRate); - }); + CpuVector value(0, nullptr); + value.subVecFrom(buf, 0, len); + value.applyL1(learningRate, decayRate); + }); } -void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, IVector& t0, - real learningRate, int currentTime, - real decayRate, bool useL1, bool fini) { +void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, + IVector& t0, + real learningRate, + int currentTime, + real decayRate, + bool useL1, + bool fini) { std::vector& localIndices = indexDictHandle_->localIndices; // t0 and value are vectors @@ -124,7 +127,7 @@ void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, IVector& t0, for (size_t j = 0; j < this->width_; ++j) { v[j] -= learningRate * g[j]; } - simd::decayL1(v, v, learningRate*decayRate, this->width_); + simd::decayL1(v, v, learningRate * decayRate, this->width_); // state update to t+1 t[0] = currentTime + 1; @@ -173,8 +176,10 @@ void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, IVector& t0, } } -void SparseRowCpuMatrix::addTo(BaseMatrix& dest, std::vector& ids, - size_t tid, size_t numThreads) { +void SparseRowCpuMatrix::addTo(BaseMatrix& dest, + std::vector& ids, + size_t tid, + size_t numThreads) { CHECK(!dest.useGpu_); CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_); @@ -182,14 +187,14 @@ void SparseRowCpuMatrix::addTo(BaseMatrix& dest, std::vector& ids, for (size_t i = 0; i < localIndices.size(); ++i) { uint32_t id = localIndices[i]; if (id % numThreads == tid) { - simd::addTo(dest.rowBuf(id), getLocalRow(i), - this->width_); + simd::addTo(dest.rowBuf(id), getLocalRow(i), this->width_); ids.push_back(id); } } } -void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest, size_t tid, +void 
SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest, + size_t tid, size_t numThreads) { CHECK(!dest.useGpu_); CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_); @@ -214,25 +219,35 @@ void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) { } } -void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, - real scaleAB, real scaleT) { - CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(a, b, this, scaleAB, - scaleT); +void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, + real scaleT) { + CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>( + a, b, this, scaleAB, scaleT); } -void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, +void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, real scaleT) { CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(a, b, this, scaleAB, scaleT); } void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) { std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices; + for (size_t i = 0; i < len; i++) { + CHECK_LT(*(ids + i), this->getHeight()) + << "id: " << *(ids + i) << ", height: " << this->getHeight() + << "; sparse id value exceeds the max input dimension, " + << "it could be caused by invalid input data samples"; + } localIndices.insert(localIndices.end(), ids, ids + len); } void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) { CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get()); - CHECK(mat) << "only support non value sparse matrix"; + CHECK(mat) << "only support sparse matrix"; addRows(reinterpret_cast<const unsigned int*>(mat->getCols()), mat->getElementCnt()); } @@ -243,7 +258,13 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) { int* index = ids->getData(); for (size_t i = 0; i < numSamples; ++i) { if (index[i] == -1) continue; - localIndices.push_back((unsigned int)index[i]); + + unsigned int id = (unsigned int)index[i]; + CHECK_LT(id, this->getHeight()) + << "id: " << id << ", height: " << this->getHeight() + << "; sparse id value exceeds the max input dimension, " + << "it could be caused by invalid input data samples"; + localIndices.push_back(id); } } diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h index 2dcd81188d6431c317e82ee35e968cddfb334f59..56f113a3614e2e22809abbdaa708557ed3344464 100644 --- a/paddle/math/SparseRowMatrix.h +++ b/paddle/math/SparseRowMatrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -41,12 +40,15 @@ public: /// heightStore is max number of rows of the sparse matrix. SparseRowCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, size_t width, - IndexDictPtr indexDictHandle = nullptr, bool trans = false) + size_t height, + size_t width, + IndexDictPtr indexDictHandle = nullptr, + bool trans = false) : CpuMatrix(nullptr, height, width, trans), storeMat_(dataHandle, dataHandle ? dataHandle->getSize() / sizeof(real) / width : 0, - width, trans), + width, + trans), indexDictHandle_(indexDictHandle) { init(height, width); } @@ -123,8 +125,12 @@ public: * When a pass has finished, the caller should call this function one more * time with (fini=true) to let weight decay catch up to the current time.
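   *
   * A minimal sketch of the catch-up idea (illustrative only: plain L2-style
   * decay and invented names, not the exact simd::decayL1 update used in the
   * .cpp):
   * @code
   * // t0[r] remembers the timestep up to which row r has been decayed.
   * // When row r is touched again at currentTime, apply the decay it
   * // missed in one shot, then advance its clock:
   * int elapsed = currentTime + 1 - t0[r];
   * value.row(r) *= std::pow(1.0f - learningRate * decayRate, elapsed);
   * t0[r] = currentTime + 1;
   * @endcode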
*/ - void sgdUpdate(BaseMatrix& value, IVector& t0, real learningRate, - int currentTime, real decayRate, bool useL1, + void sgdUpdate(BaseMatrix& value, + IVector& t0, + real learningRate, + int currentTime, + real decayRate, + bool useL1, bool fini = false); /** @@ -135,7 +141,9 @@ public: * ids occurring in *this* are appended to *ids*, * filtered by (id % numThreads == tid) */ - void addTo(BaseMatrix& dest, std::vector<uint32_t>& ids, size_t tid, + void addTo(BaseMatrix& dest, + std::vector<uint32_t>& ids, + size_t tid, size_t numThreads); /** @@ -166,7 +174,7 @@ public: } protected: - template + template void apply(Func f) { real* data = storeMat_.getData() ? storeMat_.getData() : rowStore_.data(); f(data, localIndices_->size() * width_); } @@ -211,9 +219,11 @@ class SyncThreadPool; class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix { public: SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, size_t width, + size_t height, + size_t width, IndexDictPtr indexDictHandle = nullptr, - SyncThreadPool* pool = nullptr, bool trans = false) + SyncThreadPool* pool = nullptr, + bool trans = false) : SparseRowCpuMatrix(dataHandle, height, width, indexDictHandle, trans), pool_(pool) {} @@ -239,7 +249,8 @@ protected: class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix { public: - SparseAutoGrowRowCpuMatrix(size_t height, size_t width, + SparseAutoGrowRowCpuMatrix(size_t height, + size_t width, IndexDictPtr indexDictHandle = nullptr, bool trans = false) : SparseRowCpuMatrix(nullptr, height, width, indexDictHandle, trans) {} @@ -261,8 +272,10 @@ public: class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix { public: - CacheRowCpuMatrix(size_t height, size_t width, - IndexDictPtr indexDictHandle = nullptr, bool trans = false) + CacheRowCpuMatrix(size_t height, + size_t width, + IndexDictPtr indexDictHandle = nullptr, + bool trans = false) : SparseAutoGrowRowCpuMatrix(height, width, indexDictHandle, trans), sourceData_(nullptr) {} @@ -277,8 +290,8 @@ public: id = globalIndices_[row] = localIndices_->size(); localIndices_->push_back(row); checkStoreSize(); - memcpy(getLocalRow(id), sourceData_ + width_ * row, - sizeof(float) * width_); + memcpy( + getLocalRow(id), sourceData_ + width_ * row, sizeof(float) * width_); } return getLocalRow(id); } @@ -300,7 +313,9 @@ public: */ class SparseRowIdsCpuMatrix : public CpuMatrix { public: - SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle, size_t height, size_t width, + SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, bool trans = false) : CpuMatrix(dataHandle, height, width, trans) {} diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 0403c3521cf54d833b32ff0810ba6d29dfc8f3c6..57ea5c926647d21a82c87fc262e2999e45e7534f 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "Allocator.h" #include "Storage.h" -P_DEFINE_int32(pool_limit_size, 536870912, +P_DEFINE_int32(pool_limit_size, + 536870912, "maximum memory size managed by a memory pool, default is 512M"); namespace paddle { @@ -25,11 +25,10 @@ namespace paddle { // Initialize the StorageEngine singleton. // Other modules may rely on storage management, // so StorageEngine needs to be initialized before other modules.
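// pool_limit_size above caps how many freed bytes each PoolAllocator keeps
// cached for reuse. A rough sketch of the free-path policy this implies
// (sizeLimit_, poolMemorySize_, allocator_, and freeAll() appear in
// PoolAllocator.cpp earlier in this diff; the bucket map and exact control
// flow here are assumptions for illustration):
//
//   void PoolAllocator::free(void* ptr, size_t size) {
//     if (sizeLimit_ > 0 && poolMemorySize_ + size <= sizeLimit_) {
//       pool_[size].push_back(ptr);   // cache it; a later alloc(size) reuses it
//       poolMemorySize_ += size;
//     } else {
//       allocator_->free(ptr, size);  // over budget: return to the system
//     }
//   }
//
// freeAll() in the destructor then drains whatever is still cached.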
-static InitFunction __init_storage_engine([](){StorageEngine::singleton();}, +static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); }, std::numeric_limits::max()); -StorageEngine::StorageEngine() : cpuAllocator_(nullptr) { -} +StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {} StorageEngine::~StorageEngine() { if (cpuAllocator_) { @@ -49,8 +48,8 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { { // if gpuAllocator_ has been constructed ReadLockGuard guard(lock_); - if (deviceId < static_cast(gpuAllocator_.size()) - && (gpuAllocator_[deviceId] != nullptr)) { + if (deviceId < static_cast(gpuAllocator_.size()) && + (gpuAllocator_[deviceId] != nullptr)) { return gpuAllocator_[deviceId]; } } @@ -63,9 +62,9 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { } if (gpuAllocator_[deviceId] == nullptr) { std::string name = - "gpu" + std::to_string(deviceId) + std::string("_pool"); - gpuAllocator_[deviceId] = new PoolAllocator( - new GpuAllocator(), FLAGS_pool_limit_size, name); + "gpu" + std::to_string(deviceId) + std::string("_pool"); + gpuAllocator_[deviceId] = + new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name); } return gpuAllocator_[deviceId]; } @@ -86,10 +85,10 @@ PoolAllocator* StorageEngine::getCpuAllocator() { if (cpuAllocator_ == nullptr) { if (FLAGS_use_gpu) { cpuAllocator_ = new PoolAllocator( - new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool"); + new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool"); } else { cpuAllocator_ = new PoolAllocator( - new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool"); + new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool"); } } return cpuAllocator_; diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp index 7553ea25e09d2f52f1f8b9205f954510b77cbfa9..b2ade83138428a510e6be1bfa82290008e4167d0 100644 --- a/paddle/math/Vector.cpp +++ b/paddle/math/Vector.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "Vector.h" @@ -21,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/utils/ThreadLocal.h" #include "paddle/utils/Thread.h" #include "paddle/utils/Flags.h" +#include "Matrix.h" #include "hl_gpu.h" #include "hl_table_apply.h" @@ -48,7 +48,8 @@ std::shared_ptr> VectorT::createParallelVector( } template -std::shared_ptr> VectorT::create(T* data, size_t size, +std::shared_ptr> VectorT::create(T* data, + size_t size, bool useGpu) { if (useGpu) { return std::make_shared>(size, data); @@ -62,10 +63,10 @@ std::shared_ptr> VectorT::create(size_t size, MemoryHandlePtr memoryHandle, size_t offset) { if (auto cpuMemHandle = - std::dynamic_pointer_cast(memoryHandle)) { + std::dynamic_pointer_cast(memoryHandle)) { return std::make_shared>(size, cpuMemHandle, offset); } else if (auto gpuMemHandle = - std::dynamic_pointer_cast(memoryHandle)) { + std::dynamic_pointer_cast(memoryHandle)) { return std::make_shared>(size, gpuMemHandle, offset); } else { LOG(FATAL) << "Wrong"; @@ -73,23 +74,47 @@ std::shared_ptr> VectorT::create(size_t size, } } +template <> +MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) { + LOG(FATAL) << "Wrong for real vector"; + return nullptr; +} + +template <> +MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) { + size_t height = getSize(); + size_t width = idRange; + MatrixPtr mat = Matrix::createSparseMatrix( + height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu); + + CpuIVector cpuIds(height); + cpuIds.copyFrom(*this); + int* idData = cpuIds.getData(); + + for (decltype(height) i = 0; i < height; i++) { + const unsigned int id = idData[i]; + CHECK_LT(id, width); + mat->setRow(i, 1, &id, nullptr); + } + return mat; +} + template GpuVectorT::GpuVectorT(size_t size) - : VectorT(size, std::make_shared(sizeof(T) * size), + : VectorT(size, + std::make_shared(sizeof(T) * size), 0, /* offset = 0 */ true /* useGpu = true */) {} template T GpuVectorT::getElement(size_t i) const { T elem = 0; - hl_memcpy_device2host(&elem, const_cast(&this->getData()[i]), - sizeof(T)); + hl_memcpy_device2host(&elem, const_cast(&this->getData()[i]), sizeof(T)); return elem; } template void GpuVectorT::setElement(size_t i, const T& value) { - hl_memcpy_host2device(&this->getData()[i], const_cast(&value), - sizeof(T)); + hl_memcpy_host2device(&this->getData()[i], const_cast(&value), sizeof(T)); } template @@ -193,8 +218,7 @@ real GpuVectorT::getMin() { template T GpuVectorT::get(size_t pos) { T val = (T)0; - hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), - sizeof(T)); + hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), sizeof(T)); return val; } @@ -203,7 +227,7 @@ void GpuVectorT::histogram(std::ostream& os, int type) { LOG(FATAL) << "Not implemented"; } -template +template void GpuVectorT::zeroMem() { BaseMatrixT::zero(); } @@ -226,8 +250,10 @@ void GpuVectorT::copyFrom(const VectorT& src) { template void GpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { CHECK_EQ(src.getSize(), this->getSize()); - hl_memcpy_async((void*)this->getData(), (void*)src.getData(), - sizeof(T) * this->getSize(), stream); + hl_memcpy_async((void*)this->getData(), + (void*)src.getData(), + sizeof(T) * this->getSize(), + stream); } template @@ -243,15 +269,16 @@ void GpuVectorT::copyFrom(const T* gpuSrc, size_t size, hl_stream_t stream) { CHECK(gpuSrc != NULL); CHECK_LE(size, this->size_); - hl_memcpy_async((void*)this->getData(), (void*)gpuSrc, - sizeof(T) * size, stream); + hl_memcpy_async( + (void*)this->getData(), (void*)gpuSrc, sizeof(T) * size, stream); } template void 
GpuVectorT::copyTo(CpuVectorT* dest) const { CHECK_EQ(this->getSize(), dest->getSize()); - hl_memcpy_device2host((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_device2host((void*)dest->getData(), + (void*)this->getData(), sizeof(T) * this->getSize()); } @@ -259,7 +286,8 @@ template void GpuVectorT::copyTo(GpuVectorT* dest) const { CHECK_EQ(this->getSize(), dest->getSize()); - hl_memcpy_device2device((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_device2device((void*)dest->getData(), + (void*)this->getData(), sizeof(T) * this->getSize()); } @@ -271,7 +299,8 @@ void GpuVectorT::rand() { template <> void GpuVectorT::print(std::ostream& os, size_t num) const { IVectorPtr dest = IVector::create(this->size_, false); - hl_memcpy_device2host((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_device2host((void*)dest->getData(), + (void*)this->getData(), sizeof(int) * this->getSize()); dest->print(os, num); } @@ -279,7 +308,8 @@ void GpuVectorT::print(std::ostream& os, size_t num) const { template <> void GpuVectorT::print(std::ostream& os, size_t num) const { VectorPtr dest = Vector::create(this->size_, false); - hl_memcpy_device2host((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_device2host((void*)dest->getData(), + (void*)this->getData(), sizeof(int) * this->getSize()); dest->print(os, num); } @@ -402,8 +432,8 @@ void GpuVectorT::randnorm(real mean, real std) { CpuVector cpuVec = CpuVector(this->getSize()); cpuVec.randnorm(mean, std); - hl_memcpy_host2device(data_, cpuVec.getData(), - this->getSize() * sizeof(real)); + hl_memcpy_host2device( + data_, cpuVec.getData(), this->getSize() * sizeof(real)); } template <> @@ -411,19 +441,22 @@ void GpuVectorT::uniform(real left, real right) { CpuVector cpuVec = CpuVector(this->getSize()); cpuVec.uniform(left, right); - hl_memcpy_host2device(data_, cpuVec.getData(), - this->getSize() * sizeof(real)); + hl_memcpy_host2device( + data_, cpuVec.getData(), this->getSize() * sizeof(real)); } template CpuVectorT::CpuVectorT(size_t size) - : VectorT(size, std::make_shared(sizeof(T) * size), + : VectorT(size, + std::make_shared(sizeof(T) * size), 0, /* offset = 0 */ false /* useGpu = false */) {} template CpuVectorT::CpuVectorT(const VectorT& src) - : VectorT(src.getSize(), src.getMemoryHandle(), 0, /* offset = 0 */ + : VectorT(src.getSize(), + src.getMemoryHandle(), + 0, /* offset = 0 */ false /* useGpu = false */) { if (typeid(*this->memoryHandle_.get()) != typeid(CpuMemoryHandle)) { this->memoryHandle_ = @@ -620,8 +653,10 @@ void CpuVectorT::copyFrom(const VectorT& src) { template void CpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { if (typeid(src) == typeid(GpuVectorT)) { - hl_memcpy_async((void*)this->getData(), (void*)src.getData(), - sizeof(T) * this->getSize(), stream); + hl_memcpy_async((void*)this->getData(), + (void*)src.getData(), + sizeof(T) * this->getSize(), + stream); } else { src.copyTo(this); } @@ -635,7 +670,8 @@ void CpuVectorT::copyFrom(const T* hostSrc, size_t size) { } template -void CpuVectorT::copyFrom(const T* hostSrc, size_t size, +void CpuVectorT::copyFrom(const T* hostSrc, + size_t size, hl_stream_t stream) { (void)stream; @@ -653,7 +689,8 @@ void CpuVectorT::copyTo(CpuVectorT* dest) const { template void CpuVectorT::copyTo(GpuVectorT* dest) const { CHECK_EQ(this->getSize(), dest->getSize()); - hl_memcpy_host2device((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_host2device((void*)dest->getData(), + (void*)this->getData(), sizeof(T) * this->getSize()); } @@ 
-697,8 +734,8 @@ void ParallelCpuVectorT::parallelExec(ExecFunc func) { template <> void ParallelCpuVectorT::parallelExec(ExecFunc func) { pool_->exec([this, func](int tid, size_t numThreads) { - auto interval = calcSplitArrayInterval(this->getSize(), (size_t)tid, - numThreads, 8LU /*for avx*/); + auto interval = calcSplitArrayInterval( + this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); // setup sub bufs CpuVector subVec(0, nullptr); subVec.subVecFrom(*this, interval); @@ -717,7 +754,8 @@ void ParallelCpuVectorT::exec(SyncThreadPool::JobFunc func) { } template -CpuGpuVectorT::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) { +CpuGpuVectorT::CpuGpuVectorT(size_t size, bool useGpu) + : sync_(nullptr) { if (!useGpu) { cpuVectorT_ = std::make_shared>(size); } else { @@ -728,7 +766,7 @@ CpuGpuVectorT::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) { template CpuGpuVectorT::CpuGpuVectorT(const std::shared_ptr>& src) - : sync_(nullptr) { + : sync_(nullptr) { bool useGpu = src->useGpu(); if (useGpu) { gpuVectorT_ = src; @@ -740,7 +778,7 @@ CpuGpuVectorT::CpuGpuVectorT(const std::shared_ptr>& src) template CpuGpuVectorT::CpuGpuVectorT(size_t size, T* data, bool useGpu) - : sync_(nullptr) { + : sync_(nullptr) { if (!useGpu) { cpuVectorT_ = std::make_shared>(size, data); setSync(DATA_AT_CPU); @@ -751,8 +789,8 @@ CpuGpuVectorT::CpuGpuVectorT(size_t size, T* data, bool useGpu) } template -std::shared_ptr> -CpuGpuVectorT::create(size_t size, bool useGpu) { +std::shared_ptr> CpuGpuVectorT::create(size_t size, + bool useGpu) { return std::make_shared>(size, useGpu); } @@ -783,9 +821,9 @@ void CpuGpuVectorT::resize(size_t size, bool useGpu) { } template -void CpuGpuVectorT::resizeOrCreate( - std::shared_ptr>& vec, - size_t size, bool useGpu) { +void CpuGpuVectorT::resizeOrCreate(std::shared_ptr>& vec, + size_t size, + bool useGpu) { if (vec) { vec->resize(size, useGpu); } else { @@ -807,7 +845,9 @@ void CpuGpuVectorT::resizeOrCreate(size_t size, bool useGpu) { template CpuGpuVectorT::CpuGpuVectorT(CpuGpuVectorT& src, - size_t offset, size_t size) : sync_(nullptr) { + size_t offset, + size_t size) + : sync_(nullptr) { CHECK_LE(offset + size, static_cast(src.getSize())); #ifndef PADDLE_ONLY_CPU SyncedFlag* flag = src.getSync(); @@ -818,21 +858,21 @@ CpuGpuVectorT::CpuGpuVectorT(CpuGpuVectorT& src, } #endif auto cMemHandle = (src.getVector(false))->getMemoryHandle(); - cpuVectorT_ = std::make_shared>(size, - std::dynamic_pointer_cast(cMemHandle), offset); + cpuVectorT_ = std::make_shared>( + size, std::dynamic_pointer_cast(cMemHandle), offset); #ifndef PADDLE_ONLY_CPU auto gMemHandle = (src.getVector(true))->getMemoryHandle(); - gpuVectorT_ = std::make_shared>(size, - std::dynamic_pointer_cast(gMemHandle), offset); + gpuVectorT_ = std::make_shared>( + size, std::dynamic_pointer_cast(gMemHandle), offset); src.setSync(SYNCED); #endif setSync(src.getSync()); } template -std::shared_ptr> -CpuGpuVectorT::getVector(bool useGpu) const { - auto * self = const_cast*>(this); +std::shared_ptr> CpuGpuVectorT::getVector( + bool useGpu) const { + auto* self = const_cast*>(this); if (useGpu) { self->copyToGpu(); return std::const_pointer_cast>(gpuVectorT_); @@ -938,8 +978,10 @@ void CpuGpuVectorT::copyFrom(const T* data, size_t size, bool useGpu) { } template -void CpuGpuVectorT::copyFrom(const T* data, size_t size, - hl_stream_t stream, bool useGpu) { +void CpuGpuVectorT::copyFrom(const T* data, + size_t size, + hl_stream_t stream, + bool useGpu) { if (useGpu) { copyToGpu(data, size, stream); 
} else { @@ -949,7 +991,10 @@ void CpuGpuVectorT::copyFrom(const T* data, size_t size, template void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, - size_t offset, size_t size, bool useGpu, hl_stream_t stream) { + size_t offset, + size_t size, + bool useGpu, + hl_stream_t stream) { if (useGpu) { VectorT::resizeOrCreate(gpuVectorT_, size, true); gpuVectorT_->copyFrom(src.getData(true) + offset, size, stream); @@ -961,8 +1006,7 @@ void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, } template -void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, - hl_stream_t stream) { +void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, hl_stream_t stream) { switch (*src.getSync()) { case DATA_AT_CPU: copyFrom(*(src.getVector(false)), stream); diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h index ee0a83bf038f04ee9f7b3561639aa90da68a6e29..46a25c04dff6041222b8c97b8904322546f2bbe3 100644 --- a/paddle/math/Vector.h +++ b/paddle/math/Vector.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -37,12 +36,13 @@ class BaseVector; class SyncThreadPool; -template +class Matrix; + +template class BaseVector : public BaseMatrixT { public: BaseVector(size_t size, T* data, bool useGpu) - : BaseMatrixT(1, size, data, false, useGpu), - size_(this->width_) {} + : BaseMatrixT(1, size, data, false, useGpu), size_(this->width_) {} ~BaseVector() {} @@ -111,7 +111,8 @@ public: this->size_ = newSize; } - static void resizeOrCreate(std::shared_ptr>& vec, size_t size, + static void resizeOrCreate(std::shared_ptr>& vec, + size_t size, bool useGpu) { if (vec) { vec->resize(size); @@ -155,6 +156,12 @@ public: subVecFrom(src, interval.first, interval.second - interval.first); } + /** + * Convert the vector to a sparse one-hot matrix of width idRange. + * Only applies to IVector. + */ + std::shared_ptr toOneHotSparseMatrix(size_t idRange, bool useGpu); + /** * This function will crash if the size of src and dest is different. */ @@ -423,11 +430,7 @@ public: * * SYNCED: data is located in CPU and GPU simultaneously. */ - enum SyncedFlag { - DATA_AT_CPU = 0, - DATA_AT_GPU = 1, - SYNCED = 2 - }; + enum SyncedFlag { DATA_AT_CPU = 0, DATA_AT_GPU = 1, SYNCED = 2 }; /** * @brief A constructor, create cpuVectorT_ or gpuVectorT_. @@ -461,8 +464,7 @@ public: */ CpuGpuVectorT(size_t size, T* data, bool useGpu); - CpuGpuVectorT(CpuGpuVectorT& src, - size_t offset, size_t size); + CpuGpuVectorT(CpuGpuVectorT& src, size_t offset, size_t size); virtual ~CpuGpuVectorT() {} @@ -481,8 +483,8 @@ public: * @brief resize or create CpuGpuVectorT. */ static void resizeOrCreate(std::shared_ptr>& vec, - size_t size, bool useGpu); - + size_t size, + bool useGpu); /** * @brief return a const cpuVectorT_ or gpuVectorT_. @@ -514,10 +516,10 @@ public: */ const T* getData(bool useGpu) const; -// TODO(yuyang18): Make getData more c++ style. -// inline T* getData(bool useGpu) { -// return getMutableData(useGpu); -// } + // TODO(yuyang18): Make getData more c++ style. + // inline T* getData(bool useGpu) { + // return getMutableData(useGpu); + // } T* getMutableData(bool useGpu); @@ -607,8 +609,11 @@ public: /** * @brief copy from (src + offset) using the specified stream.
*/ - void copyFrom(CpuGpuVectorT& src, size_t offset, size_t size, - bool useGpu, hl_stream_t stream); + void copyFrom(CpuGpuVectorT& src, + size_t offset, + size_t size, + bool useGpu, + hl_stream_t stream); /** * @brief copy from src using the specified stream. @@ -618,16 +623,12 @@ public: /** * @brief return sync_. */ - inline SyncedFlag* getSync() const { - return sync_; - } + inline SyncedFlag* getSync() const { return sync_; } /** * @brief set sync_. */ - inline void setSync(SyncedFlag* sync) { - sync_ = sync; - } + inline void setSync(SyncedFlag* sync) { sync_ = sync; } inline void setSync(SyncedFlag syncFlag) { if (sync_) { diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index eb72f11e1c6538cd2c66bc56dbc8686a942bd308..247be983ba3296383c8e2f30f1036859ecfde492 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -13,3 +13,4 @@ add_simple_unittest(test_sparseMatrixCompare) add_simple_unittest(test_perturbation) add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_Allocator) +add_simple_unittest(test_FPException) diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp index c94e7f043c04a4551e0be76c6761a1078fadcd36..084322a1caf579cf6237b41c51efa220c6f2d5a2 100644 --- a/paddle/math/tests/test_Allocator.cpp +++ b/paddle/math/tests/test_Allocator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "paddle/utils/Util.h" #include "paddle/utils/Logging.h" @@ -21,11 +20,12 @@ limitations under the License. */ #include "paddle/math/Allocator.h" #include "paddle/math/PoolAllocator.h" -using namespace paddle; // NOLINT +using namespace paddle; // NOLINT -template +template void testPoolAllocator() { - PoolAllocator* pool = new PoolAllocator(new Allocator(), /* sizeLimit */1024); + PoolAllocator* pool = + new PoolAllocator(new Allocator(), /* sizeLimit */ 1024); /* alloc from system memory */ void* ptr1 = pool->alloc(10); diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp index ae201f172373caa45186cdc378cf9dd06a136181..b3eca19a7291d2b71b801793f824c1087a3ded27 100644 --- a/paddle/math/tests/test_ExecViaCpu.cpp +++ b/paddle/math/tests/test_ExecViaCpu.cpp @@ -23,7 +23,10 @@ using namespace paddle; // NOLINT const int height = 10; const int width = 16; -real f(Matrix& mat1, const Matrix& mat2, IVector& vec1, const IVector& vec2, +real f(Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, real scalar) { CHECK(!mat1.useGpu()); CHECK(!mat2.useGpu()); @@ -37,8 +40,11 @@ real f(Matrix& mat1, const Matrix& mat2, IVector& vec1, const IVector& vec2, class Functor { public: - real operator()(Matrix& mat1, const Matrix& mat2, IVector& vec1, - const IVector& vec2, real scalar) { + real operator()(Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) { a_ = f(mat1, mat2, vec1, vec2, scalar); return a_; } @@ -93,9 +99,13 @@ TEST(ExecViaCpu, test1) { testWrapper(f); testWrapper(&f); - auto lambda = - [](Matrix& mat1, const Matrix& mat2, IVector& vec1, const IVector& vec2, - real scalar) -> real { return f(mat1, mat2, vec1, vec2, scalar); }; + auto lambda = [](Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) -> real { + return f(mat1, mat2, vec1, vec2, scalar); + }; LOG(INFO) << "lambda is_class=" <<
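// NOTE(editor): the SyncedFlag accessors reformatted in Vector.h above drive
// a small lazy-mirroring protocol between cpuVectorT_ and gpuVectorT_: a GPU
// read first checks the flag, copies host to device only when the flag is
// DATA_AT_CPU, then marks both copies SYNCED; a CPU write downgrades the
// flag back to DATA_AT_CPU. A minimal sketch of the read path (assumed
// shape, not Paddle's exact code):
//   if (*getSync() == DATA_AT_CPU) {
//     gpuVectorT_->copyFrom(*cpuVectorT_);  // mirror host data to device
//     setSync(SYNCED);
//   }
//   return gpuVectorT_;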
std::is_class::value << " is_function=" << std::is_function::value; testWrapper(lambda); diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f996e0daddd3ef41e195de48640631a979a87192 --- /dev/null +++ b/paddle/math/tests/test_FPException.cpp @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/** + * This test is about floating point calculation exceptions. + * Paddle catches FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions. + * + * Some exceptions occur in the middle of a set of formulas + * and can be circumvented by some tricks. + * For example, when calculating tanh + * b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 + * + * If the result of (-2 * a) is too large, + * an FE_OVERFLOW exception occurs when calculating exp. + * But the tanh result itself cannot overflow, + * so we can add a trick to keep exp from computing an excessively large value. + * + */ +#include +#include +#include "paddle/math/Matrix.h" +#include "paddle/utils/Excepts.h" + +using namespace paddle; // NOLINT + +void SetTensorValue(Matrix& matrix, real value) { + int height = matrix.getHeight(); + int width = matrix.getWidth(); + int stride = matrix.getStride(); + real* data = matrix.getData(); + for (int i = 0; i < height; i++) { + int j = rand() % width; // NOLINT + if (typeid(matrix) == typeid(CpuMatrix)) { + data[i * stride + j] = value; + } else if (typeid(matrix) == typeid(GpuMatrix)) { + hl_memcpy(&data[i * stride + j], &value, sizeof(real)); + } else { + LOG(FATAL) << "should not reach here"; + } + } +} + +template +void testTanh(real illegal) { + MatrixPtr A = std::make_shared(10, 10); + MatrixPtr B = std::make_shared(10, 10); + A->randomizeUniform(); + B->randomizeUniform(); + + SetTensorValue(*A, illegal); + + A->tanh(*B); +} + +template +void testSigmoid(real illegal) { + MatrixPtr A = std::make_shared(10, 10); + MatrixPtr B = std::make_shared(10, 10); + A->randomizeUniform(); + B->randomizeUniform(); + + SetTensorValue(*A, illegal); + + A->sigmoid(*B); +} + +TEST(fp, overflow) { + for (auto illegal : {-90.0, 90.0}) { + LOG(INFO) << " illegal=" << illegal; + testTanh(illegal); + testSigmoid(illegal); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + + feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); + return RUN_ALL_TESTS(); +} diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp index 491b0cda7b9e1a13882aee6621e0de984709ae80..8405b96fc2b915e2e1a5676ab5e3f25b4acde75a 100644 --- a/paddle/math/tests/test_SIMDFunctions.cpp +++ b/paddle/math/tests/test_SIMDFunctions.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
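// NOTE(editor): the overflow trick described in test_FPException.cpp above
// can be made concrete. Clipping the exponent argument keeps exp() finite
// while leaving tanh unchanged to within rounding, so the trapped
// FE_OVERFLOW never fires. A hedged sketch (illustrative only; the actual
// guard lives in Paddle's matrix kernels):
//   float safeTanh(float a) {
//     float x = -2.0f * a;
//     if (x > 40.0f) x = 40.0f;  // exp(40) is finite; tanh(a) is already
//                                // -1 to machine precision at this point
//     return 2.0f / (1.0f + expf(x)) - 1.0f;
//   }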
*/ - - #include "paddle/math/SIMDFunctions.h" #include "paddle/utils/Util.h" @@ -128,13 +126,13 @@ TEST(SIMDFunction, decayL1_WithLR) { typedef std::function DecayL1MethodType; - DecayL1MethodType naive = [](float* d, float* s, float* lr, float l, - size_t len) { + DecayL1MethodType naive = []( + float* d, float* s, float* lr, float l, size_t len) { paddle::simd::naive::decayL1(d, s, lr, l, len); }; - DecayL1MethodType simd = [](float* d, float* s, float* lr, float l, - size_t len) { + DecayL1MethodType simd = []( + float* d, float* s, float* lr, float l, size_t len) { paddle::simd::decayL1(d, s, lr, l, len); }; diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp index 737504da388be72de70d37d87dc866b8448f6cd2..a9596992b2b1fced417c048600b05b39882b2bf2 100644 --- a/paddle/math/tests/test_batchTranspose.cpp +++ b/paddle/math/tests/test_batchTranspose.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "test_matrixUtil.h" #include "hl_batch_transpose.h" @@ -48,8 +47,8 @@ TEST(MatrixBatchTransTest, test_batch_matrix_transpose) { cData[sample_id * nx * ny + j * nx + i]; // device gMat->copyFrom(*cMat, HPPL_STREAM_DEFAULT); - batchTranspose(gMat->getData(), gBatchTransMat->getData(), nx, ny, - numSamples); + batchTranspose( + gMat->getData(), gBatchTransMat->getData(), nx, ny, numSamples); cMat_d2h->copyFrom(*gBatchTransMat, HPPL_STREAM_DEFAULT); checkMatrixEqual(cBatchTransMat, cMat_d2h); } diff --git a/paddle/math/tests/test_matrix.cpp b/paddle/math/tests/test_matrix.cpp index 71c9622420aef73848ee7e85c505a6d40f64f3c1..3788218aab100d4ad683e85149a9513e54ca2480 100644 --- a/paddle/math/tests/test_matrix.cpp +++ b/paddle/math/tests/test_matrix.cpp @@ -48,7 +48,8 @@ struct MatrixPara { }; #ifndef PADDLE_ONLY_CPU -void test_sparse_matrix_mul(MatrixPara paraA, MatrixPara paraB, +void test_sparse_matrix_mul(MatrixPara paraA, + MatrixPara paraB, MatrixPara paraC) { // for cpu sparse matrix mul MatrixPtr cpuMatrixA, cpuMatrixB, cpuMatrixC, gpuMatrixC_d2h; @@ -58,12 +59,20 @@ void test_sparse_matrix_mul(MatrixPara paraA, MatrixPara paraB, MatrixPtr cpuDenseA, cpuDenseB, cpuDenseC; if (paraA.sparse) { - cpuMatrixA = Matrix::createSparseMatrix(paraA.height, paraA.width, - paraA.nnz, FLOAT_VALUE, - paraA.format, paraA.trans, false); - gpuMatrixA = Matrix::createSparseMatrix(paraA.height, paraA.width, - paraA.nnz, FLOAT_VALUE, - paraA.format, paraA.trans, true); + cpuMatrixA = Matrix::createSparseMatrix(paraA.height, + paraA.width, + paraA.nnz, + FLOAT_VALUE, + paraA.format, + paraA.trans, + false); + gpuMatrixA = Matrix::createSparseMatrix(paraA.height, + paraA.width, + paraA.nnz, + FLOAT_VALUE, + paraA.format, + paraA.trans, + true); } else { cpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, false); gpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, true); @@ -71,12 +80,20 @@ void test_sparse_matrix_mul(MatrixPara paraA, MatrixPara paraB, cpuDenseA = Matrix::create(paraA.height, paraA.width, paraA.trans, false); if (paraB.sparse) { - cpuMatrixB = Matrix::createSparseMatrix(paraB.height, paraB.width, - paraB.nnz, FLOAT_VALUE, - paraB.format, paraB.trans, false); - gpuMatrixB = Matrix::createSparseMatrix(paraB.height, paraB.width, - paraB.nnz, FLOAT_VALUE, - paraB.format, paraB.trans, true); + cpuMatrixB = Matrix::createSparseMatrix(paraB.height, + paraB.width, + paraB.nnz, + 
FLOAT_VALUE, + paraB.format, + paraB.trans, + false); + gpuMatrixB = Matrix::createSparseMatrix(paraB.height, + paraB.width, + paraB.nnz, + FLOAT_VALUE, + paraB.format, + paraB.trans, + true); } else { cpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, false); gpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, true); @@ -84,15 +101,27 @@ void test_sparse_matrix_mul(MatrixPara paraA, MatrixPara paraB, cpuDenseB = Matrix::create(paraB.height, paraB.width, paraB.trans, false); if (paraC.sparse) { - cpuMatrixC = Matrix::createSparseMatrix(paraC.height, paraC.width, - paraC.nnz, FLOAT_VALUE, - paraC.format, paraC.trans, false); - gpuMatrixC = Matrix::createSparseMatrix(paraC.height, paraC.width, - paraC.nnz, FLOAT_VALUE, - paraC.format, paraC.trans, true); - gpuMatrixC_d2h = Matrix::createSparseMatrix( - paraC.height, paraC.width, paraC.nnz, FLOAT_VALUE, paraC.format, - paraC.trans, false); + cpuMatrixC = Matrix::createSparseMatrix(paraC.height, + paraC.width, + paraC.nnz, + FLOAT_VALUE, + paraC.format, + paraC.trans, + false); + gpuMatrixC = Matrix::createSparseMatrix(paraC.height, + paraC.width, + paraC.nnz, + FLOAT_VALUE, + paraC.format, + paraC.trans, + true); + gpuMatrixC_d2h = Matrix::createSparseMatrix(paraC.height, + paraC.width, + paraC.nnz, + FLOAT_VALUE, + paraC.format, + paraC.trans, + false); } else { cpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, false); gpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, true); @@ -267,8 +296,8 @@ TEST(Matrix, CpuSparseMatrixSubMatrix) { } } -void sparseValid(int* major, int* minor, size_t nnz, size_t majorLen, - size_t minorLen) { +void sparseValid( + int* major, int* minor, size_t nnz, size_t majorLen, size_t minorLen) { CHECK_EQ(nnz, size_t(major[majorLen - 1])); CHECK_EQ(nnz, minorLen); for (size_t i = 0; i < majorLen - 1; i++) { @@ -375,14 +404,25 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) { int64_t trimedIndices[11] = {0, 1, 3, 3, 7, 7, 9, 10, 12, 16, 19}; sparse_float_value_t trimedData[19]; int trimedValue[19] = { - 1, // row_0 : 1 - 3, 1, // row_1 : 2 - 0, 1, 2, 3, // row_3 : 4 - 2, 3, // row_5 : 2 - 3, // row_6 : 1 - 0, 1, // row_7 : 2 - 0, 1, 2, 3, // row_8 : 4 - 2, 3, 1 // row_9 : 3 + 1, // row_0 : 1 + 3, + 1, // row_1 : 2 + 0, + 1, + 2, + 3, // row_3 : 4 + 2, + 3, // row_5 : 2 + 3, // row_6 : 1 + 0, + 1, // row_7 : 2 + 0, + 1, + 2, + 3, // row_8 : 4 + 2, + 3, + 1 // row_9 : 3 }; for (size_t i = 0; i < 19; i++) { trimedData[i].col = trimedValue[i]; @@ -415,9 +455,13 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) { height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true); matC->trimFrom(*mat); - CpuSparseMatrixPtr matD = std::make_shared( - height, trimedWidth, matC->getElementCnt(), FLOAT_VALUE, SPARSE_CSR, - false); + CpuSparseMatrixPtr matD = + std::make_shared(height, + trimedWidth, + matC->getElementCnt(), + FLOAT_VALUE, + SPARSE_CSR, + false); matD->copyFrom(*matC, HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT); checkSMatrixEqual2(matA, matD); @@ -462,11 +506,17 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) { int trimedIndices[6] = {0, 1, 5, 5, 9, 13}; int trimedValue[13] = { 1, // col_0 : 1 - 5, 3, 1, + 5, + 3, + 1, 6, // col_1 : 4 - 0, 1, 2, + 0, + 1, + 2, 3, // col_3 : 4 - 4, 5, 6, + 4, + 5, + 6, 7 // col_4 : 4 }; std::vector rowsA(trimedValue, trimedValue + 13); @@ -499,9 +549,13 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) { height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true); matC->trimFrom(*mat); - CpuSparseMatrixPtr 
matD = std::make_shared( - height, trimedWidth, matC->getElementCnt(), FLOAT_VALUE, SPARSE_CSC, - false); + CpuSparseMatrixPtr matD = + std::make_shared(height, + trimedWidth, + matC->getElementCnt(), + FLOAT_VALUE, + SPARSE_CSC, + false); matD->copyFrom(*matC, HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT); checkSMatrixEqual2(matA, matD); diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index e1bda79a8acb16ffb9025ff92afa2bb24d76c4fe..ae5bc5a86a1790ce30a8d7f83c9564f52d7cf7ea 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -21,11 +21,12 @@ limitations under the License. */ #include "paddle/math/SparseMatrix.h" #include #include "paddle/gserver/tests/TestUtil.h" +#include "paddle/utils/Stat.h" using namespace paddle; // NOLINT using namespace std; // NOLINT -template +template void VectorCheckEqual(const VectorT& vector1, const VectorT& vector2) { CHECK(vector1.getSize() == vector2.getSize()); @@ -88,8 +89,102 @@ void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) { EXPECT_EQ(count, 0) << "There are " << count << " different element."; } -void testMatrixProjectionForward(int contextStart, int contextLength, - bool padding, int batchSize, int inputDim) { +void testBilinearFwdBwd(int numSamples, + int imgSizeH, + int imgSizeW, + int channels) { + int inWidth = imgSizeH * imgSizeW * channels; + int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels; + real ratioH = 0.5; + real ratioW = 0.5; + // forward + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + + input->randomizeUniform(); + inputGpu->copyFrom(*input); + + target->bilinearForward(*input, + imgSizeH, + imgSizeW, + 2 * imgSizeH, + 2 * imgSizeW, + channels, + ratioH, + ratioW); + targetGpu->bilinearForward(*inputGpu, + imgSizeH, + imgSizeW, + 2 * imgSizeH, + 2 * imgSizeW, + channels, + ratioH, + ratioW); + + // check + targetCheck->copyFrom(*targetGpu); + MatrixCheckErr(*target, *targetCheck); + + // backward + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheckGrad = + CpuMatrix::create(numSamples, inWidth, false, false); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->bilinearBackward(*targetGrad, + 2 * imgSizeH, + 2 * imgSizeW, + imgSizeH, + imgSizeW, + channels, + ratioH, + ratioW); + inputGpuGrad->bilinearBackward(*targetGpuGrad, + 2 * imgSizeH, + 2 * imgSizeW, + imgSizeH, + imgSizeW, + channels, + ratioH, + ratioW); + + // check + targetCheckGrad->copyFrom(*inputGpuGrad); + MatrixCheckErr(*inputGrad, *targetCheckGrad); +} + +TEST(Matrix, BilinearFwdBwd) { + for (auto numSamples : {5, 10}) { + for (auto channels : {8, 16}) { + for (auto imgSizeH : {14, 28}) { + for (auto imgSizeW : {16, 30}) { + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " 
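// NOTE(editor): for reference, bilinearForward above maps each output pixel
// (r, c) back to input coordinates (y, x) = (r * ratioH, c * ratioW) and,
// with y0 = floor(y), x0 = floor(x), ty = y - y0, tx = x - x0, blends the
// four neighbours:
//   out = (1-ty)*(1-tx)*in[y0][x0]   + (1-ty)*tx*in[y0][x0+1]
//       + ty*(1-tx)*in[y0+1][x0]     + ty*tx*in[y0+1][x0+1]
// (standard bilinear weighting; edge clamping is handled inside Matrix.cpp
// and is not shown in this patch).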
imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; + testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels); + } + } + } + } +} + +void testMatrixProjectionForward(int contextStart, + int contextLength, + bool padding, + int batchSize, + int inputDim) { MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); cpuInput->randomizeUniform(); @@ -121,12 +216,20 @@ void testMatrixProjectionForward(int contextStart, int contextLength, // calculate int beginPad = std::max(0, -contextStart); - cpuOutput->contextProjectionForward(cpuInput, cpuWeight, *cpuSequence, - contextLength, contextStart, beginPad, + cpuOutput->contextProjectionForward(cpuInput, + cpuWeight, + *cpuSequence, + contextLength, + contextStart, + beginPad, padding); - gpuOutput->contextProjectionForward(gpuInput, gpuWeight, *gpuSequence, - contextLength, contextStart, beginPad, + gpuOutput->contextProjectionForward(gpuInput, + gpuWeight, + *gpuSequence, + contextLength, + contextStart, + beginPad, padding); // check @@ -137,8 +240,11 @@ void testMatrixProjectionForward(int contextStart, int contextLength, MatrixCheckEqual(*cpuOutput, *outputCheck); } -void testMatrixProjectionBackward(int contextStart, int contextLength, - bool padding, int batchSize, int inputDim) { +void testMatrixProjectionBackward(int contextStart, + int contextLength, + bool padding, + int batchSize, + int inputDim) { MatrixPtr cpuOutputGrad = std::make_shared(batchSize, inputDim * contextLength); MatrixPtr gpuOutputGrad = @@ -170,15 +276,22 @@ void testMatrixProjectionBackward(int contextStart, int contextLength, // calculate int beginPad = std::max(0, -contextStart); - cpuOutputGrad->contextProjectionBackward(cpuInputGrad, cpuWeightGrad, - *cpuSequence, contextLength, - contextStart, beginPad, padding); - gpuOutputGrad->contextProjectionBackwardData(gpuInputGrad, *gpuSequence, - contextLength, contextStart); + cpuOutputGrad->contextProjectionBackward(cpuInputGrad, + cpuWeightGrad, + *cpuSequence, + contextLength, + contextStart, + beginPad, + padding); + gpuOutputGrad->contextProjectionBackwardData( + gpuInputGrad, *gpuSequence, contextLength, contextStart); if (padding) { - gpuOutputGrad->contextProjectionBackwardWeight( - gpuWeightGrad, *gpuSequence, contextLength, - contextStart, pad, beginPad); + gpuOutputGrad->contextProjectionBackwardWeight(gpuWeightGrad, + *gpuSequence, + contextLength, + contextStart, + pad, + beginPad); } // check @@ -200,13 +313,19 @@ TEST(Matrix, projection) { for (auto batchSize : {1, 2, 5, 20, 100}) { for (auto inputDim : {15, 32, 63, 128, 200}) { VLOG(3) << " contextStart=" << contextStart - << " contextLength=" << contextLength - << " trainablePadding=" << trainablePadding - << " batchSize=" << batchSize << " inputDim=" << inputDim; - testMatrixProjectionForward(contextStart, contextLength, - trainablePadding, batchSize, inputDim); - testMatrixProjectionBackward(contextStart, contextLength, - trainablePadding, batchSize, inputDim); + << " contextLength=" << contextLength + << " trainablePadding=" << trainablePadding + << " batchSize=" << batchSize << " inputDim=" << inputDim; + testMatrixProjectionForward(contextStart, + contextLength, + trainablePadding, + batchSize, + inputDim); + testMatrixProjectionBackward(contextStart, + contextLength, + trainablePadding, + batchSize, + inputDim); } } } @@ -639,9 +758,35 @@ void testMatrixTranspose(int height, int width) { MatrixCheckEqual(*cpuT, *outputCheck); } +void testMatrixInverse(int height) { + MatrixPtr cpu = 
std::make_shared(height, height); + MatrixPtr gpu = std::make_shared(height, height); + MatrixPtr cpuI = std::make_shared(height, height); + MatrixPtr gpuI = std::make_shared(height, height); + + /* Make matrix well conditioned: cpu * cpuT + Identity */ + cpu->randomizeUniform(); + MatrixPtr cpuT = cpu->getTranspose(); + MatrixPtr outputCheck = std::make_shared(height, height); + outputCheck->mul(cpu, cpuT); + cpu->setDiag(1.0); + cpu->add(*outputCheck); + + gpu->copyFrom(*cpu); + cpu->inverse(cpuI, false); + gpu->inverse(gpuI, false); + + outputCheck->copyFrom(*gpuI); + MatrixCheckErr(*cpuI, *outputCheck); + + outputCheck->mul(cpu, cpuI); + cpu->setDiag(1.0); + MatrixCheckErr(*cpu, *outputCheck); +} + TEST(Matrix, unary) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { VLOG(3) << " height=" << height << " width=" << width; // applyUnary @@ -673,6 +818,8 @@ TEST(Matrix, unary) { // transpose testMatrixTranspose(height, width); } + // inverse + testMatrixInverse(height); } } @@ -716,7 +863,6 @@ void testSequenceSoftmax(int batchSize) { MatrixCheckErr(*cpuInput, *outputCheck); } - void testMatrixSoftmaxThreshold(int height, int width) { MatrixPtr cpuInput = std::make_shared(height, width); MatrixPtr cpuOutput = std::make_shared(height, width); @@ -1119,7 +1265,7 @@ TEST(Matrix, AtOffset) { for (auto width1 : {1, 32, 100, 512, 1000}) { for (auto width2 : {1, 32, 100, 512, 1000}) { VLOG(3) << " height=" << height << " width1=" << width1 - << " width2=" << width2; + << " width2=" << width2; testMatrixAddAtOffset(height, width1, width2); testMatrixAssignAtOffset(height, width1, width2); @@ -1187,7 +1333,7 @@ TEST(Matrix, tableProjection) { for (auto tableSize : {10, 100}) { for (auto inputDim : {20, 50}) { VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize - << " inputDim=" << inputDim; + << " inputDim=" << inputDim; testMatrixSelectRows(numSamples, tableSize, inputDim); testMatrixAddToRows(numSamples, tableSize, inputDim); } @@ -1262,8 +1408,12 @@ void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { } }; - auto subMatrix = [](MatrixPtr& sub, MatrixPtr matrix, size_t startRow, - size_t endRow, size_t startCol, size_t endCol) { + auto subMatrix = [](MatrixPtr& sub, + MatrixPtr matrix, + size_t startRow, + size_t endRow, + size_t startCol, + size_t endCol) { if (!matrix->isTransposed()) { sub = matrix->subMatrix(startRow, endRow, startCol, endCol); } else { @@ -1307,9 +1457,9 @@ TEST(Matrix, mul) { continue; } VLOG(3) << setiosflags(ios::left) << setfill(' ') - << " transa=" << transa << " transb=" << transb - << " dimM=" << setw(5) << dimM << " dimN=" << setw(5) - << dimN << " dimK=" << setw(5) << dimK; + << " transa=" << transa << " transb=" << transb + << " dimM=" << setw(5) << dimM << " dimN=" << setw(5) + << dimN << " dimK=" << setw(5) << dimK; testMatrixMul(transa, transb, dimM, dimN, dimK); testSubMatrixMul(transa, transb, dimM, dimN, dimK); @@ -1339,7 +1489,7 @@ TEST(Vector, rowFunc) { } } -template +template void testVectorReset(int size) { std::shared_ptr> cpu = std::make_shared>(size); std::shared_ptr> gpu = std::make_shared>(size); @@ -1353,14 +1503,14 @@ void testVectorReset(int size) { VectorCheckEqual(*cpu, *out); } -template +template void testVecortSelectFrom(int size) { std::shared_ptr> cpuDst = std::make_shared>(size); std::shared_ptr> gpuDst = std::make_shared>(size); 
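// NOTE(editor): on testMatrixInverse above: for any A, A*A' is symmetric
// positive semi-definite with eigenvalues sigma_i^2 >= 0, so A*A' + I has
// eigenvalues sigma_i^2 + 1 >= 1. The test matrix is therefore always
// invertible and reasonably conditioned, which is why the identity is added
// before calling inverse().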
- std::shared_ptr> - cpuSrc = std::make_shared>(size*2); - std::shared_ptr> - gpuSrc = std::make_shared>(size*2); + std::shared_ptr> cpuSrc = + std::make_shared>(size * 2); + std::shared_ptr> gpuSrc = + std::make_shared>(size * 2); CpuIVectorPtr cpuIds = std::make_shared>(size); GpuIVectorPtr gpuIds = std::make_shared>(size); @@ -1381,7 +1531,7 @@ void testVecortSelectFrom(int size) { VectorCheckEqual(*cpuDst, *out); } -template +template void testVecotrZeroMem(int size) { std::shared_ptr> cpu = std::make_shared>(size); std::shared_ptr> gpu = std::make_shared>(size); @@ -1394,7 +1544,7 @@ void testVecotrZeroMem(int size) { VectorCheckEqual(*cpu, *out); } -template +template void testVectorIsEqual(int size) { std::shared_ptr> cpuA = std::make_shared>(size); std::shared_ptr> cpuB = std::make_shared>(size); @@ -1452,12 +1602,11 @@ void testMatrixTopK(int samples, int dim, int beamSize) { TEST(Matrix, topK) { for (auto samples : {1, 5, 31, 90, 150, 500}) { - for (auto dim : {1, 5 , 8, 10, 15, 64, 80, 120, 256, 300, - 1280, 5120, 50000}) { + for (auto dim : + {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { if (beamSize > dim) continue; - VLOG(3) << " samples=" << samples - << " beamSize=" << beamSize + VLOG(3) << " samples=" << samples << " beamSize=" << beamSize << " dim=" << dim; testMatrixTopK(samples, dim, beamSize); } @@ -1507,10 +1656,8 @@ TEST(SMatrix, topK) { for (auto beamSize : {1, 5, 40, 100, 500}) { for (auto ratio : {0.01, 0.001}) { if (beamSize > dim) continue; - VLOG(3) << " samples=" << samples - << " beamSize=" << beamSize - << " dim=" << dim - << " ratio=" << ratio; + VLOG(3) << " samples=" << samples << " beamSize=" << beamSize + << " dim=" << dim << " ratio=" << ratio; testSMatrixTopK(samples, dim, beamSize, ratio); } } @@ -1631,8 +1778,7 @@ TEST(Matrix, cosSim) { } } -void testCosSimDerivate(int heightX, int heightY, int width, - real scale) { +void testCosSimDerivate(int heightX, int heightY, int width, real scale) { MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false); MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false); MatrixPtr grad = CpuMatrix::create(heightX, 1, false, false); @@ -1661,12 +1807,8 @@ void testCosSimDerivate(int heightX, int heightY, int width, prevGradXGpu->copyFrom(*prevGradX); prevGradYGpu->copyFrom(*prevGradY); - grad->cosSimDerivative(*output, - *prevOutX, - *prevOutY, - *prevGradX, - *prevGradY, - scale); + grad->cosSimDerivative( + *output, *prevOutX, *prevOutY, *prevGradX, *prevGradY, scale); gradGpu->cosSimDerivative(*outputGpu, *prevOutXGpu, @@ -1675,10 +1817,8 @@ void testCosSimDerivate(int heightX, int heightY, int width, *prevGradYGpu, scale); - MatrixPtr prevGradXCheck = CpuMatrix::create(heightX, width, false, - false); - MatrixPtr prevGradYCheck = CpuMatrix::create(heightY, width, false, - false); + MatrixPtr prevGradXCheck = CpuMatrix::create(heightX, width, false, false); + MatrixPtr prevGradYCheck = CpuMatrix::create(heightY, width, false, false); prevGradXCheck->copyFrom(*prevGradXGpu); prevGradYCheck->copyFrom(*prevGradYGpu); MatrixCheckErr(*prevGradX, *prevGradXCheck); @@ -1697,8 +1837,7 @@ TEST(Matrix, cosSimDerivate) { } } -void testParamReluForward(int height, int width, int w_height, - int w_width) { +void testParamReluForward(int height, int width, int w_height, int w_width) { MatrixPtr output = CpuMatrix::create(height, width, false, false); MatrixPtr input = CpuMatrix::create(height, width, false, false); 
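// NOTE(editor): paramReluForward computes a parametric ReLU. Each input
// element x uses the learned slope w of the partition it falls into (the
// w_height x w_width weight tiles the height x width input), i.e. the
// element-wise rule is:
//   real prelu(real x, real w) { return x > 0 ? x : w * x; }
// The BackwardW/BackwardDiff tests below exercise the matching gradients:
// dL/dw accumulates x * dy over the partition's non-positive inputs, and
// dL/dx = (x > 0 ? dy : w * dy).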
MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false); @@ -1735,8 +1874,7 @@ TEST(Matrix, paramReluForward) { } } -void testParamReluBackwardW(int height, int width, int w_height, - int w_width) { +void testParamReluBackwardW(int height, int width, int w_height, int w_width) { MatrixPtr oGrad = CpuMatrix::create(height, width, false, false); MatrixPtr input = CpuMatrix::create(height, width, false, false); MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false); @@ -1773,8 +1911,10 @@ TEST(Matrix, paramReluBackwardW) { } } -void testParamReluBackwardDiff(int height, int width, int w_height, - int w_width) { +void testParamReluBackwardDiff(int height, + int width, + int w_height, + int w_width) { MatrixPtr oGrad = CpuMatrix::create(height, width, false, false); MatrixPtr input = CpuMatrix::create(height, width, false, false); MatrixPtr diff = CpuMatrix::create(height, width, false, false); @@ -1846,11 +1986,16 @@ TEST(Matrix, classificationError) { } } -void testMaxPoolFwdBwd(int numSamples, int channels, - int imgSizeH, int imgSizeW, - int ksizeH, int ksizeW, - int strideH, int strideW, - int padH, int padW) { +void testMaxPoolFwdBwd(int numSamples, + int channels, + int imgSizeH, + int imgSizeW, + int ksizeH, + int ksizeW, + int strideH, + int strideW, + int padH, + int padW) { int outH = 0, outW = 0; outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1; outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1; @@ -1868,12 +2013,30 @@ void testMaxPoolFwdBwd(int numSamples, int channels, inputGpu->copyFrom(*input); targetGpu->copyFrom(*target); - target->maxPoolForward(*input, imgSizeH, imgSizeW, - channels, ksizeW, ksizeH, - strideH, strideW, outH, outW, padH, padW); - targetGpu->maxPoolForward(*inputGpu, imgSizeH, imgSizeW, - channels, ksizeW, ksizeH, - strideH, strideW, outH, outW, padH, padW); + target->maxPoolForward(*input, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + targetGpu->maxPoolForward(*inputGpu, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); targetCheck->copyFrom(*targetGpu); checkMatrixEqual(target, targetCheck); @@ -1881,35 +2044,60 @@ void testMaxPoolFwdBwd(int numSamples, int channels, MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, - false, true); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); inputGrad->randomizeUniform(); targetGrad->randomizeUniform(); inputGpuGrad->copyFrom(*inputGrad); targetGpuGrad->copyFrom(*targetGrad); - inputGrad->maxPoolBackward(*input, imgSizeH, imgSizeW, - *targetGrad, *target, - ksizeW, ksizeH, - strideH, strideW, - outH, outW, 1.0, 1.0, padH, padW); - inputGpuGrad->maxPoolBackward(*inputGpu, imgSizeH, imgSizeW, - *targetGpuGrad, *targetGpu, - ksizeW, ksizeH, - strideH, strideW, - outH, outW, 1.0, 1.0, padH, padW); - MatrixPtr targetBwdCheck = CpuMatrix::create(numSamples, inWidth, - false, false); + inputGrad->maxPoolBackward(*input, + imgSizeH, + imgSizeW, + *targetGrad, + *target, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + 
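// NOTE(editor): the output size used throughout these pooling tests,
//   outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1,
// is ceil((imgSizeH - ksizeH + 2 * padH) / strideH) + 1 in integer
// arithmetic. Worked example: imgSizeH = 14, ksizeH = 5, padH = 2,
// strideH = 2 gives (14 - 5 + 4 + 1) / 2 + 1 = 8 output rows.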
inputGpuGrad->maxPoolBackward(*inputGpu, + imgSizeH, + imgSizeW, + *targetGpuGrad, + *targetGpu, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + MatrixPtr targetBwdCheck = + CpuMatrix::create(numSamples, inWidth, false, false); targetBwdCheck->copyFrom(*inputGpuGrad); checkMatrixEqual(inputGrad, targetBwdCheck); } -void testAvgPoolFwdBwd(int numSamples, int channels, - int imgSizeH, int imgSizeW, - int ksizeH, int ksizeW, - int strideH, int strideW, - int padH, int padW) { +void testAvgPoolFwdBwd(int numSamples, + int channels, + int imgSizeH, + int imgSizeW, + int ksizeH, + int ksizeW, + int strideH, + int strideW, + int padH, + int padW) { int outH = 0, outW = 0; outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1; outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1; @@ -1927,12 +2115,30 @@ void testAvgPoolFwdBwd(int numSamples, int channels, inputGpu->copyFrom(*input); targetGpu->copyFrom(*target); - target->avgPoolForward(*input, imgSizeH, imgSizeW, - channels, ksizeW, ksizeH, - strideH, strideW, outH, outW, padH, padW); - targetGpu->avgPoolForward(*inputGpu, imgSizeH, imgSizeW, - channels, ksizeW, ksizeH, - strideH, strideW, outH, outW, padH, padW); + target->avgPoolForward(*input, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + targetGpu->avgPoolForward(*inputGpu, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); targetCheck->copyFrom(*targetGpu); MatrixCheckErr(*target, *targetCheck); @@ -1940,24 +2146,42 @@ void testAvgPoolFwdBwd(int numSamples, int channels, MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, - false, true); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); inputGrad->randomizeUniform(); targetGrad->randomizeUniform(); inputGpuGrad->copyFrom(*inputGrad); targetGpuGrad->copyFrom(*targetGrad); - inputGrad->avgPoolBackward(*targetGrad, imgSizeH, imgSizeW, - ksizeW, ksizeH, - strideH, strideW, - outH, outW, 1.0, 1.0, padH, padW); - inputGpuGrad->avgPoolBackward(*targetGpuGrad, imgSizeH, imgSizeW, - ksizeW, ksizeH, - strideH, strideW, - outH, outW, 1.0, 1.0, padH, padW); - MatrixPtr targetBwdCheck = CpuMatrix::create(numSamples, inWidth, - false, false); + inputGrad->avgPoolBackward(*targetGrad, + imgSizeH, + imgSizeW, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + inputGpuGrad->avgPoolBackward(*targetGpuGrad, + imgSizeH, + imgSizeW, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + MatrixPtr targetBwdCheck = + CpuMatrix::create(numSamples, inWidth, false, false); targetBwdCheck->copyFrom(*inputGpuGrad); MatrixCheckErr(*inputGrad, *targetBwdCheck); } @@ -1971,24 +2195,37 @@ TEST(Matrix, PoolFwdBwd) { for (auto sizeY : {2, 5}) { for (auto sH : {1, 2}) { for (auto sW : {1, 2}) { - for (auto pH : {0, (sizeY - 1)/2}) { - for (auto pW : {0, (sizeX - 1)/2}) { - VLOG(3) << " numSamples=" << numSamples - << " channels=" << channels - << " imgSizeH=" << imgSizeH - << " imgSizeW=" << imgSizeW - << " sizeX=" << sizeX - << " sizeY=" << sizeY - << " 
strideH=" << sH - << " strideW=" << sW - << " padingH=" << pH - << " padingW=" << pW; - testMaxPoolFwdBwd(numSamples, channels, imgSizeH, - imgSizeW, sizeX, sizeY, sH, sW, pH, pW); - testAvgPoolFwdBwd(numSamples, channels, imgSizeH, - imgSizeW, sizeX, sizeY, sH, sW, pH, pW); - } - } + for (auto pH : {0, (sizeY - 1) / 2}) { + for (auto pW : {0, (sizeX - 1) / 2}) { + VLOG(3) << " numSamples=" << numSamples + << " channels=" << channels + << " imgSizeH=" << imgSizeH + << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX + << " sizeY=" << sizeY << " strideH=" << sH + << " strideW=" << sW << " padingH=" << pH + << " padingW=" << pW; + testMaxPoolFwdBwd(numSamples, + channels, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sH, + sW, + pH, + pW); + testAvgPoolFwdBwd(numSamples, + channels, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sH, + sW, + pH, + pW); + } + } } } } @@ -1999,6 +2236,183 @@ TEST(Matrix, PoolFwdBwd) { } } +void testMaxOutFwdBwd( + int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) { + int inWidth = imgSizeH * imgSizeW * channels; + int outChannels = channels / groups; + int outWidth = imgSizeH * imgSizeW * outChannels; + + // forward + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + + IVectorPtr id = CpuIVector::create(numSamples * outWidth, false); + IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true); + IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false); + + input->randomizeUniform(); + inputGpu->copyFrom(*input); + + target->maxoutForward(*input, *id, outChannels, groups); + targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups); + + // check + targetCheck->copyFrom(*targetGpu); + MatrixCheckErr(*target, *targetCheck); + idCheck->copyFrom(*idGpu); + VectorCheckEqual(*id, *idCheck); + + // backward + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheckGrad = + CpuMatrix::create(numSamples, inWidth, false, false); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups); + inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups); + + // check + targetCheckGrad->copyFrom(*inputGpuGrad); + MatrixCheckErr(*inputGrad, *targetCheckGrad); +} + +TEST(Matrix, MaxOutFwdBwd) { + for (auto numSamples : {5, 10}) { + for (auto channels : {8, 16}) { + for (auto imgSizeH : {14, 28}) { + for (auto imgSizeW : {16, 30}) { + for (auto groups : {2, 4}) { + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW + << " groups=" << groups; + testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups); + } + } + } + } + } +} + +void testAddSharedBias(int numSamples, int dim, int channel) { + MatrixPtr cpuData = std::make_shared(numSamples, dim); + MatrixPtr gpuData = 
std::make_shared(numSamples, dim); + + MatrixPtr cpuBias = std::make_shared(1, channel); + MatrixPtr gpuBias = std::make_shared(1, channel); + + cpuData->randomizeUniform(); + gpuData->copyFrom(*cpuData); + cpuBias->randomizeUniform(); + gpuBias->copyFrom(*cpuBias); + + cpuData->addSharedBias(*cpuBias, 1.0); + gpuData->addSharedBias(*gpuBias, 1.0); + + MatrixPtr check = std::make_shared(numSamples, dim); + check->copyFrom(*gpuData); + MatrixCheckErr(*cpuData, *check); +} + +void testCollectSharedBias(int numSamples, int dim, int channel) { + MatrixPtr cpuData = std::make_shared(numSamples, dim); + MatrixPtr gpuData = std::make_shared(numSamples, dim); + + MatrixPtr cpuBias = std::make_shared(1, channel); + MatrixPtr gpuBias = std::make_shared(1, channel); + + cpuData->randomizeUniform(); + gpuData->copyFrom(*cpuData); + cpuBias->randomizeUniform(); + gpuBias->copyFrom(*cpuBias); + + cpuBias->collectSharedBias(*cpuData, 1.0); + gpuBias->collectSharedBias(*gpuData, 1.0); + + MatrixPtr check = std::make_shared(1, channel); + check->copyFrom(*gpuBias); + MatrixCheckErr(*cpuBias, *check); +} + +TEST(Matrix, sharedBias) { + for (auto numSamples : {1, 100, 520}) { + for (auto dim : {100 * 16, 100 * 32}) { + for (auto channel : {8, 16}) { + VLOG(3) << " numSamples=" << numSamples << " dim=" << dim + << " channel=" << channel; + testAddSharedBias(numSamples, dim, channel); + testCollectSharedBias(numSamples, dim, channel); + } + } + } +} + +void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) { + MatrixPtr output = std::make_shared(numSamples, dim); + MatrixPtr cpuOutput = std::make_shared(numSamples, dim); + MatrixPtr gpuOutput = std::make_shared(numSamples, dim); + + MatrixPtr cpuEntropy = std::make_shared(numSamples, 1); + MatrixPtr gpuEntropy = std::make_shared(numSamples, 1); + + MatrixPtr cpuGrad = std::make_shared(numSamples, dim); + MatrixPtr gpuGrad = std::make_shared(numSamples, dim); + + MatrixPtr cpuLabel = std::make_shared( + numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false); + MatrixPtr gpuLabel = std::make_shared( + numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false); + for (int i = 0; i < numSamples; i++) { + const unsigned int id = rand() % dim; // NOLINT + cpuLabel->setRow(i, 1, &id, nullptr); + gpuLabel->setRow(i, 1, &id, nullptr); + } + + output->randomizeUniform(); + cpuOutput->zeroMem(); + output->softmax(*cpuOutput); + gpuOutput->copyFrom(*cpuOutput); + + cpuEntropy->zeroMem(); + gpuEntropy->zeroMem(); + cpuEntropy->multiBinaryLabelCrossEntropy(*cpuOutput, *cpuLabel); + gpuEntropy->multiBinaryLabelCrossEntropy(*gpuOutput, *gpuLabel); + + MatrixPtr check1 = std::make_shared(numSamples, 1); + check1->copyFrom(*gpuEntropy); + MatrixCheckErr(*cpuEntropy, *check1); + + cpuGrad->zeroMem(); + gpuGrad->zeroMem(); + cpuGrad->multiBinaryLabelCrossEntropyBp(*cpuOutput, *cpuLabel); + gpuGrad->multiBinaryLabelCrossEntropyBp(*gpuOutput, *gpuLabel); + + MatrixPtr check2 = std::make_shared(numSamples, dim); + check2->copyFrom(*gpuGrad); + MatrixCheckErr(*cpuGrad, *check2); +} + +TEST(Matrix, multiBinaryCrossEntropy) { + for (auto numSamples : {100, 1000, 10000}) { + for (auto dim : {100, 1000, 10000}) { + VLOG(3) << " numSamples=" << numSamples << " dim=" << dim; + testMultiBinaryLabelCrossEntropy(numSamples, dim); + } + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h index 
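// NOTE(editor): testMultiBinaryLabelCrossEntropy above checks the
// multi-label binary cross entropy, per sample
//   E = -sum_j [ y_j * log(p_j) + (1 - y_j) * log(1 - p_j) ],
// with y the sparse 0/1 label row and p the softmax output; the Bp variant
// backpropagates dE/dp_j = (p_j - y_j) / (p_j * (1 - p_j)). (Hedged: the
// exact implementation lives in Matrix.cpp, outside this patch.)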
fa682164aa8643dd088bd0ece757728e03488b76..5300e7168b9dc61b65e64346424e65c11665cf99 100644 --- a/paddle/math/tests/test_matrixUtil.h +++ b/paddle/math/tests/test_matrixUtil.h @@ -104,8 +104,7 @@ void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a, } } -void checkSMatrixErr(const CpuSparseMatrixPtr& a, - const CpuSparseMatrixPtr& b) { +void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) { #ifndef PADDLE_TYPE_DOUBLE real err = 1e-3; #else @@ -126,7 +125,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a, real bVal = b->getValue()[r]; if (std::abs(aVal - bVal) > err) { if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { - LOG(INFO) << "a=" << aVal << "\t" << "b=" << bVal; + LOG(INFO) << "a=" << aVal << "\t" + << "b=" << bVal; count++; } } diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp index 4fa9bc72013da6a3d551854516e0f0d2fe5ee1ef..837c2f47ba05a04988431e14cb6bc2490f42d32e 100644 --- a/paddle/math/tests/test_perturbation.cpp +++ b/paddle/math/tests/test_perturbation.cpp @@ -37,7 +37,9 @@ protected: virtual void TearDown() {} - void allocateMem(real*& gpuAngle, real*& gpuScale, int*& gpuCenterR, + void allocateMem(real*& gpuAngle, + real*& gpuScale, + int*& gpuCenterR, int*& gpuCenterC) { gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES); gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES); @@ -48,7 +50,8 @@ protected: } // Generate translation parameters for testing. - void generateTranslationParams(int*& gpuCenterR, int*& gpuCenterC, + void generateTranslationParams(int*& gpuCenterR, + int*& gpuCenterC, int imgSize) { int cpuCenterR[NUM_IMAGES * SAMPLING_RATE]; int cpuCenterC[NUM_IMAGES * SAMPLING_RATE]; @@ -59,13 +62,13 @@ protected: gpuCenterR = (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE); - hl_memcpy_host2device(gpuCenterR, cpuCenterR, - sizeof(int) * NUM_IMAGES * SAMPLING_RATE); + hl_memcpy_host2device( + gpuCenterR, cpuCenterR, sizeof(int) * NUM_IMAGES * SAMPLING_RATE); gpuCenterC = (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE); - hl_memcpy_host2device(gpuCenterC, cpuCenterC, - sizeof(int) * NUM_IMAGES * SAMPLING_RATE); + hl_memcpy_host2device( + gpuCenterC, cpuCenterC, sizeof(int) * NUM_IMAGES * SAMPLING_RATE); } // Generate rotation parameters for testing. @@ -84,8 +87,7 @@ protected: cpuScale[i] = static_cast(TGT_SIZE - 2) / TGT_SIZE; } gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES); - hl_memcpy_host2device(gpuScale, cpuScale, - sizeof(real) * NUM_IMAGES); + hl_memcpy_host2device(gpuScale, cpuScale, sizeof(real) * NUM_IMAGES); } // Generate the test images, only the center regions are set to 1. @@ -111,8 +113,7 @@ protected: } } gpuImages = (real*)hl_malloc_device(sizeof(real) * IMAGE_MEM_SIZE); - hl_memcpy_host2device(gpuImages, cpuImages, - sizeof(real) * IMAGE_MEM_SIZE); + hl_memcpy_host2device(gpuImages, cpuImages, sizeof(real) * IMAGE_MEM_SIZE); } real* gpuImages_; @@ -120,64 +121,99 @@ protected: // Random perturbation. Only to make sure the code does not break. 
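// NOTE(editor): the four TEST_Fs below share one pattern: allocate device
// parameter buffers, generate (or override) per-image angle/scale/center
// values, run hl_conv_random_disturb or hl_conv_random_disturb_with_params,
// copy the sampled targets back to the host, and assert on them. Because
// the source images are 1.0 everywhere in the center region, any transform
// that keeps sampling inside that region must return all-1.0 targets, which
// is what identity_perturb, rotation_test and scale_test verify.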
TEST_F(PerturbationTest, random_perturb) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb(gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, - NUM_IMAGES, 1.0, 1.0, SAMPLING_RATE, gpuAngle, - gpuScaleRatio, gpuCenterR, gpuCenterC, 2, true, + hl_conv_random_disturb(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + 1.0, + 1.0, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + true, targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); } TEST_F(PerturbationTest, identity_perturb) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb(gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, - NUM_IMAGES, 1.0, 1.0, SAMPLING_RATE, gpuAngle, - gpuScaleRatio, gpuCenterR, gpuCenterC, 2, false, + hl_conv_random_disturb(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + 1.0, + 1.0, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + false, targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); for (int i = 0; i < TARGET_MEM_SIZE; ++i) { EXPECT_FLOAT_EQ(1.0, cpuTargets[i]); } } TEST_F(PerturbationTest, translation_test) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); - hl_generate_disturb_params(gpuAngle, gpuScaleRatio, gpuCenterR, - gpuCenterC, NUM_IMAGES, IMG_SIZE, 0.0, - 0.0, SAMPLING_RATE, false); + hl_generate_disturb_params(gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + NUM_IMAGES, + IMG_SIZE, + 0.0, + 0.0, + SAMPLING_RATE, + false); generateTranslationParams(gpuCenterR, gpuCenterC, IMG_SIZE); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb_with_params( - gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, NUM_IMAGES, SAMPLING_RATE, - gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, 2, targets); + hl_conv_random_disturb_with_params(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) { for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) { const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p; @@ -191,50 +227,80 @@ 
TEST_F(PerturbationTest, translation_test) { } TEST_F(PerturbationTest, rotation_test) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); - hl_generate_disturb_params(gpuAngle, gpuScaleRatio, gpuCenterR, - gpuCenterC, NUM_IMAGES, IMG_SIZE, 0.0, - 0.0, SAMPLING_RATE, false); + hl_generate_disturb_params(gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + NUM_IMAGES, + IMG_SIZE, + 0.0, + 0.0, + SAMPLING_RATE, + false); generateRotationParams(gpuAngle); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb_with_params( - gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, NUM_IMAGES, SAMPLING_RATE, - gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, 2, targets); + hl_conv_random_disturb_with_params(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); for (int i = 0; i < TARGET_MEM_SIZE; ++i) { EXPECT_FLOAT_EQ(1.0, cpuTargets[i]); } } TEST_F(PerturbationTest, scale_test) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); - hl_generate_disturb_params(gpuAngle, gpuScaleRatio, gpuCenterR, - gpuCenterC, NUM_IMAGES, IMG_SIZE, 0.0, - 0.0, SAMPLING_RATE, false); + hl_generate_disturb_params(gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + NUM_IMAGES, + IMG_SIZE, + 0.0, + 0.0, + SAMPLING_RATE, + false); generateScaleParams(gpuScaleRatio); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb_with_params( - gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, NUM_IMAGES, SAMPLING_RATE, - gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, 2, targets); + hl_conv_random_disturb_with_params(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) { for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) { const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p; diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp index 6048dd81122292c6af4a726217d13794ee0f019c..d7aa20eb984417ff3907b078a263c5651d6209d3 100644 --- a/paddle/math/tests/test_sparseMatrixCompare.cpp +++ b/paddle/math/tests/test_sparseMatrixCompare.cpp @@ -155,7 +155,7 @@ TEST(SMatrix, sMatrixMul) { for (auto M : {1, 40, 128, 200}) { for (auto N : {100, 2000, 20480}) { for (auto K : {100, 512, 1024}) { - VLOG(3) << " M=" << M << " N=" << N << " K=" << K;; + VLOG(3) << " M=" << M << " N=" << N << " K=" << K; testSpMatrixMul(M, N, K, 0.05); } } diff --git a/paddle/parameter/Argument.cpp 
b/paddle/parameter/Argument.cpp index 42c74661d2b2cebe0c2f5f14d0970ab2f1fec866..81d53f065b84b2699141fc599b9efba794bbd25a 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Argument.h" #include "paddle/math/SparseMatrix.h" #include namespace paddle { -static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu, +static void resizeAndCopy(MatrixPtr& dest, + const MatrixPtr& src, + bool useGpu, hl_stream_t stream) { if (src) { if (!dest) { @@ -34,7 +35,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu, } } -static void resizeAndCopy(IVectorPtr& dest, const IVectorPtr& src, bool useGpu, +static void resizeAndCopy(IVectorPtr& dest, + const IVectorPtr& src, + bool useGpu, hl_stream_t stream) { if (src) { IVector::resizeOrCreate(dest, src->getSize(), useGpu); @@ -56,8 +59,11 @@ static void resizeAndCopy(ICpuGpuVectorPtr& dest, } } -static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, - int32_t startRow, int32_t copySize, bool useGpu, +static void resizeAndCopy(MatrixPtr& dest, + const MatrixPtr& src, + int32_t startRow, + int32_t copySize, + bool useGpu, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK_LE((size_t)startRow + copySize, src->getHeight()); @@ -84,8 +90,11 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, } } -static void resizeAndCopy(IVectorPtr& dest, const IVectorPtr& src, - int32_t startPos, int32_t copySize, bool useGpu, +static void resizeAndCopy(IVectorPtr& dest, + const IVectorPtr& src, + int32_t startPos, + int32_t copySize, + bool useGpu, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK_LE((size_t)startPos + copySize, src->getSize()); @@ -115,7 +124,8 @@ static void resizeAndCopy(ICpuGpuVectorPtr& dest, } static void resizeAndCopy(UserDefinedVectorPtr& dest, - const UserDefinedVectorPtr& src, bool useGpu, + const UserDefinedVectorPtr& src, + bool useGpu, hl_stream_t stream) { if (src) { CHECK(!useGpu) << "not implemented"; @@ -132,8 +142,10 @@ static void resizeAndCopy(UserDefinedVectorPtr& dest, } static void resizeAndCopy(UserDefinedVectorPtr& dest, - const UserDefinedVectorPtr& src, int32_t startPos, - int32_t copySize, bool useGpu, + const UserDefinedVectorPtr& src, + int32_t startPos, + int32_t copySize, + bool useGpu, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK(!useGpu) << "not implemented"; @@ -151,7 +163,9 @@ static void resizeAndCopy(UserDefinedVectorPtr& dest, } } -static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, bool useGpu, +static void resizeAndCopy(SVectorPtr& dest, + const SVectorPtr& src, + bool useGpu, hl_stream_t stream) { if (src) { size_t height = src->size(); @@ -166,8 +180,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, bool useGpu, } } -static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, - int32_t startPos, int32_t copySize, bool useGpu, +static void resizeAndCopy(SVectorPtr& dest, + const SVectorPtr& src, + int32_t startPos, + int32_t copySize, + bool useGpu, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK_LE((size_t)startPos + copySize, src->size()); @@ -184,37 +201,46 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, } void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) { - resizeAndCopyFrom(src, 
useGpu, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); + resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); } -void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, +void Argument::resizeAndCopyFrom(const Argument& src, + bool useGpu, hl_stream_t stream) { dataId = src.dataId; resizeAndCopy(value, src.value, useGpu, stream); resizeAndCopy(grad, src.grad, useGpu, stream); resizeAndCopy(in, src.in, useGpu, stream); resizeAndCopy(ids, src.ids, useGpu, stream); - resizeAndCopy(sequenceStartPositions, src.sequenceStartPositions, - false /* useGpu */, stream); + resizeAndCopy(sequenceStartPositions, + src.sequenceStartPositions, + false /* useGpu */, + stream); if (src.hasSubseq()) { resizeAndCopy(subSequenceStartPositions, - src.subSequenceStartPositions, false /* useGpu */, stream); + src.subSequenceStartPositions, + false /* useGpu */, + stream); } resizeAndCopy(udp, src.udp, useGpu, stream); resizeAndCopy(strs, src.strs, useGpu, stream); } -int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu) { - int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu, - HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - return size; +int32_t Argument::resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu) { + int32_t size = + resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + return size; } -int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu, +int32_t Argument::resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu, hl_stream_t stream) { dataId = src.dataId; @@ -239,8 +265,12 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream); resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream); resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream); - resizeAndCopy(sequenceStartPositions, src.sequenceStartPositions, - startSeq, copySize + 1, false, stream); + resizeAndCopy(sequenceStartPositions, + src.sequenceStartPositions, + startSeq, + copySize + 1, + false, + stream); // modify new sequenceStartPositions int* destSequences = sequenceStartPositions->getMutableData(false); for (int i = 0; i < copySize + 1; i++) { @@ -264,8 +294,11 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, } int32_t copySubSize = subEndSeq - subStartSeq; resizeAndCopy(subSequenceStartPositions, - src.subSequenceStartPositions, subStartSeq, - copySubSize + 1, false, stream); + src.subSequenceStartPositions, + subStartSeq, + copySubSize + 1, + false, + stream); // modify new subSequenceStartPositions int* destSubSequences = subSequenceStartPositions->getMutableData(false); for (int i = 0; i < copySubSize + 1; i++) { @@ -281,14 +314,19 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, void Argument::concat(const std::vector& args, const std::vector& selectRows, - const std::vector& seqStartPos, bool useGpu, - hl_stream_t stream, PassType passType) { + const std::vector& seqStartPos, + bool useGpu, + hl_stream_t stream, + PassType passType) { CHECK(!subSequenceStartPositions) - << "undefined behavior for subsequence positions"; + << "undefined behavior for subsequence positions"; size_t batchSize = 
selectRows.size(); - auto copyArg = [batchSize, stream](MatrixPtr& dst, MatrixPtr src, - int startRow, int pos, int size, + auto copyArg = [batchSize, stream](MatrixPtr& dst, + MatrixPtr src, + int startRow, + int pos, + int size, bool useGpu) { if (!src) { dst.reset(); @@ -305,8 +343,11 @@ void Argument::concat(const std::vector& args, tmpMatrix->copyFrom(*src->subMatrix(pos, size), stream); }; - auto copyIds = [batchSize, stream](IVectorPtr& dst, const IVectorPtr& src, - int startRow, int pos, int size, + auto copyIds = [batchSize, stream](IVectorPtr& dst, + const IVectorPtr& src, + int startRow, + int pos, + int size, bool useGpu) { if (!src) { dst.reset(); @@ -316,8 +357,11 @@ void Argument::concat(const std::vector& args, dst->subVec(startRow, size)->copyFrom(*src->subVec(pos, size), stream); }; - auto copyStrs = [batchSize, stream](SVectorPtr& dst, const SVectorPtr& src, - int startRow, int pos, int size, + auto copyStrs = [batchSize, stream](SVectorPtr& dst, + const SVectorPtr& src, + int startRow, + int pos, + int size, bool useGpu) { if (!src) { dst.reset(); @@ -328,8 +372,8 @@ void Argument::concat(const std::vector& args, } else { dst->resize(batchSize); } - std::copy(src->begin() + pos, src->begin() + pos + size, - dst->begin() + startRow); + std::copy( + src->begin() + pos, src->begin() + pos + size, dst->begin() + startRow); }; dataId = args[0].dataId; @@ -354,14 +398,16 @@ void Argument::concat(const std::vector& args, copyStrs(strs, arg.strs, j, rowIdx, copySize, useGpu); } } - ICpuGpuVector::resizeOrCreate(sequenceStartPositions, - seqStartPos.size(), useGpu); - sequenceStartPositions->copyFrom(seqStartPos.data(), - seqStartPos.size(), useGpu); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, seqStartPos.size(), useGpu); + sequenceStartPositions->copyFrom( + seqStartPos.data(), seqStartPos.size(), useGpu); } -void Argument::concat(const std::vector& args, bool useGpu, - hl_stream_t stream, PassType passType) { +void Argument::concat(const std::vector& args, + bool useGpu, + hl_stream_t stream, + PassType passType) { int32_t batchSize = 0; int64_t numSequences = 0; int64_t numSubSequences = 0; @@ -371,8 +417,8 @@ void Argument::concat(const std::vector& args, bool useGpu, numSubSequences += arg.getNumSubSequences(); } - auto copyArg = [batchSize, stream](MatrixPtr& dst, MatrixPtr src, - int startRow, bool useGpu) { + auto copyArg = [batchSize, stream]( + MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) { if (!src) { dst.reset(); return; @@ -388,8 +434,8 @@ void Argument::concat(const std::vector& args, bool useGpu, tmpMatrix->copyFrom(*src, stream); }; - auto copyIds = [batchSize, stream](IVectorPtr& dst, const IVectorPtr& src, - int startRow, bool useGpu) { + auto copyIds = [batchSize, stream]( + IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) { if (!src) { dst.reset(); return; @@ -398,8 +444,8 @@ void Argument::concat(const std::vector& args, bool useGpu, dst->subVec(startRow, src->getSize())->copyFrom(*src, stream); }; - auto copyStrs = [batchSize, stream](SVectorPtr& dst, const SVectorPtr& src, - int startRow, bool useGpu) { + auto copyStrs = [batchSize, stream]( + SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) { if (!src) { dst.reset(); return; @@ -412,21 +458,23 @@ void Argument::concat(const std::vector& args, bool useGpu, std::copy(src->begin(), src->end(), dst->begin() + startRow); }; - auto copySequencePos = [] - (ICpuGpuVectorPtr& dstSeq, const ICpuGpuVectorPtr& srcSeq, - int dstNumSequences, int 
srcNumSequences, - int& startSequences, int startRow) { - if (srcSeq) { - ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false); - const int* src = srcSeq->getData(false); - int* dest = dstSeq->getMutableData(false); - for (int i = 0; i < srcNumSequences + 1; ++i) { - dest[i + startSequences] = src[i] + startRow; - } - startSequences += srcNumSequences; - } else { - dstSeq.reset(); + auto copySequencePos = [](ICpuGpuVectorPtr& dstSeq, + const ICpuGpuVectorPtr& srcSeq, + int dstNumSequences, + int srcNumSequences, + int& startSequences, + int startRow) { + if (srcSeq) { + ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false); + const int* src = srcSeq->getData(false); + int* dest = dstSeq->getMutableData(false); + for (int i = 0; i < srcNumSequences + 1; ++i) { + dest[i + startSequences] = src[i] + startRow; } + startSequences += srcNumSequences; + } else { + dstSeq.reset(); + } }; int startRow = 0; @@ -479,8 +527,8 @@ void Argument::splitByDataId(const std::vector& argus, void Argument::getSeqInfo(std::vector* seqInfo) const { const int* starts = sequenceStartPositions->getData(false); - const int* subStarts = hasSubseq() - ? subSequenceStartPositions->getData(false) : nullptr; + const int* subStarts = + hasSubseq() ? subSequenceStartPositions->getData(false) : nullptr; size_t numSequences = getNumSequences(); seqInfo->reserve(numSequences); int subSeqEnd = 0; @@ -501,7 +549,8 @@ void Argument::getSeqInfo(std::vector* seqInfo) const { } seqInfo->push_back(info); } - std::sort(seqInfo->begin(), seqInfo->end(), + std::sort(seqInfo->begin(), + seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) { return a.topLevelLength > b.topLevelLength; }); @@ -535,9 +584,8 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) { CHECK_EQ(input.hasSubseq(), 1UL); size_t numSequences = input.getNumSequences(); size_t numSubSequences = input.getNumSubSequences(); - ICpuGpuVector::resizeOrCreate(sequenceStartPositions, - numSequences + 1, - false); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); int* tgtBuf = sequenceStartPositions->getMutableData(false); const int* starts = input.sequenceStartPositions->getData(false); const int* subStarts = input.subSequenceStartPositions->getData(false); @@ -551,24 +599,29 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) { tgtBuf[numSequences] = numSubSequences; } -void Argument::subArgFrom(const Argument& input, size_t offset, size_t height, - size_t width, bool useGpu, bool trans, bool seqFlag, - size_t seqStart, size_t seqSize) { +void Argument::subArgFrom(const Argument& input, + size_t offset, + size_t height, + size_t width, + bool useGpu, + bool trans, + bool seqFlag, + size_t seqStart, + size_t seqSize) { if (input.value) { - value = Matrix::create(input.value->getData() + offset * width, - height, width, trans, useGpu); + value = Matrix::create( + input.value->getData() + offset * width, height, width, trans, useGpu); } if (input.ids) { ids = IVector::create(input.ids->getData() + offset, height, useGpu); } if (input.grad) { - grad = Matrix::create(input.grad->getData() + offset * width, - height, width, trans, useGpu); + grad = Matrix::create( + input.grad->getData() + offset * width, height, width, trans, useGpu); } if (seqFlag) { sequenceStartPositions = std::make_shared( - *(input.sequenceStartPositions), - seqStart, seqSize); + *(input.sequenceStartPositions), seqStart, seqSize); } } diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 
81ff9029bc4c8fca7adbabd7ae65caf7ac2f3c2a..2b20122debf935562d36f29d872e8ef3243111e0 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "hl_gpu.h" @@ -153,9 +152,8 @@ struct Argument { } int64_t getNumSubSequences() const { - return subSequenceStartPositions - ? subSequenceStartPositions->getSize() - 1 - : getBatchSize(); + return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1 + : getBatchSize(); } bool hasSubseq() const { return subSequenceStartPositions != nullptr; } @@ -190,9 +188,14 @@ struct Argument { * @param seqStart[in] offset of input.sequenceStartPositions * @param seqSize[in] lenght of output.sequenceStartPositions */ - void subArgFrom(const Argument& input, size_t offset, size_t height, - size_t width, bool useGpu, bool trans = false, - bool seqFlag = false, size_t seqStart = 0, + void subArgFrom(const Argument& input, + size_t offset, + size_t height, + size_t width, + bool useGpu, + bool trans = false, + bool seqFlag = false, + size_t seqStart = 0, size_t seqSize = 0); /* * for sequence input: @@ -206,16 +209,21 @@ struct Argument { * Note that when specifying the stream explicitly in this case, * synchronize should also be called somewhere after this function */ - int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu, hl_stream_t stream); + int32_t resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu, + hl_stream_t stream); /* * same with the above function, except that the stream is * HPPL_STREAM_DEFAULT and synchronize is automatically called * inside it */ - int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu = FLAGS_use_gpu); + int32_t resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu = FLAGS_use_gpu); void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream); @@ -237,13 +245,16 @@ struct Argument { */ void concat(const std::vector& args, const std::vector& selectRows, - const std::vector& seqStartPos, bool useGpu, - hl_stream_t stream, PassType passType); + const std::vector& seqStartPos, + bool useGpu, + hl_stream_t stream, + PassType passType); /* Concatenate several args into one and put the result into this. */ - void concat(const std::vector& src, bool useGpu = FLAGS_use_gpu, + void concat(const std::vector& src, + bool useGpu = FLAGS_use_gpu, hl_stream_t stream = HPPL_STREAM_DEFAULT, PassType passType = PASS_TEST); diff --git a/paddle/parameter/AverageOptimizer.cpp b/paddle/parameter/AverageOptimizer.cpp index 4f730059c748f36d690f388d29d213c676ac9626..593594761ed57495b92a30a8f3e8e86cdb45bfce 100644 --- a/paddle/parameter/AverageOptimizer.cpp +++ b/paddle/parameter/AverageOptimizer.cpp @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
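
The two resizeAndCopyFrom flavors declared above differ only in who synchronizes: the stream-taking overload enqueues the copies and leaves synchronization to the caller, while the default-stream overload calls hl_stream_synchronize itself. A self-contained toy sketch of that contract (Stream, the vector payload, and the function names are stand-ins, not the real hl_stream_t/Vector API):

    #include <cassert>
    #include <functional>
    #include <queue>
    #include <vector>

    struct Stream {                    // stand-in for hl_stream_t
      std::queue<std::function<void()>> pending;
      void synchronize() {             // drain everything enqueued so far
        while (!pending.empty()) { pending.front()(); pending.pop(); }
      }
    };

    // Stream-taking variant: only enqueues; caller must synchronize later.
    static void resizeAndCopy(std::vector<float>& dest,
                              const std::vector<float>& src,
                              Stream& stream) {
      dest.resize(src.size());                    // resize-or-create step
      stream.pending.push([&] { dest = src; });   // copy runs asynchronously
    }

    // Convenience variant: enqueue on the default stream, then wait, the way
    // the stream-less resizeAndCopyFrom overload does internally.
    static void resizeAndCopySync(std::vector<float>& dest,
                                  const std::vector<float>& src,
                                  Stream& stream) {
      resizeAndCopy(dest, src, stream);
      stream.synchronize();
    }

    int main() {
      Stream stream;
      std::vector<float> src{1, 2, 3}, dst;
      resizeAndCopySync(dst, src, stream);  // safe to read dst afterwards
      assert(dst == src);
      return 0;
    }
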
*/ - #include "AverageOptimizer.h" namespace paddle { // factory method to create an instance of AverageOptimizer ParameterOptimizer* AverageOptimizer::create( - const OptimizationConfig& optConfig, ParameterOptimizer* optimizer, - bool isParameterSparse, bool useParameterApply) { + const OptimizationConfig& optConfig, + ParameterOptimizer* optimizer, + bool isParameterSparse, + bool useParameterApply) { if (optConfig.average_window() <= 0) { return optimizer; } @@ -44,8 +45,8 @@ AverageOptimizer::AverageOptimizer(const OptimizationConfig& optConfig, prevNumUpdates_(0), numAccumulates_(0), oldNumAccumulates_(0), - minAverageWindow_(std::min( - 10000L, optConfig_.max_average_window())), + minAverageWindow_( + std::min(10000L, optConfig_.max_average_window())), maxAverageWindow_(optConfig_.max_average_window()) { parameterTypes_ = optimizer_->getParameterTypes(); addParameterType(PARAMETER_SUM1); @@ -121,17 +122,27 @@ ParameterOptimizer::TraverseCallback AverageOptimizer::apply() { real scale = 1. / (numAccumulates_ + oldNumAccumulates_); if (useApply_) { - return [scale](const VectorPtr vecs[], const ParameterConfig& config, + return [scale](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { - vecs[PARAMETER_APPLY]->add3(*vecs[PARAMETER_SUM1], *vecs[PARAMETER_SUM2], - *vecs[PARAMETER_SUM3], scale, scale, scale); + vecs[PARAMETER_APPLY]->add3(*vecs[PARAMETER_SUM1], + *vecs[PARAMETER_SUM2], + *vecs[PARAMETER_SUM3], + scale, + scale, + scale); }; } else { - return [scale](const VectorPtr vecs[], const ParameterConfig& config, + return [scale](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { vecs[PARAMETER_GRADIENT]->copyFrom(*vecs[PARAMETER_VALUE]); - vecs[PARAMETER_VALUE]->add3(*vecs[PARAMETER_SUM1], *vecs[PARAMETER_SUM2], - *vecs[PARAMETER_SUM3], scale, scale, scale); + vecs[PARAMETER_VALUE]->add3(*vecs[PARAMETER_SUM1], + *vecs[PARAMETER_SUM2], + *vecs[PARAMETER_SUM3], + scale, + scale, + scale); }; } } @@ -144,8 +155,8 @@ ParameterOptimizer::TraverseCallback AverageOptimizer::restore() { return nullptr; } - return [](const VectorPtr vecs[], const ParameterConfig& config, - size_t sparseId) { + return []( + const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) { vecs[PARAMETER_VALUE]->copyFrom(*vecs[PARAMETER_GRADIENT]); vecs[PARAMETER_GRADIENT]->zeroMem(); }; @@ -174,7 +185,8 @@ ParameterOptimizer::TraverseCallback AverageSparseOptimizer::startCatchUpWith() if (timer_ > 0) { callbacks.emplace_back( - [this](const VectorPtr vecs[], const ParameterConfig& config, + [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { this->catchUpWith(vecs, config, sparseId); }); } diff --git a/paddle/parameter/AverageOptimizer.h b/paddle/parameter/AverageOptimizer.h index 8e0ead84125ab283756acdbd3bf9120918adcf35..ccc2612608db574274f3e0acaacec7f9eb404223 100644 --- a/paddle/parameter/AverageOptimizer.h +++ b/paddle/parameter/AverageOptimizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "FirstOrderOptimizer.h" @@ -26,7 +25,8 @@ public: // if *useParameterApply* set, use PARAMETER_APPLY to store averaged parameter // else use PARAMETER_VALUE, and value backup in PARAMETER_GRADIENT AverageOptimizer(const OptimizationConfig& optConfig, - ParameterOptimizer* optimizer, bool useParameterApply); + ParameterOptimizer* optimizer, + bool useParameterApply); static ParameterOptimizer* create(const OptimizationConfig& optConfig, ParameterOptimizer* optimizer, @@ -45,7 +45,8 @@ public: virtual void startBatch(int64_t numSamplesProcessed); virtual void finishBatch(); - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const { optimizer_->update(vecs, paraConfig, sparseId); vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], 1.0f); @@ -99,7 +100,8 @@ protected: class AverageSparseOptimizer : public AverageOptimizer { public: AverageSparseOptimizer(const OptimizationConfig& optConfig, - ParameterOptimizer* optimizer, bool useParameterApply) + ParameterOptimizer* optimizer, + bool useParameterApply) : AverageOptimizer(optConfig, optimizer, useParameterApply) {} virtual void init(size_t numRows, const ParameterConfig* config) { @@ -114,9 +116,11 @@ public: AverageOptimizer::finishBatch(); timer_++; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const; - void catchUpWith(const VectorPtr vecs[], const ParameterConfig& paraConfig, + void catchUpWith(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const; virtual TraverseCallback startCatchUpWith() const; virtual void finishCatchUpWith() { diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt index d6f67604c03485dfeb3c907705b117ac550e9b6f..a35e46997fb04e9378e106bf428a629b286c2e8c 100644 --- a/paddle/parameter/CMakeLists.txt +++ b/paddle/parameter/CMakeLists.txt @@ -10,4 +10,4 @@ add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS}) add_dependencies(paddle_parameter gen_proto_cpp) if(WITH_TESTING) add_subdirectory(tests) -endif() \ No newline at end of file +endif() diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp index bb46a51d1e02c6d7e96e33c2cac0585055f026a1..a9be07d062992ff24175339c630426d27e84c22b 100644 --- a/paddle/parameter/FirstOrderOptimizer.cpp +++ b/paddle/parameter/FirstOrderOptimizer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "paddle/utils/Flags.h" @@ -71,13 +70,15 @@ void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[], tau_ * alpha_ * gamma_ * learningRate_); vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], tau_ / beta_ + 1.0 / alpha_, - *vecs[PARAMETER_MOMENTUM_VT], 1.0 / beta_); + *vecs[PARAMETER_MOMENTUM_VT], + 1.0 / beta_); } else { - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - learningRate_ * paraConfig.learning_rate(), paraConfig.momentum(), - applyDecay_ ? paraConfig.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + learningRate_ * paraConfig.learning_rate(), + paraConfig.momentum(), + applyDecay_ ? 
paraConfig.decay_rate() : 0); } } @@ -90,7 +91,8 @@ SparseMomentumParameterOptimizer::needSpecialTraversal( // 2. Note that \tau * u_t + v_t = \beta \theta_t, therefore: // u_t should be rescaled to u_t/alpha_ // v_t should be reset to \theta_t - return [this](const VectorPtr vecs[], const ParameterConfig& config, + return [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_); vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); @@ -120,10 +122,12 @@ void AdagradParameterOptimizer::update(const VectorPtr vecs[], vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon()); vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate_ * config.learning_rate(), + config.momentum(), + applyDecay_ ? config.decay_rate() : 0); } ParameterOptimizer::TraverseCallback @@ -132,7 +136,8 @@ AdagradParameterOptimizer::needSpecialTraversal( if (numUpdates_ % kMaxNumAccumulates == 0) { // Move the sum to a different buffer to avoid loss of precision // due to too many sums. - return [this](const VectorPtr vecs[], const ParameterConfig& config, + return [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { vecs[PARAMETER_GRADIENT_SQURESUM]->add( *vecs[PARAMETER_GRADIENT_SQURESUM1]); @@ -148,24 +153,29 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[], size_t sparseId) const { CHECK(sparseId == -1LU) << "Sparse update is not supported"; // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(*vecs[PARAMETER_GRADIENT], - rou_, 1.0f - rou_); + vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( + *vecs[PARAMETER_GRADIENT], rou_, 1.0f - rou_); // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) ) vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1], *vecs[PARAMETER_GRADIENT_SQURESUM], - epsilon_, epsilon_); + epsilon_, + epsilon_); vecs[PARAMETER_LEARNING_RATE]->sqrt(); // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_LEARNING_RATE], rou_, + *vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_LEARNING_RATE], + rou_, 1.0f - rou_); - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate_ * config.learning_rate(), + config.momentum(), + applyDecay_ ? config.decay_rate() : 0); } void RMSPropParameterOptimizer::update(const VectorPtr vecs[], @@ -185,12 +195,13 @@ void RMSPropParameterOptimizer::update(const VectorPtr vecs[], // For the first time update, make the sum be the current square // so that the initial estimation of E(g_t^2) will not be too small. vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, + *vecs[PARAMETER_GRADIENT], + accumulatedRou, firstTime ? 
1.0f : 1.0f - rou_); // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g - vecs[PARAMETER_GRADIENT_SQURESUM1]->add(*vecs[PARAMETER_GRADIENT], - accumulatedRou, 1.0f - rou_); + vecs[PARAMETER_GRADIENT_SQURESUM1]->add( + *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou_); // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon ) // Basiclly if the sign of the gradient changes more often, @@ -201,10 +212,12 @@ void RMSPropParameterOptimizer::update(const VectorPtr vecs[], vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon()); vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate_ * config.learning_rate(), + config.momentum(), + applyDecay_ ? config.decay_rate() : 0); } void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], @@ -224,7 +237,8 @@ void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], // For the first time update, make the sum be the current square // so that the initial estimation of E(g_t^2) will not be too small. vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, + *vecs[PARAMETER_GRADIENT], + accumulatedRou, firstTime ? 1.0f : 1.0f - rou_); // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon ) @@ -234,10 +248,12 @@ void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]); vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate_ * config.learning_rate(), + config.momentum(), + applyDecay_ ? config.decay_rate() : 0); } void AdamParameterOptimizer::update(const VectorPtr vecs[], @@ -290,7 +306,6 @@ void AdamaxParameterOptimizer::update(const VectorPtr vecs[], theta->add(*theta, 1.0, *g, -learningRate); } - void OptimizerWithGradientClipping::update(const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) const { diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h index ad5f48097643a10d8b6f5bf3202211aa2b092469..a9a2ffdd41310d1927df012be8328d0e4bd3af0f 100644 --- a/paddle/parameter/FirstOrderOptimizer.h +++ b/paddle/parameter/FirstOrderOptimizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "ParameterOptimizer.h" @@ -31,21 +30,22 @@ public: virtual void startBatch(int64_t numSamplesProcessed) { learningRate_ = calcLearningRate(numSamplesProcessed, pass_); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const { (void)sparseId; - real torch_learningRate = optConfig_.learning_method() == "torch_momentum" ? 
- 1.0 - paraConfig.momentum() : 1.0; + real torch_learningRate = optConfig_.learning_method() == "torch_momentum" + ? 1.0 - paraConfig.momentum() + : 1.0; vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], learningRate_ * paraConfig.learning_rate() * - (firstTime_ ? 1.0 : torch_learningRate), + (firstTime_ ? 1.0 : torch_learningRate), paraConfig.momentum(), applyDecay_ ? paraConfig.decay_rate() : 0); } - virtual void finishBatch() { - firstTime_ = false; - } + virtual void finishBatch() { firstTime_ = false; } }; // SGD optimization with sparse support. @@ -71,7 +71,8 @@ public: const OptimizationConfig& optConfig); virtual void init(size_t numRows, const ParameterConfig* config); virtual void startBatch(int64_t numSamplesProcessed); - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const; virtual TraverseCallback needSpecialTraversal( const ParameterConfig& config) const; @@ -111,7 +112,8 @@ public: (void)numSamplesProcessed; ++numUpdates_; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; virtual TraverseCallback needSpecialTraversal( const ParameterConfig& config) const; @@ -141,7 +143,8 @@ public: learningRate_ = calcLearningRate(numSamplesProcessed, pass_); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -173,7 +176,8 @@ public: } virtual void finishBatch() { timer_++; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -214,7 +218,8 @@ public: } virtual void finishBatch() { timer_++; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -251,7 +256,8 @@ public: virtual void finishBatch() { ++step_; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -280,7 +286,8 @@ public: virtual void finishBatch() { ++step_; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -301,7 +308,8 @@ public: // learningRate required by regularizer learningRate_ = calcLearningRate(numSamplesProcessed, pass_); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const { vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_GRADIENT], optConfig_.delta_add_rate()); @@ -314,7 +322,8 @@ public: explicit DummyOptimizer(const OptimizationConfig& optConfig) : ParameterOptimizer(optConfig) {} - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const {} }; @@ -344,7 +353,8 @@ public: const ParameterConfig& 
config) const { return optimizer_->needSpecialTraversal(config); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; virtual void setNoDecay() { optimizer_->setNoDecay(); } diff --git a/paddle/parameter/LearningRateScheduler.cpp b/paddle/parameter/LearningRateScheduler.cpp index ce045ebf05a226215d565bf0281f245918e13055..a7412500ccfa05707286f0ad493ad8280eee1cbc 100644 --- a/paddle/parameter/LearningRateScheduler.cpp +++ b/paddle/parameter/LearningRateScheduler.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "LearningRateScheduler.h" #include "paddle/utils/StringUtil.h" diff --git a/paddle/parameter/LearningRateScheduler.h b/paddle/parameter/LearningRateScheduler.h index 74fb848fabe1ad9bbea8620d51d9d3674eb8a526..e987c3dcde120b8c88d58de7a18ee5c6db85bb5c 100644 --- a/paddle/parameter/LearningRateScheduler.h +++ b/paddle/parameter/LearningRateScheduler.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "TrainerConfig.pb.h" @@ -20,9 +19,10 @@ limitations under the License. */ namespace paddle { // NOLINTNEXTLINES_4 -#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - LearningRateScheduler::registrar_.registerClass<__class_name>(#__type_name); \ +#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([]() { \ + LearningRateScheduler::registrar_.registerClass<__class_name>( \ + #__type_name); \ }) class LearningRateScheduler { diff --git a/paddle/parameter/OptimizerFunctions.cpp b/paddle/parameter/OptimizerFunctions.cpp index 5adcf86efd5284ab5bc3131217c9e44172caa71b..6fd7964347644214533007dc1e11e6fa45ee9ea6 100644 --- a/paddle/parameter/OptimizerFunctions.cpp +++ b/paddle/parameter/OptimizerFunctions.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
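
The AdaDelta branch reformatted in FirstOrderOptimizer.cpp above implements the recurrences spelled out in its comments: E(g_t^2) = rou * E(g_{t-1}^2) + (1 - rou) * g^2, an adaptive rate sqrt((E(dx^2) + eps) / (E(g^2) + eps)), and a decayed accumulator for the squared step. A one-parameter scalar sketch (toy constants; the real code works element-wise on Vectors through decayAddSquare and sgdUpdate):

    #include <cmath>
    #include <cstdio>

    int main() {
      // Decaying moment estimates for one scalar parameter (rho plays the
      // role of the comments' \rou; both accumulators start at zero).
      double rho = 0.95, eps = 1e-6, lr = 1.0;
      double Eg2 = 0.0, Edx2 = 0.0, param = 1.0;
      double grads[] = {0.3, -0.1, 0.2};
      for (double g : grads) {
        Eg2 = rho * Eg2 + (1 - rho) * g * g;                  // E(g_t^2)
        double rate = std::sqrt((Edx2 + eps) / (Eg2 + eps));  // adaptive rate
        double dx = -rate * g;                                // proposed step
        Edx2 = rho * Edx2 + (1 - rho) * dx * dx;              // E(dx_t^2)
        param += lr * dx;                                     // apply the step
      }
      std::printf("param after three AdaDelta steps: %g\n", param);
      return 0;
    }
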
*/ - #include "AverageOptimizer.h" #include "FirstOrderOptimizer.h" #include "OptimizerWithRegularizer.h" @@ -22,19 +21,22 @@ namespace paddle { // creator for AverageOptimizer ParameterOptimizer* sgdOptimizerCreate(const OptimizationConfig& optConfig, const ParameterConfig& paraConfig, - bool isParameterSparse, bool inPserver) { + bool isParameterSparse, + bool inPserver) { ParameterOptimizer* optimizer = OptimizerWithRegularizer::create( optConfig, paraConfig, isParameterSparse, inPserver); - return AverageOptimizer::create(optConfig, optimizer, isParameterSparse, - inPserver /*useParameterApply*/); + return AverageOptimizer::create( + optConfig, optimizer, isParameterSparse, inPserver /*useParameterApply*/); } std::vector sgdOptimizerGetTypes( const OptimizationConfig& optConfig, bool inPserver) { std::unique_ptr optimizer; - optimizer.reset(AverageOptimizer::create( - optConfig, ParameterOptimizer::create(optConfig, inPserver), - false /*isParameterSparse*/, inPserver)); + optimizer.reset( + AverageOptimizer::create(optConfig, + ParameterOptimizer::create(optConfig, inPserver), + false /*isParameterSparse*/, + inPserver)); CHECK(optimizer) << "fail to create optimizer: " << optConfig.learning_method(); return optimizer->getParameterTypes(); diff --git a/paddle/parameter/OptimizerFunctions.h b/paddle/parameter/OptimizerFunctions.h index 9592658224d856fff1a2bde5e400ea85f95cd521..a5f8b2c56942720335c0df6c9d71fd4e15494600 100644 --- a/paddle/parameter/OptimizerFunctions.h +++ b/paddle/parameter/OptimizerFunctions.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "FirstOrderOptimizer.h" @@ -25,7 +24,8 @@ namespace paddle { */ ParameterOptimizer* sgdOptimizerCreate(const OptimizationConfig& optConfig, const ParameterConfig& paraConfig, - bool isParameterSparse, bool inPserver); + bool isParameterSparse, + bool inPserver); /* * Get the parameter types needed for the specific optimization diff --git a/paddle/parameter/OptimizerWithRegularizer.cpp b/paddle/parameter/OptimizerWithRegularizer.cpp index 0da27a51c6d29337864222d2e85126113f7f6431..5381e7bef3b177884d85671ef6e3dfbc0de1d5ed 100644 --- a/paddle/parameter/OptimizerWithRegularizer.cpp +++ b/paddle/parameter/OptimizerWithRegularizer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "OptimizerWithRegularizer.h" namespace paddle { @@ -24,7 +23,8 @@ OptimizerWithRegularizerEveryNumBatches::needSpecialTraversal( if (isRegularizationBatch(config)) { callbacks.emplace_back( - [this](const VectorPtr vecs[], const ParameterConfig& config, + [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { this->doTraversal(vecs, config); }); } @@ -39,8 +39,8 @@ void OptimizerWithRegularizerEveryNumBatches::doTraversal( const VectorPtr vecs[], const ParameterConfig& config) const { int32_t base = std::max(baseTimer_, (timer_ + 1 - config.num_batches_regularization())); - regularizer_->update(vecs, config, optimizer_->getLearningRate(), base, - timer_ + 1); + regularizer_->update( + vecs, config, optimizer_->getLearningRate(), base, timer_ + 1); } ParameterOptimizer::TraverseCallback @@ -53,7 +53,8 @@ OptimizerWithRegularizerEveryNumBatches::startCatchUpWith() const { if (baseTimer_ < timer_) { callbacks.emplace_back( - [this](const VectorPtr vecs[], const ParameterConfig& config, + [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { this->catchUpWith(vecs, config, sparseId); }); } @@ -61,11 +62,15 @@ OptimizerWithRegularizerEveryNumBatches::startCatchUpWith() const { } void OptimizerWithRegularizerEveryNumBatches::catchUpWith( - const VectorPtr vecs[], const ParameterConfig& config, + const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const { int32_t base = timer_ - timer_ % config.num_batches_regularization(); - regularizer_->update(vecs, config, optimizer_->getLearningRate(), - std::max(base, baseTimer_), timer_); + regularizer_->update(vecs, + config, + optimizer_->getLearningRate(), + std::max(base, baseTimer_), + timer_); } void OptimizerWithRegularizerSparse::init(size_t numRows, @@ -83,8 +88,11 @@ void OptimizerWithRegularizerSparse::update(const VectorPtr vecs[], optimizer_->update(vecs, config, sparseId); // para W(t0) -> W(t+1) CHECK_LT(sparseId, t0Vec_.size()); - regularizer_->update(vecs, config, optimizer_->getLearningRate(), - t0Vec_[sparseId], timer_ + 1); + regularizer_->update(vecs, + config, + optimizer_->getLearningRate(), + t0Vec_[sparseId], + timer_ + 1); t0Vec_[sparseId] = timer_ + 1; } @@ -98,7 +106,8 @@ OptimizerWithRegularizerSparse::startCatchUpWith() const { if (timer_ > 0) { callbacks.emplace_back( - [this](const VectorPtr vecs[], const ParameterConfig& config, + [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { this->catchUpWith(vecs, config, sparseId); }); } @@ -110,18 +119,20 @@ void OptimizerWithRegularizerSparse::catchUpWith(const VectorPtr vecs[], size_t sparseId) const { // para W(t0) -> W(t+1) CHECK_LT(sparseId, t0Vec_.size()); - regularizer_->update(vecs, config, optimizer_->getLearningRate(), - t0Vec_[sparseId], timer_); + regularizer_->update( + vecs, config, optimizer_->getLearningRate(), t0Vec_[sparseId], timer_); } // factory method to create instance of OptimizerWithRegularizer ParameterOptimizer* OptimizerWithRegularizer::create( - const OptimizationConfig& optConfig, const ParameterConfig& paraConfig, - bool isParameterSparse, bool inPserver) { + const OptimizationConfig& optConfig, + const ParameterConfig& paraConfig, + bool isParameterSparse, + bool inPserver) { ParameterOptimizer* optimizer = ParameterOptimizer::create(optConfig, inPserver); if (paraConfig.gradient_clipping_threshold() > 0.0f && - !dynamic_cast(optimizer)) { + !dynamic_cast(optimizer)) { optimizer = new OptimizerWithGradientClipping(optConfig, 
optimizer); } Regularizer* regularizer = @@ -157,23 +168,23 @@ ParameterOptimizer* OptimizerWithRegularizer::create( } // normal optimizer->setNoDecay(); - return new OptimizerWithRegularizerEveryNumBatches(optConfig, optimizer, - regularizer); + return new OptimizerWithRegularizerEveryNumBatches( + optConfig, optimizer, regularizer); } if (isParameterSparse) { - CHECK(paraConfig.momentum() == 0.0f) - << "Parameter cannot support momentum if it's sparse."; + CHECK(paraConfig.momentum() == 0.0f) + << "Parameter cannot support momentum if it's sparse."; optimizer->setNoDecay(); - return new OptimizerWithRegularizerSparse(optConfig, optimizer, - regularizer); + return new OptimizerWithRegularizerSparse( + optConfig, optimizer, regularizer); } // dense if (paraConfig.decay_rate_l1() == 0.0f || - dynamic_cast(optimizer)) { + dynamic_cast(optimizer)) { return optimizer; } CHECK(paraConfig.momentum() == 0.0f) - << "Parameter cannot support momentum if it use L1 decay."; + << "Parameter cannot support momentum if it use L1 decay."; optimizer->setNoDecay(); return new OptimizerWithRegularizer(optConfig, optimizer, regularizer); } diff --git a/paddle/parameter/OptimizerWithRegularizer.h b/paddle/parameter/OptimizerWithRegularizer.h index b8b2d5b84d6875c1f9e4ea8a9cd1c93c1fff4be5..ebe23c7397f6d3f14976422342953e493a6fbee1 100644 --- a/paddle/parameter/OptimizerWithRegularizer.h +++ b/paddle/parameter/OptimizerWithRegularizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "FirstOrderOptimizer.h" @@ -24,7 +23,8 @@ class OptimizerWithRegularizer : public ParameterOptimizer { public: static ParameterOptimizer* create(const OptimizationConfig& optConfig, const ParameterConfig& paraConfig, - bool isParameterSparse, bool inPserver); + bool isParameterSparse, + bool inPserver); OptimizerWithRegularizer(const OptimizationConfig& optConfig, ParameterOptimizer* optimizer, @@ -60,7 +60,8 @@ public: return optimizer_->needSpecialTraversal(config); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const { optimizer_->update(vecs, config, sparseId); regularizer_->update(vecs, config, optimizer_->getLearningRate(), 0, 1); @@ -94,7 +95,8 @@ public: baseTimer_ = 0; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const { optimizer_->update(vecs, config, sparseId); } @@ -103,7 +105,8 @@ public: const ParameterConfig& config) const; void doTraversal(const VectorPtr vecs[], const ParameterConfig& config) const; - void catchUpWith(const VectorPtr vecs[], const ParameterConfig& config, + void catchUpWith(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; virtual TraverseCallback startCatchUpWith() const; @@ -130,9 +133,11 @@ public: virtual void init(size_t numRows, const ParameterConfig* config); - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; - void catchUpWith(const VectorPtr vecs[], const ParameterConfig& config, + void catchUpWith(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; virtual TraverseCallback 
startCatchUpWith() const; virtual void finishCatchUpWith() { diff --git a/paddle/parameter/ParallelParameter.cpp b/paddle/parameter/ParallelParameter.cpp index 19cbdab1c8d1e8b4836c8f193901edb5b166f055..99b20a59ca2a8b4a84a5bcbd0fab135ac54de61c 100644 --- a/paddle/parameter/ParallelParameter.cpp +++ b/paddle/parameter/ParallelParameter.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include @@ -152,7 +151,8 @@ void SyncParameter::minorUpdate(real learnRate) { gradSem_->post(); } -AsyncParameter::AsyncParameter(TrainerRole role, int asyncCount, +AsyncParameter::AsyncParameter(TrainerRole role, + int asyncCount, ParameterPtr localParam) : ParallelParameter(role, localParam) { asyncCount_ = asyncCount; diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h index 882033af636529cd845bfaae2253767a37e2cb72..2b65321fe201ae166dbbd6629e9a0ab0c6481699 100644 --- a/paddle/parameter/ParallelParameter.h +++ b/paddle/parameter/ParallelParameter.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -47,17 +46,17 @@ const int UPDATE_TYPE_NUM = 32; * TrainRole denotes the role of current training, different roles have * different jobs. * - * control, major, minor are three kinds of role to support mutiple GPUs + * control, major, minor are three kinds of role to support mutiple GPUs * parallel SGD training. SM on GPU card has two groups, each group * consist of a major and a minor. * * @param single single GPU card single thread training. - * + * * * @param control current parameter updates via control role, * not participate in real training. control role is - * responsible for merging all major's gradient and - * update parameter value. + * responsible for merging all major's gradient and + * update parameter value. * * @param major major role paticipates in real training, when local * gradient is ready, merge its corresponding minor's @@ -83,7 +82,8 @@ typedef void (ParallelParameter::*UpdateFunction)(real learnRate); class ParallelParameter { public: - static ParallelParameterPtr create(TrainerRole role, ParameterPtr localParam, + static ParallelParameterPtr create(TrainerRole role, + ParameterPtr localParam, int asyncCount = 1); ParallelParameter(TrainerRole role, ParameterPtr localParam) { @@ -135,7 +135,7 @@ protected: }; /** - * this class is designed for multi-threading training. + * this class is designed for multi-threading training. * * "Synchronous" means multiple GPUs calculate 1/4 mini-Batch, * but will get only one gradient @@ -209,14 +209,14 @@ public: * When asynchronous training, update strategy including slave and master. * * slave: If in range asyncCount, adopting self-update method. - * If beyond asyncCount, waiting for master to update. + * If beyond asyncCount, waiting for master to update. */ void slaveUpdate(real learnRate); /** * When asynchronous training, update strategy including slave and master. * - * master: it only polls slaves, do not training data. + * master: it only polls slaves, do not training data. * If slave's gradient is ready, fetch it. * Update master's parameter, then copy it into * corresponding slave. 
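
The slave/master split described in the AsyncParameter comments above bounds how long a trainer may run on stale values: within asyncCount batches it self-updates while accumulating gradients, then hands the sum to the master and takes back the master's value. A single-threaded toy walk-through of that bookkeeping (the constants and the 0.1 learning rate are made up):

    #include <cstdio>
    #include <vector>

    int main() {
      const int asyncCount = 4;        // self-updates allowed before a sync
      double masterParam = 0.0;
      double localParam = 0.0, gradientAccum = 0.0;
      std::vector<double> grads{0.1, -0.2, 0.3, 0.05, 0.4};
      int pending = 0;
      for (double g : grads) {
        localParam -= 0.1 * g;         // slave self-update within the window
        gradientAccum += g;            // save the sum of gradients
        if (++pending == asyncCount) { // window exhausted: master takes over
          masterParam -= 0.1 * gradientAccum;
          localParam = masterParam;    // master copies value back to slave
          gradientAccum = 0.0;
          pending = 0;
        }
      }
      std::printf("master=%g local=%g\n", masterParam, localParam);
      return 0;
    }
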
@@ -227,7 +227,7 @@ public: private: /** * When asynchronous training, every aysnc trainer needs to - * accumulate a number of batch gradient. + * accumulate a number of batch gradient. * * gradientAccum_ is used to save the sum of gradients. */ diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp index 64d72ae7404f09903aea35cefd97e810b20c39a3..7e37bf225ba25e8bae269cf45b69ce418a54d1a3 100644 --- a/paddle/parameter/Parameter.cpp +++ b/paddle/parameter/Parameter.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "paddle/math/MathUtils.h" #include "AverageOptimizer.h" @@ -27,11 +26,13 @@ limitations under the License. */ #include "hl_gpu.h" #include "paddle/utils/CommandLineParser.h" -P_DEFINE_int32(enable_grad_share, (100 * 1024 * 1024), +P_DEFINE_int32(enable_grad_share, + (100 * 1024 * 1024), "threshold for enable gradient parameter share for batch " "multi-cpu training"); P_DEFINE_int32( - grad_share_block_num, 64, + grad_share_block_num, + 64, "block number of gradient parameter share for batch multi-cpu training"); namespace paddle { @@ -95,13 +96,12 @@ void Parameter::randomize(const VectorPtr& value, real initial_max = config.initial_mean() + config.initial_std(); value->uniform(initial_min, initial_max); VLOG(1) << config.name() << ": initial_min=" << initial_min - << ", initial_max=" << initial_max; + << ", initial_max=" << initial_max; } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) { /* Initialize the parameters randomly */ value->randnorm(config.initial_mean(), config.initial_std()); - VLOG(1) << config.name() - << ": initial_mean=" << config.initial_mean() - << ", initial_std=" << config.initial_std(); + VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean() + << ", initial_std=" << config.initial_std(); } else { LOG(FATAL) << "not supported initial_strategy: " << config.initial_strategy(); @@ -116,12 +116,18 @@ void Parameter::randomize() { if (config_.is_sparse()) { if (format_ == SPARSE_CSC) { sparseRand(intBufs_[PARAMETER_COLS]->getData(), - intBufs_[PARAMETER_ROWS]->getData(), config_.size(), - config_.dims(1) + 1, config_.dims(0), useGpu_); + intBufs_[PARAMETER_ROWS]->getData(), + config_.size(), + config_.dims(1) + 1, + config_.dims(0), + useGpu_); } else { sparseRand(intBufs_[PARAMETER_ROWS]->getData(), - intBufs_[PARAMETER_COLS]->getData(), config_.size(), - config_.dims(0) + 1, config_.dims(1), useGpu_); + intBufs_[PARAMETER_COLS]->getData(), + config_.size(), + config_.dims(0) + 1, + config_.dims(1), + useGpu_); } } setValueUpdated(); @@ -152,7 +158,7 @@ bool Parameter::isValueShared() { bool Parameter::isGradSparseUpdate() const { return !useGpu_ && !isStatic() && - (config_.sparse_update() || config_.sparse_remote_update()); + (config_.sparse_update() || config_.sparse_remote_update()); } void Parameter::setMat(ParameterType pType, int matType) { @@ -180,30 +186,42 @@ void Parameter::setMat(ParameterType pType, int matType) { CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize()); CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize()); } - mats_[pType] = Matrix::createSparseMatrix( - bufs_[pType]->getData(), intBufs_[PARAMETER_ROWS]->getData(), - intBufs_[PARAMETER_COLS]->getData(), height, width, - bufs_[pType]->getSize(), FLOAT_VALUE, format_, false, useGpu_); + mats_[pType] = + Matrix::createSparseMatrix(bufs_[pType]->getData(), + 
intBufs_[PARAMETER_ROWS]->getData(), + intBufs_[PARAMETER_COLS]->getData(), + height, + width, + bufs_[pType]->getSize(), + FLOAT_VALUE, + format_, + false, + useGpu_); } } else if (matType == MAT_NORMAL_SHARED) { CHECK_EQ(height * width, bufs_[pType]->getSize()); size_t blockNum = 0; CHECK(isGradShared(&blockNum)); mats_[pType] = std::make_shared( - blockNum, std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), - height, width); + blockNum, + std::dynamic_pointer_cast( + bufs_[pType]->getMemoryHandle()), + height, + width); } else if (matType == MAT_VALUE_SHARED) { CHECK_EQ(height * width, bufs_[pType]->getSize()); mats_[pType] = std::make_shared( std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), height, width); + bufs_[pType]->getMemoryHandle()), + height, + width); } else if (matType == MAT_SPARSE_ROW_IDS) { CHECK_EQ(height * width, bufs_[pType]->getSize()); mats_[pType] = std::make_shared( std::dynamic_pointer_cast( bufs_[pType]->getMemoryHandle()), - height, width); + height, + width); } else if (matType == MAT_SPARSE_ROW) { auto valueMat = std::dynamic_pointer_cast(mats_[PARAMETER_VALUE]); @@ -214,29 +232,31 @@ void Parameter::setMat(ParameterType pType, int matType) { << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW"; indexDict = valueMat->getIndexDictHandle(); } - auto mat = std::make_shared( - nullptr, height, width, - // grad share index with value - indexDict); + auto mat = + std::make_shared(nullptr, + height, + width, + // grad share index with value + indexDict); mats_[pType] = mat; } else if (matType == MAT_CACHE_ROW) { CHECK(isGradSparseUpdate()); - auto mat = std::make_shared( - height, width); + auto mat = std::make_shared(height, width); mats_[pType] = mat; } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || matType == MAT_SPARSE_ROW_PREFETCH) { auto mat = std::make_shared( bufs_[pType] ? 
std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()) : nullptr, - height, width, + bufs_[pType]->getMemoryHandle()) + : nullptr, + height, + width, nullptr, // indexDictHandle getGlobalSyncThreadPool()); mats_[pType] = mat; } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) { CHECK(isGradSparseUpdate()); - mats_[pType] = std::make_shared( - height, width); + mats_[pType] = std::make_shared(height, width); } else { LOG(FATAL) << "Unsupported mat type" << matType; } @@ -252,30 +272,43 @@ SparsePrefetchRowCpuMatrix* Parameter::getPrefetchMatrix() { } void Parameter::updateWithGradient(real learningRate) { - sgdUpdate(learningRate * config_.learning_rate(), config_.momentum(), - config_.decay_rate(), bufs_[PARAMETER_VALUE].get(), - bufs_[PARAMETER_GRADIENT].get(), bufs_[PARAMETER_MOMENTUM].get()); + sgdUpdate(learningRate * config_.learning_rate(), + config_.momentum(), + config_.decay_rate(), + bufs_[PARAMETER_VALUE].get(), + bufs_[PARAMETER_GRADIENT].get(), + bufs_[PARAMETER_MOMENTUM].get()); } -void Parameter::updateWithGradient(real learningRate, MatrixPtr gradMat, - IVectorPtr t0, int currentTime, bool fini) { +void Parameter::updateWithGradient(real learningRate, + MatrixPtr gradMat, + IVectorPtr t0, + int currentTime, + bool fini) { SparseRowCpuMatrix* sparseMat = dynamic_cast(gradMat.get()); CHECK(sparseMat); CHECK_EQ(config_.momentum(), 0.0f) << "not support momentum in sparse input sgd"; bool useL1 = (config_.decay_rate_l1() != 0.0f); - sparseMat->sgdUpdate(*bufs_[PARAMETER_VALUE], *t0, - learningRate * config_.learning_rate(), currentTime, + sparseMat->sgdUpdate(*bufs_[PARAMETER_VALUE], + *t0, + learningRate * config_.learning_rate(), + currentTime, useL1 ? config_.decay_rate_l1() : config_.decay_rate(), - useL1, fini); + useL1, + fini); } -void Parameter::updateWithGradient(real learningRate, VectorPtr gradVec, +void Parameter::updateWithGradient(real learningRate, + VectorPtr gradVec, bool normalUpdate) { if (normalUpdate) { - sgdUpdate(learningRate * config_.learning_rate(), config_.momentum(), - config_.decay_rate(), bufs_[PARAMETER_VALUE].get(), gradVec.get(), + sgdUpdate(learningRate * config_.learning_rate(), + config_.momentum(), + config_.decay_rate(), + bufs_[PARAMETER_VALUE].get(), + gradVec.get(), bufs_[PARAMETER_MOMENTUM].get()); } else { size_t size = gradVec->getSize(); @@ -361,7 +394,7 @@ bool Parameter::load(const std::string& filename) { return true; } LOG(FATAL) << "unsupported load_missing_parameter_strategy: " - << FLAGS_load_missing_parameter_strategy; + << FLAGS_load_missing_parameter_strategy; return false; } return load(fs); @@ -372,8 +405,8 @@ bool Parameter::load(std::istream& s) { Header header; CHECK(s.read(reinterpret_cast(&header), sizeof(header))) << "Fail to read parameter " << getName(); - CHECK_EQ(header.version, kFormatVersion) - << "Incorrect format version: " << header.version; + CHECK_EQ(header.version, kFormatVersion) << "Incorrect format version: " + << header.version; CHECK_EQ(header.size, getSize()) << "The size (" << header.size << ") in the file does not match the size " << "(" << getSize() << ") of the parameter: " << getName(); @@ -382,7 +415,7 @@ bool Parameter::load(std::istream& s) { CHECK(s.read(reinterpret_cast(vec.getData()), header.size * sizeof(real))); - auto & tmp = *bufs_[PARAMETER_VALUE].get(); + auto& tmp = *bufs_[PARAMETER_VALUE].get(); if (typeid(tmp) == typeid(GpuVector)) { bufs_[PARAMETER_VALUE]->copyFrom(vec); } @@ -393,7 +426,11 @@ bool Parameter::load(std::istream& s) { auto height = config_.dims(0); auto width 
= config_.dims(1); auto mat = Matrix::create(vec.getData(), height, width); - CpuSparseMatrix sparseMat(height, width, 0, FLOAT_VALUE, format_, + CpuSparseMatrix sparseMat(height, + width, + 0, + FLOAT_VALUE, + format_, /*trans*/ false); sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT); auto nnz = sparseMat.getElementCnt(); @@ -423,11 +460,11 @@ bool Parameter::load(std::istream& s) { s.read(reinterpret_cast(rows.getData()), rowSize * sizeof(int))); CHECK( s.read(reinterpret_cast(cols.getData()), colSize * sizeof(int))); - auto & paramRows = *intBufs_[PARAMETER_ROWS].get(); + auto& paramRows = *intBufs_[PARAMETER_ROWS].get(); if (typeid(paramRows) == typeid(GpuIVector)) { intBufs_[PARAMETER_ROWS]->copyFrom(rows); } - auto & paramCols = *intBufs_[PARAMETER_COLS].get(); + auto& paramCols = *intBufs_[PARAMETER_COLS].get(); if (typeid(paramCols) == typeid(GpuIVector)) { intBufs_[PARAMETER_COLS]->copyFrom(cols); } @@ -457,8 +494,8 @@ void Parameter::exec(ExecFunc func) { func(this->getBufs()); } else { // multi thread VectorPtr* vecs = Parameter::getTlsTempBufs(); - auto interval = calcSplitArrayInterval(this->getSize(), (size_t)tid, - numThreads, 8LU /*for avx*/); + auto interval = calcSplitArrayInterval( + this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); for (size_t i = 0; i < (size_t)NUM_PARAMETER_TYPES; ++i) { if (bufs_[i]) { vecs[i]->subVecFrom(*bufs_[i], interval); diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h index ff251fe89f9f885c361b6c1ae7dde0ae57695e47..1c159d669a6a0f7b56c377e0b1cfa35b3fb75d53 100644 --- a/paddle/parameter/Parameter.h +++ b/paddle/parameter/Parameter.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -52,7 +51,6 @@ struct Segment { int64_t beginPos; // beginning position in the local value or grad buffer }; - class Parameter; typedef std::shared_ptr ParameterPtr; @@ -129,8 +127,7 @@ public: if (config_.dims_size() == 2) { if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED || matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || - matType == MAT_VALUE_SHARED || - matType == MAT_SPARSE_ROW_IDS) { + matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) { bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); bufs_[type]->zeroMem(); } else { @@ -161,7 +158,8 @@ public: } } - void enableSharedType(ParameterType type, VectorPtr vec, + void enableSharedType(ParameterType type, + VectorPtr vec, MatrixPtr mat = nullptr) { if (!bufs_[type] && !mats_[type]) { bufs_[type] = vec; @@ -235,13 +233,17 @@ public: * * @see SparseRowCpuMatrix::sgdUpdate for more information. 
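
Parameter::load above trusts the payload only after validating a fixed header (a format version plus the value count, checked against kFormatVersion and getSize()). A round-trip sketch of that kind of layout; the Header fields here are illustrative and not byte-compatible with Paddle's actual on-disk format:

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <sstream>
    #include <vector>

    struct Header {
      uint32_t version;
      uint64_t size;  // number of real values that follow
    };

    int main() {
      // Write a tiny parameter "file" into a string stream.
      std::vector<float> values{0.5f, -1.5f, 2.0f};
      Header h{1, values.size()};
      std::stringstream s;
      s.write(reinterpret_cast<const char*>(&h), sizeof(h));
      s.write(reinterpret_cast<const char*>(values.data()),
              values.size() * sizeof(float));

      // Read it back the way the loader does: header first, then check the
      // version and size before trusting the payload.
      Header in;
      assert(s.read(reinterpret_cast<char*>(&in), sizeof(in)));
      assert(in.version == 1 && in.size == values.size());
      std::vector<float> loaded(in.size);
      assert(s.read(reinterpret_cast<char*>(loaded.data()),
                    in.size * sizeof(float)));
      assert(std::memcmp(loaded.data(), values.data(),
                         in.size * sizeof(float)) == 0);
      return 0;
    }
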
*/ - void updateWithGradient(real learningRate, MatrixPtr gradMat, IVectorPtr t0, - int currentTime, bool fini = false); + void updateWithGradient(real learningRate, + MatrixPtr gradMat, + IVectorPtr t0, + int currentTime, + bool fini = false); /** * This function is used to calculate multiple gpus, but only as a candidate */ - void updateWithGradient(real learningRate, VectorPtr grad, + void updateWithGradient(real learningRate, + VectorPtr grad, bool normalUpdate = true); /** diff --git a/paddle/parameter/ParameterOptimizer.cpp b/paddle/parameter/ParameterOptimizer.cpp index 164b50c4d279102ce14d82b102a74e56dfc5b2fe..2a71d6aee4dae556956616bd317156cfaf8732f0 100644 --- a/paddle/parameter/ParameterOptimizer.cpp +++ b/paddle/parameter/ParameterOptimizer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/parameter/ParameterOptimizer.h index 8c766743401dddc6468e6db22164843e286e6ad7..21a148333c2fd3aa127c5b3bb8160784864f4cce 100644 --- a/paddle/parameter/ParameterOptimizer.h +++ b/paddle/parameter/ParameterOptimizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "LearningRateScheduler.h" @@ -32,8 +31,8 @@ namespace paddle { */ class ParameterOptimizer { public: - typedef std::function + typedef std::function TraverseCallback; public: @@ -69,35 +68,35 @@ public: (void)numSamplesProcessed; } - /** - * following hooks useful for sparse update, - * because the traversal in block costs. - * called by Trainer after update and before finishBatch - * e.g. Trainer call like this: - * - * @code - * startBatch(); - * if (dense) { - * update(blockVec); - * } else {//sparse - * for (row : rows_in_block) {update(rowVec)} - * } - * auto callback = needSpecialTraversal(); - * if (callback) { - * // do traverse, maybe multi-thread - * if (dense) { - * callback(); - * } else {//sparse - * for (row : all_rows_in_block) {callback();} - * } - * } - * finishBatch(); - * @endcode - * - * @return callback if need traverse, - * else return nullptr. - * It should be no state change. - */ + /** + * following hooks useful for sparse update, + * because the traversal in block costs. + * called by Trainer after update and before finishBatch + * e.g. Trainer call like this: + * + * @code + * startBatch(); + * if (dense) { + * update(blockVec); + * } else {//sparse + * for (row : rows_in_block) {update(rowVec)} + * } + * auto callback = needSpecialTraversal(); + * if (callback) { + * // do traverse, maybe multi-thread + * if (dense) { + * callback(); + * } else {//sparse + * for (row : all_rows_in_block) {callback();} + * } + * } + * finishBatch(); + * @endcode + * + * @return callback if need traverse, + * else return nullptr. + * It should be no state change. + */ virtual TraverseCallback needSpecialTraversal( const ParameterConfig& config) const { return nullptr; @@ -112,47 +111,48 @@ public: * with its gradient in PARAMETER_GRADIENT. sparseId is row id, * when sparseId set, update is sparse, each time one row. 
*/ - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId = -1LU) const = 0; - /** - * following hooks catch up with current time for sparse update, - * In the beginning, call startCatchUpWith() and check return. - * In the end, call finishCatchUpWith() to finish state. - * callback do the actual works, can call many times for sparse data. - * e.g. Trainer call like this: - * - * @code - * auto callback = startCatchUpWith(); - * if (callback) { - * // do catch up with, maybe multi-thread - * if (dense) { - * callback(); - * } else {//sparse - * for (row : rows_in_block) {callback();} - * } - * // finish catch up with, main thread - * finishCatchUpWith(); - * } - * @endcode - * - * @return callback if need catch up with, - * else return nullptr. - * It should be no state change. - */ + /** + * following hooks catch up with current time for sparse update, + * In the beginning, call startCatchUpWith() and check return. + * In the end, call finishCatchUpWith() to finish state. + * callback does the actual work, can be called many times for sparse data. + * e.g. Trainer call like this: + * + * @code + * auto callback = startCatchUpWith(); + * if (callback) { + * // do catch up with, maybe multi-thread + * if (dense) { + * callback(); + * } else {//sparse + * for (row : rows_in_block) {callback();} + * } + * // finish catch up with, main thread + * finishCatchUpWith(); + * } + * @endcode + * + * @return callback if need catch up with, + * else return nullptr. + * It should be no state change. + */ virtual TraverseCallback startCatchUpWith() const { return nullptr; } virtual void finishCatchUpWith() {} - /** - * following two hooks used by averager, - * apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY). - * - * restore() will restore orginal value if it apply to PARAMETER_VALUE. - * Caller must ensure it's catched up with current time before apply. - * - * Use returned callback same way as callback returned by - * ParameterOptimizer::needSpecialTraversal() - */ + /** + * following two hooks used by averager, + * apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY). + * + * restore() will restore the original value if it applies to PARAMETER_VALUE. + * Caller must ensure it has caught up with the current time before applying. + * + * Use returned callback same way as callback returned by + * ParameterOptimizer::needSpecialTraversal() + */ virtual TraverseCallback apply() { return nullptr; } virtual TraverseCallback restore() { return nullptr; } @@ -180,7 +180,8 @@ protected: static TraverseCallback composeCallbacks( const TraverseCallbackVec& callbacks) { if (callbacks.size() > 1LU) { - return [callbacks](const VectorPtr vecs[], const ParameterConfig& config, + return [callbacks](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { for (auto callback : callbacks) { callback(vecs, config, sparseId); diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp index 679e3bf89b517a91cdd1af6bdad4e199418485a5..510ec5bf48a7576f646ecf01b02c5047c637afeb 100644 --- a/paddle/parameter/ParameterUpdateFunctions.cpp +++ b/paddle/parameter/ParameterUpdateFunctions.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "paddle/utils/Logging.h" #ifdef __AVX__ #include @@ -23,8 +22,13 @@ limitations under the License. */ namespace paddle { -void sgdUpdateCpu(real learningRate, real momentum, real decayRate, size_t size, - real* value, const real* grad, real* momentumVec) { +void sgdUpdateCpu(real learningRate, + real momentum, + real decayRate, + size_t size, + real* value, + const real* grad, + real* momentumVec) { decayRate *= learningRate; for (size_t i = 0; i < size; ++i) { momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] - @@ -33,8 +37,12 @@ void sgdUpdateCpu(real learningRate, real momentum, real decayRate, size_t size, } } -void sgdUpdate(real learningRate, real momentum, real decayRate, Vector* value, - Vector* grad, Vector* momentumVec) { +void sgdUpdate(real learningRate, + real momentum, + real decayRate, + Vector* value, + Vector* grad, + Vector* momentumVec) { size_t size = value->getSize(); real* val = value->getData(); real* grd = grad->getData(); @@ -48,8 +56,12 @@ void sgdUpdate(real learningRate, real momentum, real decayRate, Vector* value, } } -void sgdUpdateAvx(float learningRate, float momentum, float decayRate, - size_t size, float* value, const float* _grad, +void sgdUpdateAvx(float learningRate, + float momentum, + float decayRate, + size_t size, + float* value, + const float* _grad, float* momentumVec) { #ifdef __AVX__ float* grad = const_cast(_grad); // the gradient is not modified @@ -86,18 +98,36 @@ void sgdUpdateAvx(float learningRate, float momentum, float decayRate, std::function loopFun; learningRate *= -1; - lr = _mm256_set_ps(learningRate, learningRate, learningRate, learningRate, - learningRate, learningRate, learningRate, learningRate); + lr = _mm256_set_ps(learningRate, + learningRate, + learningRate, + learningRate, + learningRate, + learningRate, + learningRate, + learningRate); if (0 != momentum) { - mom = _mm256_set_ps(momentum, momentum, momentum, momentum, momentum, - momentum, momentum, momentum); + mom = _mm256_set_ps(momentum, + momentum, + momentum, + momentum, + momentum, + momentum, + momentum, + momentum); } decayRate *= learningRate; if (0 != decayRate) { - dr = _mm256_set_ps(decayRate, decayRate, decayRate, decayRate, decayRate, - decayRate, decayRate, decayRate); + dr = _mm256_set_ps(decayRate, + decayRate, + decayRate, + decayRate, + decayRate, + decayRate, + decayRate, + decayRate); } auto gradMulFun = [&](void) { diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h index 59eb25656e51c097b2d957902573437894ab53f7..2d98030bd2389469fbd32940af6162203557620c 100644 --- a/paddle/parameter/ParameterUpdateFunctions.h +++ b/paddle/parameter/ParameterUpdateFunctions.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/TypeDefs.h" @@ -31,14 +30,27 @@ namespace paddle { * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary * computation. 
*/ -void sgdUpdate(real learningRate, real momentum, real decayRate, Vector* value, - Vector* grad, Vector* momentumVec); - -void sgdUpdateCpu(real learningRate, real momentum, real decayRate, size_t size, - real* value, const real* grad, real* momentumVec); - -void sgdUpdateAvx(float learningRate, float momentum, float decayRate, - size_t size, float* value, const float* grad, +void sgdUpdate(real learningRate, + real momentum, + real decayRate, + Vector* value, + Vector* grad, + Vector* momentumVec); + +void sgdUpdateCpu(real learningRate, + real momentum, + real decayRate, + size_t size, + real* value, + const real* grad, + real* momentumVec); + +void sgdUpdateAvx(float learningRate, + float momentum, + float decayRate, + size_t size, + float* value, + const float* grad, float* momentumVec); } // namespace paddle diff --git a/paddle/parameter/ParameterUpdaterBase.cpp b/paddle/parameter/ParameterUpdaterBase.cpp index e3f1d54037b305972248b7b30065b0ae5eb4b357..e706742053fc49df9c99081774f425622941e38c 100644 --- a/paddle/parameter/ParameterUpdaterBase.cpp +++ b/paddle/parameter/ParameterUpdaterBase.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "paddle/utils/Logging.h" #include "ParameterUpdaterBase.h" diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h index f16e183515853e01eacda39977c9a7e127b3824c..ffd2980261530382ee09f2c98e354d0e56fd8038 100644 --- a/paddle/parameter/ParameterUpdaterBase.h +++ b/paddle/parameter/ParameterUpdaterBase.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Parameter.h" diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index 02a352920cf38120938d659dac4258a48643de4d..7d85a32c0cf527d39c252c2021b7bad0eb58753d 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ParameterUpdaterHook.h" #include @@ -155,7 +154,8 @@ private: std::hash intHasher_; }; -static WeakKVCache, IParameterUpdaterHook, +static WeakKVCache, + IParameterUpdaterHook, StringIntPairHasher> g_hookCache_; /** diff --git a/paddle/parameter/ParameterUpdaterHook.h b/paddle/parameter/ParameterUpdaterHook.h index 1c132a733866b8083632a64b1b47ff2b35b2ee69..553282bcaaa2e90910eaafbe2e03a4afadf04a85 100644 --- a/paddle/parameter/ParameterUpdaterHook.h +++ b/paddle/parameter/ParameterUpdaterHook.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include diff --git a/paddle/parameter/Regularizer.cpp b/paddle/parameter/Regularizer.cpp index bc7de3ca048dbe094e1f53c024e705425908cdfb..a9bddc1596656ba36d6c445781f42991684f0c52 100644 --- a/paddle/parameter/Regularizer.cpp +++ b/paddle/parameter/Regularizer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "paddle/utils/Flags.h" #include "Regularizer.h" @@ -21,8 +20,9 @@ namespace paddle { Regularizer* Regularizer::get(const std::vector& types, const ParameterConfig& paraConfig) { - bool useLearningRateVec = std::find(types.begin(), types.end(), - PARAMETER_LEARNING_RATE) != types.end(); + bool useLearningRateVec = + std::find(types.begin(), types.end(), PARAMETER_LEARNING_RATE) != + types.end(); if (paraConfig.decay_rate_l1() > 0.0f && paraConfig.decay_rate() > 0.0f) { // use L1 and L2 if (useLearningRateVec) { diff --git a/paddle/parameter/Regularizer.h b/paddle/parameter/Regularizer.h index 8c9eb49ab611e8aea7b88f008fe287cbdb17a008..5baaccc00db5f858272dbfa6751647915bfa6e3c 100644 --- a/paddle/parameter/Regularizer.h +++ b/paddle/parameter/Regularizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "ParameterUpdaterBase.h" @@ -22,7 +21,8 @@ namespace paddle { // Regularizer function for parameter, e.g. L1/L2 class Regularizer { public: - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, real learningRate, // learningrate from optimizer int t0, // last occurence time int t) const = 0; // current time @@ -34,8 +34,11 @@ public: // L1 Regularizer, |w|_1 class L1Regularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(), paraConfig.decay_rate_l1() * (t - t0)); } @@ -43,8 +46,11 @@ class L1Regularizer : public Regularizer { // L1 Lr Regularizer class L1LrRegularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE], learningRate * paraConfig.learning_rate(), paraConfig.decay_rate_l1() * (t - t0)); @@ -53,8 +59,11 @@ class L1LrRegularizer : public Regularizer { // L2 Regularizer, |w|_2^2 class L2Regularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(), paraConfig.decay_rate() * (t - t0)); } @@ -62,8 +71,11 @@ class L2Regularizer : public Regularizer { // L2 Lr Regularizer class L2LrRegularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE], learningRate * paraConfig.learning_rate(), 
paraConfig.decay_rate() * (t - t0)); @@ -72,8 +84,11 @@ class L2LrRegularizer : public Regularizer { // L1 + L2 Regularizer, |w|_1 + |w|_2^2 class L1L2Regularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(), paraConfig.decay_rate_l1() * (t - t0)); vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(), @@ -83,8 +98,11 @@ class L1L2Regularizer : public Regularizer { // L1 + L2 Lr Regularizer class L1L2LrRegularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE], learningRate * paraConfig.learning_rate(), paraConfig.decay_rate_l1() * (t - t0)); diff --git a/paddle/parameter/Weight.cpp b/paddle/parameter/Weight.cpp index ed02355c01a587da36da038be9e3d6eaf559c884..c138010607412fa257a6c7360a27d855197f88ad 100644 --- a/paddle/parameter/Weight.cpp +++ b/paddle/parameter/Weight.cpp @@ -60,14 +60,20 @@ Weight::Weight(size_t height, size_t width, ParameterPtr param, size_t offset) { // weight_ if (vPtr) { - weight_ = Matrix::create(vPtr->getData() + offset, height, width, - /* trans */ false, param->useGpu()); + weight_ = Matrix::create(vPtr->getData() + offset, + height, + width, + /* trans */ false, + param->useGpu()); } // weightGrad if (gPtr) { - weightGrad_ = Matrix::create(gPtr->getData() + offset, height, width, - /* trans */ false, param->useGpu()); + weightGrad_ = Matrix::create(gPtr->getData() + offset, + height, + width, + /* trans */ false, + param->useGpu()); } parameter_ = param; diff --git a/paddle/parameter/tests/CMakeLists.txt b/paddle/parameter/tests/CMakeLists.txt index 177fb2fdfc045e1c68cc56e9f7654cbda5f46e25..cab264db8e5000e8eb61830ec07e9f590c103119 100644 --- a/paddle/parameter/tests/CMakeLists.txt +++ b/paddle/parameter/tests/CMakeLists.txt @@ -1 +1 @@ -add_simple_unittest(test_common) \ No newline at end of file +add_simple_unittest(test_common) diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp index 1a22abf7cf80157039f6147293e7648d654e45f7..1a64fe335257a3107be03cfd333cb483c5ab452d 100644 --- a/paddle/parameter/tests/test_common.cpp +++ b/paddle/parameter/tests/test_common.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include #include @@ -38,8 +37,8 @@ protected: CommonTest() : testStat_("test") {} virtual ~CommonTest() {} virtual void SetUp() { - const size_t buffSize[] = {100, 128, 500, 1024, - 4096, 10240, 102400, 1000000}; + const size_t buffSize[] = { + 100, 128, 500, 1024, 4096, 10240, 102400, 1000000}; sizeVec_.resize(8); memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t)); valueUint_.resize(4); @@ -54,8 +53,10 @@ protected: learningRate_ = 1.0; } - void test_sgdUpadate(real* gradientBuffer, real* valueBuffer, - real* momentumBuffer, size_t size); + void test_sgdUpadate(real* gradientBuffer, + real* valueBuffer, + real* momentumBuffer, + size_t size); virtual void TreaDown() { LOG(INFO) << "All Test Finished."; } @@ -66,8 +67,10 @@ protected: StatSet testStat_; }; -void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer, - real* momentumBuffer, size_t size) { +void CommonTest::test_sgdUpadate(real* gradientBuffer, + real* valueBuffer, + real* momentumBuffer, + size_t size) { // sgdUpdateAvx has no double version yet #if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE) real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0; @@ -85,8 +88,13 @@ void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer, gettimeofday(&t, NULL); } REGISTER_TIMER("avxTimer", 0); - sgdUpdateAvx(learningRate_, arg.first, arg.second, size, valueBuffer, - gradientBuffer, momentumBuffer); + sgdUpdateAvx(learningRate_, + arg.first, + arg.second, + size, + valueBuffer, + gradientBuffer, + momentumBuffer); } for (size_t i = 0; i < size; i++) { valueSum1 += valueBuffer[i]; @@ -98,8 +106,13 @@ void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer, } { REGISTER_TIMER("cpuTimer", 0); - sgdUpdateCpu(learningRate_, arg.first, arg.second, size, valueTmp, - gradTmp, momentumTmp); + sgdUpdateCpu(learningRate_, + arg.first, + arg.second, + size, + valueTmp, + gradTmp, + momentumTmp); } for (size_t i = 0; i < size; i++) { valueSum2 += valueTmp[i]; @@ -126,10 +139,10 @@ TEST_F(CommonTest, sgdUpdate) { for (auto& size : sizeVec_) { real *gradientBuffer, *valueBuffer, *momentumBuffer; CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size), - 0); + 0); CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0); CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size), - 0); + 0); for (size_t i = 0; i < size; i++) { gradientBuffer[i] = 1.0; @@ -141,7 +154,8 @@ TEST_F(CommonTest, sgdUpdate) { << "-------------------------"; test_sgdUpadate(&gradientBuffer[alignHeader[i]], &valueBuffer[alignHeader[i]], - &momentumBuffer[alignHeader[i]], size - alignHeader[i]); + &momentumBuffer[alignHeader[i]], + size - alignHeader[i]); } free(gradientBuffer); free(valueBuffer); @@ -173,16 +187,16 @@ TEST_F(CommonTest, barrierStat) { SyncThreadPool pool(threadNum); -#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...) \ - pool.exec([&](int tid, size_t numThreads) { \ - struct timeval time; \ - gettimeofday(&time, nullptr); \ - uint64_t usec = timeToMicroSecond(time); \ - std::srand(usec); \ - auto value = std::rand() % 100000; \ - usleep(value); \ - REGISTER_SLOW_NODES_PROBE(globalStat, statName, numConnThreads, tid, \ - __VA_ARGS__); \ +#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...) 
\ + pool.exec([&](int tid, size_t numThreads) { \ + struct timeval time; \ + gettimeofday(&time, nullptr); \ + uint64_t usec = timeToMicroSecond(time); \ + std::srand(usec); \ + auto value = std::rand() % 100000; \ + usleep(value); \ + REGISTER_SLOW_NODES_PROBE( \ + globalStat, statName, numConnThreads, tid, __VA_ARGS__); \ }); for (auto i = 0; i < 10; i++) { @@ -202,11 +216,11 @@ TEST_F(CommonTest, barrierStat) { globalStat.reset(); // use it to test accurate barrier gap -#define TEST_BARRIER(statName, numConnThreads, ...) \ - pool.exec([&](int tid, size_t numThreads) { \ - usleep(tid * 10000); \ - REGISTER_SLOW_NODES_PROBE(globalStat, statName, numConnThreads, tid, \ - __VA_ARGS__); \ +#define TEST_BARRIER(statName, numConnThreads, ...) \ + pool.exec([&](int tid, size_t numThreads) { \ + usleep(tid * 10000); \ + REGISTER_SLOW_NODES_PROBE( \ + globalStat, statName, numConnThreads, tid, __VA_ARGS__); \ }); for (auto i = 0; i < 10; i++) { diff --git a/paddle/pserver/BaseClient.cpp b/paddle/pserver/BaseClient.cpp index df4daca9bfaf888ccaacc73d9295d6d973dcb9fb..ff83970ab1b11f74ceb4009cc8f469f7b54a7272 100644 --- a/paddle/pserver/BaseClient.cpp +++ b/paddle/pserver/BaseClient.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "paddle/utils/Stat.h" diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h index f1c4c9eb375420edaf895c3ddea7ac06f7b225bd..3a501172b70a91e02ecda0f9f78e0c025ac67936 100644 --- a/paddle/pserver/BaseClient.h +++ b/paddle/pserver/BaseClient.h @@ -62,7 +62,10 @@ public: /// send data to server, support only synchronize template - void putData(int clientId, SendDataType type, DataType* datas, size_t size, + void putData(int clientId, + SendDataType type, + DataType* datas, + size_t size, DataUpdateMode mode) { synchronize(SYNC_DATA); sendData(clientId, type, mode, datas, size); @@ -71,16 +74,23 @@ public: } template - void putOwnData(int clientId, SendDataType type, DataType* datas, + void putOwnData(int clientId, + SendDataType type, + DataType* datas, size_t size) { putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN); } template - void getAllData(int clientId, SendDataType type, DataType* datas, + void getAllData(int clientId, + SendDataType type, + DataType* datas, size_t size) { - sendData(clientId, type, DATA_UPDATE_MODE_GET_ALL, - reinterpret_cast(NULL), 0); + sendData(clientId, + type, + DATA_UPDATE_MODE_GET_ALL, + reinterpret_cast(NULL), + 0); recvData(); size_t dataOffset = 0; for (auto& recvMem : recvDataMems_) { @@ -100,7 +110,10 @@ public: * The results are saved in recvBuf of rootId client */ template - void reduce(DataType* sendBuf, DataType* recvBuf, size_t size, int clientId, + void reduce(DataType* sendBuf, + DataType* recvBuf, + size_t size, + int clientId, int rootId) { putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size); if (rootId == clientId) { @@ -147,8 +160,12 @@ protected: void finishThreads(); template - void prepareData(int clientId, SendDataType type, DataUpdateMode updateMode, - DataType* datas, size_t size, SendJob* sendJob) { + void prepareData(int clientId, + SendDataType type, + DataUpdateMode updateMode, + DataType* datas, + size_t size, + SendJob* sendJob) { sendJob->parallelDataRequests.resize(serviceNum_); sendJob->parallelInputIovs.resize(serviceNum_); for (int i = 0; i < serviceNum_; ++i) { @@ -192,8 +209,11 @@ protected: * 
synchronization in metric learning. */ template - void sendData(int clientId, SendDataType type, DataUpdateMode updateMode, - DataType* datas, size_t size) { + void sendData(int clientId, + SendDataType type, + DataUpdateMode updateMode, + DataType* datas, + size_t size) { SendJobPtr sendJob = std::make_shared(); prepareData(clientId, type, updateMode, datas, size, sendJob.get()); for (int i = 0; i < threadNum_; ++i) { @@ -210,7 +230,8 @@ protected: /// send request, and recv responses template - void multiCall(const char* funcName, const ProtoIn& request, + void multiCall(const char* funcName, + const ProtoIn& request, std::vector* responses) { responses->resize(clients_.size()); size_t numClients = clients_.size(); diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index ff2875fc702ffbb0675f21433138961c19ff0b86..1830170a163fa47114c75a2a88a731ea31060142 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include @@ -32,19 +31,22 @@ limitations under the License. */ #include "RDMANetwork.h" /// quick ack can reduce the latency of small message -P_DEFINE_bool(small_messages, false, +P_DEFINE_bool(small_messages, + false, "if message size is small, recommend set it True to enable quick " "ack and no delay"); /// reasonable sock_send_buf_size can control the traffic injected into switch /// network. Injecting too many data into traffic could cause packets loss which /// cause long latency and degrade the efficiency of communication. -P_DEFINE_int32(sock_send_buf_size, 1024 * 1024 * 40, +P_DEFINE_int32(sock_send_buf_size, + 1024 * 1024 * 40, "restrict sock send buff size, can reduce network congestion if " "set carefully"); /// reasonable size can hold bursted packets and reduce packets loss -P_DEFINE_int32(sock_recv_buf_size, 1024 * 1024 * 40, +P_DEFINE_int32(sock_recv_buf_size, + 1024 * 1024 * 40, "restrict sock recv buff size"); namespace paddle { @@ -174,7 +176,8 @@ void SocketServer::tcpServer() { if (!addr_.empty()) { server = gethostbyname(addr_.c_str()); PCHECK(server) << "ERROR, no such host: " << addr_; - bcopy((char *)server->h_addr, (char *)&serv_addr.sin_addr.s_addr, + bcopy((char *)server->h_addr, + (char *)&serv_addr.sin_addr.s_addr, server->h_length); } else { serv_addr.sin_addr.s_addr = INADDR_ANY; @@ -347,29 +350,32 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { struct sockaddr_in serv_addr; struct hostent *server; - int errRet; // temp for gethostbyname_r + int errRet; // temp for gethostbyname_r /// Create a socket point int sockfd = socket(AF_INET, SOCK_STREAM, 0); PCHECK(sockfd >= 0) << "ERROR opening socket"; #if defined(__OSX__) || defined(__APPLE__) - server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); - CHECK_NE(HOST_NOT_FOUND, errRet) - << "ERROR, no such host: " << serverAddr << " ret = " << errRet; - CHECK(server) << "getipnodebyname error!"; + server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); + CHECK_NE(HOST_NOT_FOUND, errRet) << "ERROR, no such host: " << serverAddr + << " ret = " << errRet; + CHECK(server) << "getipnodebyname error!"; #else - struct hostent hostinfo; - char buf[1024]; // temp for gethostbyname_r - CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf), - &server, &errRet)) - 
<< "ERROR, no such host: " << serverAddr << " ret = " << errRet; - CHECK(server) << "gethostbyname_r error!"; + struct hostent hostinfo; + char buf[1024]; // temp for gethostbyname_r + CHECK_EQ( + 0, + gethostbyname_r( + serverAddr.c_str(), &hostinfo, buf, sizeof(buf), &server, &errRet)) + << "ERROR, no such host: " << serverAddr << " ret = " << errRet; + CHECK(server) << "gethostbyname_r error!"; #endif bzero((char *)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; - bcopy((char *)server->h_addr, (char *)&serv_addr.sin_addr.s_addr, + bcopy((char *)server->h_addr, + (char *)&serv_addr.sin_addr.s_addr, server->h_length); serv_addr.sin_port = htons(serverPort); @@ -421,7 +427,8 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) { * * @note responsible for building one connection to specified pserver port */ -SocketClient::SocketClient(const std::string &serverAddr, int serverPort, +SocketClient::SocketClient(const std::string &serverAddr, + int serverPort, enum ChannelType channelType) { if (channelType == F_RDMA) RdmaClient(serverAddr, serverPort); diff --git a/paddle/pserver/LightNetwork.h b/paddle/pserver/LightNetwork.h index 0d6d6bf6b7c6d3b7123f9ce05f50ad45bfd5ac60..b7d7bc7902abb18aae03fc4d8a3972f0298199fe 100644 --- a/paddle/pserver/LightNetwork.h +++ b/paddle/pserver/LightNetwork.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "SocketChannel.h" @@ -39,9 +38,9 @@ class SocketWorker; * in child class of socketserver. */ class SocketServer : public Thread { - // rdmaCpu controls the cpu affinity of RDMA server daemon, - // which could benifit performance. rdmaCpu = -1 means TCP - // is used instead of RDMA transport. + // rdmaCpu controls the cpu affinity of RDMA server daemon, + // which could benifit performance. rdmaCpu = -1 means TCP + // is used instead of RDMA transport. public: SocketServer(const std::string& addr, int port, int rdmaCpu); ~SocketServer(); @@ -91,7 +90,6 @@ protected: bool stopping_; }; - /** * @brief class for holding one connection from one trainer * @@ -165,7 +163,8 @@ private: */ class SocketClient { public: - SocketClient(const std::string& serverAddr, int serverPort, + SocketClient(const std::string& serverAddr, + int serverPort, enum ChannelType channelType); SocketChannel* getChannel() { return channel_.get(); } diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp index d0e5352c828d197d6854ef19e6310dc63913846d..28cc0ae2dd36273397015e618f6e14ea43398964 100644 --- a/paddle/pserver/ParameterClient2.cpp +++ b/paddle/pserver/ParameterClient2.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include #include "ParameterClient2.h" @@ -27,7 +26,8 @@ P_DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send"); namespace paddle { template -void copyToRepeatedField(google::protobuf::RepeatedField* dest, const T* src, +void copyToRepeatedField(google::protobuf::RepeatedField* dest, + const T* src, size_t size) { dest->Clear(); dest->Reserve(size); @@ -46,11 +46,10 @@ void copyToRepeatedField(const std::vector& src, ParameterClient2::ParameterClient2(bool separate, int port, int numPorts) : BaseClient(separate, numPorts), port_(port) { #ifndef PADDLE_DISABLE_TIMER - forwardbackwordTime_ = 0; + forwardbackwordTime_ = 0; #endif } - int ParameterClient2::calcParameterBlockSize( const std::vector& parameters, size_t serviceNum) { size_t totalSize = 0; @@ -89,8 +88,8 @@ bool ParameterClient2::init(const std::vector& parameters) { for (auto& para : parameters) { /// set block size for each parameter para->getConfig().set_parameter_block_size( - para->getConfig().sparse_remote_update() ? - para->getConfig().dims(1) : denseBlockSize); + para->getConfig().sparse_remote_update() ? para->getConfig().dims(1) + : denseBlockSize); } for (auto& para : parameters) { @@ -107,7 +106,7 @@ bool ParameterClient2::init(const std::vector& parameters) { allSegments_.push_back(segments); if (para->getConfig().sparse_remote_update()) { CHECK_EQ(para->getConfig().parameter_block_size(), - para->getConfig().dims(1)) + para->getConfig().dims(1)) << "For sparse remote update parameter," << " block size is the width of each row."; } @@ -152,7 +151,8 @@ void ParameterClient2::destroy() { clients_.clear(); } -void ParameterClient2::sendParallel(int tid, size_t numThreads, +void ParameterClient2::sendParallel(int tid, + size_t numThreads, ParameterType recvParameterType) { int numMyClients = divup(serviceNum_ - tid, numThreads); @@ -163,7 +163,8 @@ void ParameterClient2::sendParallel(int tid, size_t numThreads, /// at the same time so that they will not flood data to the same /// pserver. 
i = calcClientId(i, serviceNum_); - clients_[i].send("sendParameter", sendJob_.parallelRequests[i], + clients_[i].send("sendParameter", + sendJob_.parallelRequests[i], sendJob_.parallelInputIovs[i]); /// clear large structure @@ -204,10 +205,15 @@ void ParameterClient2::sendParallel(int tid, size_t numThreads, } void ParameterClient2::prepareSendData( - ParameterUpdateMode updateMode, ParameterType parameterType, - const std::vector& parameterSegments, int64_t numSamples, - real cost, bool sendBackParameter, ParameterType sendBackParameterType, - BatchStatus batchStatus, SendJob* sendJob) { + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, + BatchStatus batchStatus, + SendJob* sendJob) { sendJob->parallelRequests.resize(serviceNum_); sendJob->parallelInputIovs.resize(serviceNum_); @@ -247,11 +253,11 @@ void ParameterClient2::prepareSendData( const auto prefetchMat = parameter->getPrefetchMatrix(); CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr"; auto sendMat = dynamic_cast( - parameter->getMat(parameterType).get()); + parameter->getMat(parameterType).get()); CHECK(sendMat != nullptr) << "sendMat is nullptr"; syncThreadPool_->exec([&](int tid, size_t numThreads) { - const auto &localIndices = prefetchMat->getLocalIndices(); + const auto& localIndices = prefetchMat->getLocalIndices(); /// num of sparse rows size_t nLocalBlocks = localIndices.size(); uint64_t beginDim = 0; @@ -278,17 +284,17 @@ void ParameterClient2::prepareSendData( if (sendingPara) { sendJob->parallelInputIovs[serverId].push_back( - {sendMat->getLocalRow(row), sizeof(real) * (size_t) blockSize}); + {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize}); /// detect sparse parameter distribution sparseDistribution_->probeDistribution(serverId, - sizeof(real) * blockSize); + sizeof(real) * blockSize); } } }); } else { /// parameter set for dense and sparse - real* buf = sendingPara ? - parameter->getBuf(parameterType)->getPoint(0) : nullptr; + real* buf = + sendingPara ? 
parameter->getBuf(parameterType)->getPoint(0) : nullptr; uint64_t endDim = 0; for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) { endDim = std::min(beginDim + blockSize, paraSize); @@ -302,8 +308,8 @@ void ParameterClient2::prepareSendData( block->set_begin_pos(beginDim); block->set_block_size(endDim - beginDim); if (buf) { - sendJob->parallelInputIovs[serverId].push_back({buf + beginDim, - sizeof(real) * ((size_t) (endDim - beginDim))}); + sendJob->parallelInputIovs[serverId].push_back( + {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))}); } } } @@ -313,13 +319,23 @@ void ParameterClient2::prepareSendData( } void ParameterClient2::sendAndReceiveParameter( - ParameterUpdateMode updateMode, ParameterType parameterType, - const std::vector& parameterSegments, int64_t numSamples, - real cost, bool sendBackParameter, ParameterType sendBackParameterType, + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, ParameterType recvParameterType) { - prepareSendData(updateMode, parameterType, parameterSegments, numSamples, - cost, sendBackParameter, sendBackParameterType, - /*batchStatus = */ BATCH_START_AND_FINISH, &sendJob_); + prepareSendData(updateMode, + parameterType, + parameterSegments, + numSamples, + cost, + sendBackParameter, + sendBackParameterType, + /*batchStatus = */ BATCH_START_AND_FINISH, + &sendJob_); syncThreadPool_->exec([&](int tid, size_t numThreads) { this->sendParallel(tid, numThreads, recvParameterType); @@ -327,12 +343,22 @@ void ParameterClient2::sendAndReceiveParameter( } void ParameterClient2::sendParameter( - ParameterUpdateMode updateMode, ParameterType parameterType, - const std::vector& parameterSegments, int64_t numSamples, - real cost, bool sendBackParameter, BatchStatus batchStatus) { + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + BatchStatus batchStatus) { SendJobPtr sendJob = std::make_shared(); - prepareSendData(updateMode, parameterType, parameterSegments, numSamples, - cost, sendBackParameter, PARAMETER_VALUE, batchStatus, + prepareSendData(updateMode, + parameterType, + parameterSegments, + numSamples, + cost, + sendBackParameter, + PARAMETER_VALUE, + batchStatus, sendJob.get()); for (int i = 0; i < threadNum_; i++) { @@ -360,10 +386,12 @@ void ParameterClient2::send(int threadId) { /// pserver. 
i = calcClientId(i, serviceNum_); if (recvJob->parallelRequests.size()) { - clients_[i].send("sendParameter", recvJob->parallelRequests[i], + clients_[i].send("sendParameter", + recvJob->parallelRequests[i], recvJob->parallelInputIovs[i]); } else { - clients_[i].send("sendData", recvJob->parallelDataRequests[i], + clients_[i].send("sendData", + recvJob->parallelDataRequests[i], recvJob->parallelInputIovs[i]); } } @@ -586,12 +614,13 @@ void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) { ProtoMatrix& pmat = *op->add_matrices(); pmat.set_num_cols(mat->getWidth()); pmat.set_num_rows(mat->getHeight()); - copyToRepeatedField(pmat.mutable_values(), mat->getData(), - pmat.num_cols() * pmat.num_rows()); + copyToRepeatedField( + pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows()); } void ParameterClient2::doOperation(PreparedOperations& ops, - bool waitForGradient, bool sendBackGradient, + bool waitForGradient, + bool sendBackGradient, bool releasePass) { std::vector responses; ops.request_.set_wait_for_gradient(waitForGradient); @@ -666,7 +695,8 @@ void ParameterClient2::doOperation(PreparedOperations& ops, CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols()); CpuMatrixPtr amat = std::make_shared(const_cast(mat.values().data()), - rmat->getHeight(), rmat->getWidth()); + rmat->getHeight(), + rmat->getWidth()); rmat->add(*amat); } } @@ -700,14 +730,17 @@ void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) { doOperation(ops, false, false); } -void ParameterClient2::vectorAddMultInto(PServerVector u, PServerVector v, - PServerVector w, real a) { +void ParameterClient2::vectorAddMultInto(PServerVector u, + PServerVector v, + PServerVector w, + real a) { PreparedOperations ops; ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0); doOperation(ops, false, false); } -void ParameterClient2::vectorScaleInto(PServerVector u, PServerVector v, +void ParameterClient2::vectorScaleInto(PServerVector u, + PServerVector v, real a) { PreparedOperations ops; ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0); diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h index 7a4085ad8230747ab3c740910695932623946a5e..af8dd41ec4327fcf78625e7aa5d4b136ca7d14dd 100644 --- a/paddle/pserver/ParameterClient2.h +++ b/paddle/pserver/ParameterClient2.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -190,8 +189,8 @@ protected: }; struct ParameterSegments { - std::string name; // name of the parameter - size_t id; // id of the parameter + std::string name; // name of the parameter + size_t id; // id of the parameter }; /** @@ -225,7 +224,8 @@ public: * connections the parameter client maintains. */ ParameterClient2(bool separate = false, - int port = FLAGS_port, int numPorts = FLAGS_ports_num); + int port = FLAGS_port, + int numPorts = FLAGS_ports_num); ~ParameterClient2(); @@ -255,14 +255,14 @@ public: * client[recvParameterType] * @note Only parameterType will be sent. 
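 *
 * A minimal usage sketch (illustrative only; segments, numSamples and
 * cost are assumed to be in scope): push local gradients for the given
 * segments and receive fresh values back:
 *
 * @code
 * client.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
 *                                PARAMETER_GRADIENT,
 *                                segments,
 *                                numSamples,
 *                                cost,
 *                                true,             // sendBackParameter
 *                                PARAMETER_VALUE,  // sendBackParameterType
 *                                PARAMETER_VALUE); // recvParameterType
 * @endcode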
*/ - void sendAndReceiveParameter( - ParameterUpdateMode updateMode, - ParameterType parameterType, - const std::vector& segments, - int64_t numSamples, - real cost, bool sendBackParameter, - ParameterType sendBackParameterType, - ParameterType recvParameterType); + void sendAndReceiveParameter(ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& segments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, + ParameterType recvParameterType); /** * @brief Sends all parameters to parameter servers, and receives the response @@ -276,8 +276,13 @@ public: bool sendBackParameter, ParameterType sendBackParameterType = PARAMETER_VALUE, ParameterType recvParameterType = PARAMETER_VALUE) { - sendAndReceiveParameter(updateMode, parameterType, allSegments_, numSamples, - cost, sendBackParameter, sendBackParameterType, + sendAndReceiveParameter(updateMode, + parameterType, + allSegments_, + numSamples, + cost, + sendBackParameter, + sendBackParameterType, recvParameterType); } @@ -302,29 +307,41 @@ public: void sendParameter(ParameterUpdateMode updateMode, ParameterType parameterType, const std::vector& segments, - int64_t numSamples, real cost, bool sendBackParameter, + int64_t numSamples, + real cost, + bool sendBackParameter, BatchStatus batchStatus); void recvParameter(); /** - * Sends all parameters to parameter servers, recvParameter() have to be invoked + * Sends all parameters to parameter servers, recvParameter() has to be + * invoked * afterwards. * * @note This function is non-blocking. This means that if parameter should * not changes between this call and recvParameter() */ void sendParameter(ParameterUpdateMode updateMode, - ParameterType parameterType, int64_t numSamples, real cost, - bool sendBackParameter, BatchStatus batchStatus) { - sendParameter(updateMode, parameterType, allSegments_, numSamples, cost, - sendBackParameter, batchStatus); + ParameterType parameterType, + int64_t numSamples, + real cost, + bool sendBackParameter, + BatchStatus batchStatus) { + sendParameter(updateMode, + parameterType, + allSegments_, + numSamples, + cost, + sendBackParameter, + batchStatus); } /// Get all parameters from parameter servers void getParameter(ParameterType recvParameterType = PARAMETER_VALUE, ParameterType sendBackParameterType = PARAMETER_VALUE) { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, PARAMETER_VALUE, + sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, + PARAMETER_VALUE, 0, // numSamples = 0 0, // cost = 0 true, // sendBackParameter = true @@ -341,12 +358,14 @@ public: 0, // numSamples = 0 0, // cost = 0 true, // sendBackParameter = true - sendBackParameterType, recvParameterType); + sendBackParameterType, + recvParameterType); } /// Set all parameters on parameter servers using the local parameters void setParameter() { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, PARAMETER_VALUE, + sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, + PARAMETER_VALUE, 0, // numSamples = 0 0, // cost = 0 false); // sendBackParameter = false @@ -356,7 +375,8 @@ public: * means do not sending local parameters */ void setParameterZero() { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO, PARAMETER_VALUE, + sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO, + PARAMETER_VALUE, 0, // numSamples = 0 0, // cost = 0 false); // sendBackParameter = false @@ -401,15 +421,18 @@ public: * @param[in] If true, and if all clients call waitPassFinish, signal all * clients finish
the pass. */ - void doOperation(PreparedOperations& ops, bool waitForGradient, - bool sendBackParameter, bool releasePass = true); + void doOperation(PreparedOperations& ops, + bool waitForGradient, + bool sendBackParameter, + bool releasePass = true); /** * Set the configuration of pserver, including parameter config and * optimization config */ void setConfig(const OptimizationConfig& optConfig, - const std::string& saveDir = "", bool isSparseServer = false); + const std::string& saveDir = "", + bool isSparseServer = false); /// Return true if all pservers are in the given status bool inStatus(PServerStatus status); @@ -454,7 +477,9 @@ public: void vectorAddMult(PServerVector u, PServerVector v, real a); /// u = v + w * a - void vectorAddMultInto(PServerVector u, PServerVector v, PServerVector w, + void vectorAddMultInto(PServerVector u, + PServerVector v, + PServerVector w, real a); /// u = v * a void vectorScaleInto(PServerVector u, PServerVector v, real a); @@ -491,7 +516,8 @@ public: protected: template - void multiCall(const char* funcName, const ProtoIn& request, + void multiCall(const char* funcName, + const ProtoIn& request, std::vector* responses) { responses->resize(clients_.size()); size_t numClients = clients_.size(); @@ -511,10 +537,12 @@ private: * to all pservers. it is called under one SyncThreadPool. it * supports to use N thread to control M connections. the receiving * actions can be started until all sending action to all connections - * owned by current thread are finished. Different connections controlled + * owned by current thread are finished. Different connections + * controlled * by different threads can transfer data asynchronously. */ - void sendParallel(int tid, size_t numThreads, + void sendParallel(int tid, + size_t numThreads, ParameterType recvParameterType); /// sending thread routine for asynchronously send data void send(int threadId); @@ -535,9 +563,12 @@ private: ParameterUpdateMode updateMode, ParameterType parameterType, // client send type const std::vector& parameterSegments, - int64_t numSamples, real cost, bool sendBackParameter, + int64_t numSamples, + real cost, + bool sendBackParameter, ParameterType sendBackParameterType, // send back type in pserver - BatchStatus batchStatus, SendJob* sendJob); + BatchStatus batchStatus, + SendJob* sendJob); /// start necessary threads for threadPool void initThreads(); diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 8f72c1988d1676503f8ab1174d34a8ee6fe78516..b7f999f8b132e59ce8b7dffe5c4d43615e4c564c 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -31,10 +31,12 @@ limitations under the License. 
*/ #include "paddle/utils/GlobalConstants.h" P_DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); -P_DEFINE_double(async_lagged_ratio_min, 1.0, +P_DEFINE_double(async_lagged_ratio_min, + 1.0, "control config_.async_lagged_grad_discard_ratio() min value"); P_DEFINE_double( - async_lagged_ratio_default, 1.5, + async_lagged_ratio_default, + 1.5, "if async_lagged_grad_discard_ratio is not set in trainer_config.conf" "use it as defalut value"); @@ -47,7 +49,8 @@ const std::string ParameterServer2::kRetMsgInvalidVectorHandle = const std::string ParameterServer2::kRetMsgUnknownOperation = "Unknown operation"; -ParameterServer2::ParameterServer2(const std::string& addr, int port, +ParameterServer2::ParameterServer2(const std::string& addr, + int port, int rdmaCpu) : ProtoServer(addr, port, rdmaCpu), dataSize_(0), @@ -59,12 +62,12 @@ ParameterServer2::ParameterServer2(const std::string& addr, int port, allClientPassFinish_(false), serverId_(-1), batchId_(-1) { - /** - * register function for remote client calling, these functions - * will be mapped to a data structure for quick looking up. each - * request from trainer can contains one function name to indicate - * remote action. this architecture looks like rpc style for pserver. - */ + /** + * register function for remote client calling, these functions + * will be mapped to a data structure for quick looking up. each + * request from trainer can contains one function name to indicate + * remote action. this architecture looks like rpc style for pserver. + */ REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter); REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData); REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig); @@ -150,12 +153,12 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, mkDir(request.save_dir().c_str()); } - for (const auto& config : request.param_configs()) { - CHECK(!configMap_.count(config.para_id())) - << "Duplicated parameter name: " << config.name(); - configMap_[config.para_id()] = config; - CHECK_EQ(config.sparse_remote_update(), isSparseServer_); - } + for (const auto& config : request.param_configs()) { + CHECK(!configMap_.count(config.para_id())) + << "Duplicated parameter name: " << config.name(); + configMap_[config.para_id()] = config; + CHECK_EQ(config.sparse_remote_update(), isSparseServer_); + } config_ = request.opt_config(); if (config_.algorithm() == TrainAlgorithm::AsyncSGD) { @@ -264,6 +267,15 @@ void ParameterServer2::setParameter(const SendParameterRequest& request, std::vector blockIds; blockIds.reserve(request.blocks_size()); int bufferIndex = 0; + + if (!request.blocks().size()) { + LOG(WARNING) + << "--ports_num or --ports_num_for_sparse might be too large, " + << "or total dense parameter size or sparse parameters size " + << "might be too small, this psever doesn't store any parameter."; + return; + } + for (const auto& block : request.blocks()) { /// block size for parameter(e.g. 
128 for sparse row, 1K for dense) uint64_t blockSize = getParameterConfig(block).parameter_block_size(); @@ -330,8 +342,8 @@ void ParameterServer2::setParameter(const SendParameterRequest& request, << "width : " << width; } info.optimizer->init(1, info.config); - usedSegments_.push_back(std::make_pair(offsets[i], - offsets[i] + request.blocks(i).block_size())); + usedSegments_.push_back(std::make_pair( + offsets[i], offsets[i] + request.blocks(i).block_size())); } mergeSegments(&usedSegments_); @@ -355,15 +367,18 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, std::vector* outputBuffers) { VLOG(1) << "pserver: addGradient"; - /// forwardbackward delta from all trainers - /// indicate the fluctuation caused by forwardbackward. +/// forwardbackward delta from all trainers +/// indicate the fluctuation caused by forwardbackward. #ifndef PADDLE_METRIC_LEARNING // @TODO(yanfei): // add support tuning forwardbackward balance for metric learning if (!numPassFinishClients_) { REGISTER_BARRIER_DELTA_SERVER_SET( - *statSet_, "forwardbackwardDelta", FLAGS_num_gradient_servers, - request.trainer_id(), request.forwardbackward_time(), + *statSet_, + "forwardbackwardDelta", + FLAGS_num_gradient_servers, + request.trainer_id(), + request.forwardbackward_time(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } #endif @@ -381,14 +396,19 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, /// barrier fluctuation caused by network and previous forwardbackward if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER_SET( - *statSet_, "handleReqBegin", FLAGS_num_gradient_servers, - request.trainer_id(), (*handleRequestBegin_), + *statSet_, + "handleReqBegin", + FLAGS_num_gradient_servers, + request.trainer_id(), + (*handleRequestBegin_), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER( - *statSet_, "addGradBegin", FLAGS_num_gradient_servers, + *statSet_, + "addGradBegin", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } @@ -405,8 +425,8 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, int64_t blockId = getBlockId(block); CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); Buffer buffer = inputBuffers[bufferIndex]; ++bufferIndex; @@ -429,7 +449,9 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER( - *statSet_, "addGradCoreFinish", FLAGS_num_gradient_servers, + *statSet_, + "addGradCoreFinish", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } @@ -444,7 +466,9 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, /// numPassFinishClients_ means some trainer has entered finishPass if (!numPassFinishClients_) { REGISTER_SLOW_NODES_PROBE( - *statSet_, "SLOW_NODES", FLAGS_num_gradient_servers, + *statSet_, + "SLOW_NODES", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? 
"_sparseUpdater" : "_denseUpdater"); } @@ -454,7 +478,9 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, /// if wait pass finish does not start, do check if (!numPassFinishClients_) { - CHECK_BARRIER_TIMER(*statSet_, "SLOW_NODES", FLAGS_num_gradient_servers, + CHECK_BARRIER_TIMER(*statSet_, + "SLOW_NODES", + FLAGS_num_gradient_servers, isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } @@ -462,7 +488,9 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, /// can indicate the fluctation caused by computation at pserver. if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER( - *statSet_, "paraReady", FLAGS_num_gradient_servers, + *statSet_, + "paraReady", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } @@ -472,7 +500,8 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, { /// total time except overhead of network. REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecvNoSend", - timeToMicroSecond(*addGradBegin_), -1, + timeToMicroSecond(*addGradBegin_), + -1, *statSet_); } } @@ -600,7 +629,8 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request, << " block id=" << block.block_id(); int64_t blockId = getBlockId(block); CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); Buffer buffer = inputBuffers[bufferIndex]; ++bufferIndex; @@ -721,10 +751,11 @@ void ParameterServer2::sendBackParameter(const ParameterBlock& block, int64_t offset = getBlockOffset(block); CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); real* valueBuffer = vectors_[parameterType]->getPoint(offset); - outputBuffers->push_back({valueBuffer, (size_t) block.block_size()}); + outputBuffers->push_back({valueBuffer, (size_t)block.block_size()}); } void ParameterServer2::sendBackParameter(const ParameterBlock& block, @@ -740,7 +771,8 @@ void ParameterServer2::sendBackParameter(const ParameterBlock& block, int64_t offset = getBlockOffset(block); CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); size_t size = buffer->size; real* valueBuffer = vectors_[parameterType]->getPoint(offset); @@ -750,8 +782,11 @@ void ParameterServer2::sendBackParameter(const ParameterBlock& block, } void ParameterServer2::sendBackParameterSparse( - const ParameterBlock& block, int parameterType, - SendParameterResponse* response, Buffer* buffer, size_t width, + const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + size_t width, std::vector* outputBuffers) { ParameterBlock* returnBlock = response->add_blocks(); returnBlock->set_para_id(block.para_id()); @@ -760,7 +795,8 @@ void ParameterServer2::sendBackParameterSparse( returnBlock->set_block_size(block.block_size()); int64_t offset = getBlockOffset(block); CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); real* valueBuffer = vectors_[parameterType]->getPoint(offset); CHECK_EQ(buffer->size, width); @@ 
-772,7 +808,7 @@ void ParameterServer2::readAllBlocks( MsgReader* msgReader, std::vector<Buffer>* buffers) { auto& buffer = *readWriteBuffer_; size_t numBlocks = msgReader->getNumBlocks(); - buffer.resizeWithAlignHints(msgReader->getTotalLength()/sizeof(real), + buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real), numBlocks); std::vector bufs(numBlocks); buffers->clear(); @@ -852,7 +888,9 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request, /// indicates network fluctuation for big message. if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER( - *statSet_, "sendParamFinish", FLAGS_num_gradient_servers, + *statSet_, + "sendParamFinish", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } @@ -862,13 +900,15 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request, /// total time including overhead of network. REGISTER_TIMER_DYNAMIC_SET("sendParaTotal", timeToMicroSecond(*handleRequestBegin_), - -1, *statSet_); + -1, + *statSet_); } /// all time exhausted in pserverServer except receive network. { /// total time except overhead of network receive REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecv", - timeToMicroSecond(*addGradBegin_), -1, + timeToMicroSecond(*addGradBegin_), + -1, *statSet_); } } @@ -998,36 +1038,42 @@ void ParameterServer2::clearUnusedSegments(CpuVector* vec) { return; } memset(data, 0, sizeof(real) * usedSegments_[0].first); - memset(data + usedSegments_.back().second, 0, + memset(data + usedSegments_.back().second, + 0, sizeof(real) * (size_ - usedSegments_.back().second)); size_t n = size_ - usedSegments_.back().second; for (size_t i = 1; i < usedSegments_.size(); ++i) { memset( - data + usedSegments_[i - 1].second, 0, + data + usedSegments_[i - 1].second, + 0, sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second)); n += usedSegments_[i].first - usedSegments_[i - 1].second; } } void ParameterServer2::parallelExecForEachBlock(ExecFunc func) { - SyncThreadPool::execHelper(syncThreadPool_.get(), [&](int tid, - size_t numThreads) { - int64_t numBlocks = blockIdMap_.size(); - VectorPtr* vecs = Parameter::getTlsTempBufs(); - for (int64_t blockId = tid; blockId < numBlocks; blockId += numThreads) { - func(blockId, vecs); - } - }); + SyncThreadPool::execHelper(syncThreadPool_.get(), + [&](int tid, size_t numThreads) { + int64_t numBlocks = blockIdMap_.size(); + VectorPtr* vecs = Parameter::getTlsTempBufs(); + for (int64_t blockId = tid; blockId < numBlocks; + blockId += numThreads) { + func(blockId, vecs); + } + }); } void ParameterServer2::blockTraverse( - BlockInfo& info, const ParameterConfig& config, int64_t offset, size_t size, + BlockInfo& info, + const ParameterConfig& config, + int64_t offset, + size_t size, const VectorPtr vecs[], const ParameterOptimizer::TraverseCallback& callback) { /// setup sub bufs for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); + vecs[type]->subVecFrom(*vectors_[type], offset, size); } callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU); } @@ -1055,10 +1101,10 @@ void ParameterServer2::op_SGD(const Operation& operation, info.optimizer->startBatch(numSamplesProcessed_); for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); + vecs[type]->subVecFrom(*vectors_[type], offset, size); } - info.optimizer->update(vecs, config, - config.sparse_remote_update() ?
0 : -1LU); + info.optimizer->update( + vecs, config, config.sparse_remote_update() ? 0 : -1LU); vecs[PARAMETER_GRADIENT]->zeroMem(); if (auto callback = info.optimizer->needSpecialTraversal(config)) { @@ -1460,7 +1506,6 @@ void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request, void ParameterServer2::synchronize(const SynchronizeRequest& request, ProtoResponseCallback callback) { - CHECK_LT(request.sync_object_id(), SyncObject_ARRAYSIZE); synchronizeBarriers_[request.sync_object_id()]->wait(); dataSize_ = 0; callback(SynchronizeResponse()); @@ -1468,7 +1513,6 @@ void ParameterServer2::synchronize(const SynchronizeRequest& request, void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request, ProtoResponseCallback callback) { - CHECK_LT(request.sync_object_id(), SyncObject_ARRAYSIZE); synchronizeBarriers_[request.sync_object_id()]->wait(); callback(SynchronizeResponse()); diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h index ceb1ad69e9ec51894d869cee63f48950e5e8fa7c..ccaea42e7d0cb1865234702315fd4bbd00e548d5 100644 --- a/paddle/pserver/ParameterServer2.h +++ b/paddle/pserver/ParameterServer2.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -55,7 +54,6 @@ namespace paddle { // computation causes big optmization latency, the GPU may be required by // pserver. - /** * Client interface for the parameter server * @@ -189,9 +187,10 @@ protected: */ constexpr static size_t AlignElementCount = AlignBytes / sizeof(T); - static_assert( - AlignElementCount == (AlignElementCount & -AlignElementCount) - || AlignBytes > sizeof(T), "AlignElementCount should be exp of 2"); + static_assert(AlignElementCount == + (AlignElementCount & -AlignElementCount) || + AlignBytes > sizeof(T), + "AlignElementCount should be exp of 2"); /** * @brief Resize Buffer, with block count that will be allocated. Each block @@ -205,7 +204,7 @@ protected: } else { //! at most, we need such elements in buffer to make sure each block is //! aligned. - this->resize(size + alignBlockCount* (AlignElementCount - 1)); + this->resize(size + alignBlockCount * (AlignElementCount - 1)); } } @@ -224,8 +223,8 @@ protected: curOffset_ += blockSize; if (!IsTLargerThanAlign) { - curOffset_ = (curOffset_ + AlignElementCount - 1) & - ~(AlignElementCount -1); + curOffset_ = + (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1); } return r; } @@ -369,7 +368,8 @@ public: /** * @brief send config to pserver * - * @note it can help pserver to understand the configuration for optimization, + * @note it can help pserver to understand the configuration for + * optimization, * logging control, duplicated initialization, etc. 
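The ReadWriteBuffer changes above all rest on standard power-of-two alignment arithmetic: the static_assert checks that AlignElementCount is a power of two via the x == (x & -x) identity, resizeWithAlignHints over-allocates AlignElementCount - 1 elements per block as worst-case padding, and nextBlock rounds curOffset_ up with (x + A - 1) & ~(A - 1). A minimal Python sketch of the same three computations (function names here are invented for illustration):

    def is_power_of_two(x):
        # A power of two has exactly one bit set, so x & -x isolates
        # the lowest set bit and equals x itself.
        return x > 0 and x == (x & -x)

    def round_up(offset, align):
        # Round offset up to the next multiple of align; only valid when
        # align is a power of two, which is what the static_assert guards.
        assert is_power_of_two(align)
        return (offset + align - 1) & ~(align - 1)

    def resize_with_align_hints(size, block_count, align):
        # Worst case, every block boundary wastes align - 1 elements,
        # which is what resizeWithAlignHints reserves up front.
        return size + block_count * (align - 1)

    assert round_up(5, 8) == 8
    assert round_up(16, 8) == 16
    assert resize_with_align_hints(100, 4, 8) == 128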
*/ void setConfig(const SetConfigRequest& request, @@ -545,17 +545,17 @@ protected: std::vector* buffers); const ParameterConfig& getParameterConfig(const ParameterBlock& block) { - CHECK_LT(block.para_id(), -1UL) - << "invalid parameter id:" << block.para_id(); + CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:" + << block.para_id(); const auto it = configMap_.find(block.para_id()); - CHECK(it != configMap_.end()) - << "can not find parameter id: " << block.para_id(); + CHECK(it != configMap_.end()) << "can not find parameter id: " + << block.para_id(); return it->second; } /// it implictly check blockOffsetMap_ while retrieving blockId const ParameterConfig& getParameterConfig(int64_t blockId) const { - CHECK(blockId >= 0 && blockId < (int64_t) blockInfos_.size()) + CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size()) << "block idx out of range, id: " << blockId << " info size: " << blockInfos_.size(); return *(blockInfos_[blockId].config); @@ -614,7 +614,8 @@ protected: * vectors_[parameterType] directly * for dense with sync-sgd */ - void sendBackParameter(const ParameterBlock& block, int parameterType, + void sendBackParameter(const ParameterBlock& block, + int parameterType, SendParameterResponse* response, std::vector* outputBuffers); @@ -627,16 +628,20 @@ protected: * to buffer->base. * for dense with async-sgd */ - void sendBackParameter(const ParameterBlock& block, int parameterType, - SendParameterResponse* response, Buffer* buffer, + void sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, std::vector* outputBuffers); /** * @brief prepare data for sending back * * @note specified for sparse */ - void sendBackParameterSparse(const ParameterBlock& block, int parameterType, - SendParameterResponse* response, Buffer* buffer, + void sendBackParameterSparse(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, size_t width, std::vector* outputBuffers); @@ -648,8 +653,11 @@ protected: */ typedef std::function ExecFunc; void parallelExecForEachBlock(ExecFunc func); - void blockTraverse(BlockInfo& info, const ParameterConfig& config, - int64_t offset, size_t size, const VectorPtr vecs[], + void blockTraverse(BlockInfo& info, + const ParameterConfig& config, + int64_t offset, + size_t size, + const VectorPtr vecs[], const ParameterOptimizer::TraverseCallback& callback); public: diff --git a/paddle/pserver/ProtoServer.cpp b/paddle/pserver/ProtoServer.cpp index 0ce06ddf9180299c0ecf28669fe96e9668d9d48b..2f6d911a017d231692c42f2a235cf1e15257f7ae 100644 --- a/paddle/pserver/ProtoServer.cpp +++ b/paddle/pserver/ProtoServer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
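As the header comments above note, the dense sendBackParameter overloads differ in where the returned bytes live: the sync-SGD variant hands out pointers directly into vectors_[parameterType] (no copy), while the async-SGD variant first copies the current values into a caller-provided buffer, presumably so an in-flight send observes a consistent snapshot while the optimizer keeps updating; the sparse variant additionally validates the row width. A rough numpy sketch of the two dense strategies (every name here is invented for illustration, this is not the pserver code):

    import numpy as np

    store = np.zeros(1024, dtype=np.float32)   # stands in for vectors_[type]

    def send_back_sync(offset, size, output_bufs):
        # Dense + sync SGD: append a view into the store, zero copy.
        output_bufs.append(store[offset:offset + size])

    def send_back_async(offset, size, scratch, output_bufs):
        # Dense + async SGD: snapshot into a pre-allocated buffer first.
        np.copyto(scratch[:size], store[offset:offset + size])
        output_bufs.append(scratch[:size])

    bufs = []
    send_back_sync(0, 128, bufs)
    send_back_async(128, 128, np.empty(128, dtype=np.float32), bufs)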
*/ - #include "ProtoServer.h" namespace paddle { @@ -42,8 +41,8 @@ void ProtoServer::handleRequest(std::unique_ptr msgReader, void ProtoServer::registerServiceFunctionImp(const std::string& funcName, ServiceFunction func) { - CHECK(!nameToFuncMap_.count(funcName)) - << "Duplicated registration: " << funcName; + CHECK(!nameToFuncMap_.count(funcName)) << "Duplicated registration: " + << funcName; nameToFuncMap_[funcName] = func; } diff --git a/paddle/pserver/ProtoServer.h b/paddle/pserver/ProtoServer.h index 86e715868356ca1939dac819b52e816e19d7d361..cf08e24ff3ef47d9c17bfe14d7d3aff1537b8ce8 100644 --- a/paddle/pserver/ProtoServer.h +++ b/paddle/pserver/ProtoServer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "LightNetwork.h" @@ -23,17 +22,17 @@ limitations under the License. */ namespace paddle { - /** - * - * It implements the rpc framework, which launchs one thread for each - * connection. Here define one parameter server as single TCP server - * binding on single port. All connections share single tcp ProtoServer - * object, each connection handles all requests from specified trainer - * within single worker thread. - * to accelerate bandwidth efficiency and harness multicore for pserver - * optimization to reduce pserver latency, you could launch more port - * for single NIC hardward with --port=N(N>1) for small cluster job. - */ +/** + * + * It implements the rpc framework, which launchs one thread for each + * connection. Here define one parameter server as single TCP server + * binding on single port. All connections share single tcp ProtoServer + * object, each connection handles all requests from specified trainer + * within single worker thread. + * to accelerate bandwidth efficiency and harness multicore for pserver + * optimization to reduce pserver latency, you could launch more port + * for single NIC hardward with --port=N(N>1) for small cluster job. 
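The class comment above (reflowed by this patch) summarizes the serving model: one TCP server binds one port, each accepted connection gets a dedicated worker thread that serves every request from that trainer, and --port=N simply runs N such servers on consecutive ports of one NIC. A hedged sketch of that shape using the Python standard library (the echo handler merely stands in for request dispatch):

    import socketserver   # named SocketServer in the Python 2 stdlib
    import threading

    class TrainerHandler(socketserver.BaseRequestHandler):
        def handle(self):
            # One thread per accepted connection; it serves all requests
            # from that trainer until the trainer disconnects.
            while True:
                data = self.request.recv(4096)
                if not data:
                    break
                self.request.sendall(data)

    def serve(base_port, num_ports):
        # The --port=N idea: N independent servers on consecutive ports.
        servers = []
        for port in range(base_port, base_port + num_ports):
            server = socketserver.ThreadingTCPServer(("", port), TrainerHandler)
            threading.Thread(target=server.serve_forever).start()
            servers.append(server)
        return servers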
+ */ class ProtoServer : public SocketServer { public: /// rdmaCpu controls the cpu affinity of RDMA server daemon, @@ -84,7 +83,8 @@ public: template void registerServiceFunctionEx( const std::string& funcName, - std::function msgReader, + std::function msgReader, ProtoResponseCallbackEx callback)> func); protected: @@ -120,7 +120,8 @@ protected: class ProtoClient : public SocketClient { public: - ProtoClient(const std::string& serverAddr, int serverPort, + ProtoClient(const std::string& serverAddr, + int serverPort, enum ChannelType channelType = F_TCP) : SocketClient(serverAddr, serverPort, channelType) {} @@ -133,7 +134,8 @@ public: * @note iov provides additional blocks which need to be written to the * communication channel */ - void send(const char* funcName, const google::protobuf::MessageLite& proto, + void send(const char* funcName, + const google::protobuf::MessageLite& proto, const std::vector& iov = std::vector()); /** @@ -148,7 +150,8 @@ public: /// combines send() and recv() std::unique_ptr sendAndRecv( - const char* funcName, const google::protobuf::MessageLite& protoIn, + const char* funcName, + const google::protobuf::MessageLite& protoIn, google::protobuf::MessageLite* protoOut) { send(funcName, protoIn); return recv(protoOut); @@ -156,8 +159,10 @@ public: /// combines send() and recv() std::unique_ptr sendAndRecv( - const char* funcName, const google::protobuf::MessageLite& protoIn, - const std::vector& iov, google::protobuf::MessageLite* protoOut) { + const char* funcName, + const google::protobuf::MessageLite& protoIn, + const std::vector& iov, + google::protobuf::MessageLite* protoOut) { send(funcName, protoIn, iov); return recv(protoOut); } @@ -172,52 +177,62 @@ struct service_arg_type { }; template -struct service_arg_type, - Arg2)> { +struct service_arg_type, + Arg2)> { typedef Arg1 _1; }; /// register a service function to the ProtoServer /// This should only be used within a member function of className -#define REGISTER_SERVICE_FUNCTION(className, funcName) \ - registerServiceFunction< \ - service_arg_type::_1>( \ - #funcName, std::bind(&className::funcName, this, std::placeholders::_1, \ - std::placeholders::_2)) +#define REGISTER_SERVICE_FUNCTION(className, funcName) \ + registerServiceFunction< \ + service_arg_type::_1>( \ + #funcName, \ + std::bind(&className::funcName, \ + this, \ + std::placeholders::_1, \ + std::placeholders::_2)) /// register a service function to the ProtoServer /// This should only be used within a member function of className -#define REGISTER_SERVICE_FUNCTION_EX(className, funcName) \ - registerServiceFunctionEx< \ - service_arg_type::_1>( \ - #funcName, std::bind(&className::funcName, this, std::placeholders::_1, \ - std::placeholders::_2, std::placeholders::_3)) +#define REGISTER_SERVICE_FUNCTION_EX(className, funcName) \ + registerServiceFunctionEx< \ + service_arg_type::_1>( \ + #funcName, \ + std::bind(&className::funcName, \ + this, \ + std::placeholders::_1, \ + std::placeholders::_2, \ + std::placeholders::_3)) /// create wrapper function for parameter server high level function and /// register the wrapper function into function mapping. 
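The two registration templates that follow share one adapter pattern: wrap a typed protobuf handler in a generic byte-level handler that parses the request out of the first message block and serializes the response as the first output block (iov). A self-contained Python sketch of that adapter, with a stub class standing in for a generated protobuf type:

    name_to_func = {}  # stands in for ProtoServer::nameToFuncMap_

    def register_service_function(func_name, request_cls, func):
        # Mirrors registerServiceFunctionImp's duplicate-registration check.
        if func_name in name_to_func:
            raise ValueError("Duplicated registration: " + func_name)

        def wrapper(first_block, callback):
            request = request_cls()
            request.ParseFromString(first_block)   # decode request block

            def pcob(response):
                # The serialized response becomes the first output block;
                # real handlers may append extra data blocks after it.
                callback([response.SerializeToString()])

            func(request, pcob)

        name_to_func[func_name] = wrapper

    class EchoProto(object):  # stub for a generated protobuf message
        def __init__(self):
            self.payload = b""
        def ParseFromString(self, s):
            self.payload = s
        def SerializeToString(self):
            return self.payload

    register_service_function("echo", EchoProto, lambda req, cb: cb(req))
    name_to_func["echo"](b"ping", lambda iovs: None)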
template void ProtoServer::registerServiceFunctionEx( const std::string& funcName, - std::function msgReader, + std::function msgReader, ProtoResponseCallbackEx callback)> func) { - auto f = - [func](std::unique_ptr msgReader, ResponseCallback callback) { - ProtoIn request; - std::string str(msgReader->getNextBlockLength(), 0); - msgReader->readNextBlock(&str[0]); - CHECK(request.ParseFromString(str)); - auto pcob = [callback](const google::protobuf::MessageLite& response, - const std::vector& outputIovs) { - std::string out; - CHECK(response.SerializeToString(&out)); - std::vector iovs; - iovs.push_back({&out[0], out.size()}); - iovs.insert(iovs.end(), outputIovs.begin(), outputIovs.end()); - callback(iovs); - }; - - func(request, std::move(msgReader), pcob); - }; + auto f = [func](std::unique_ptr msgReader, + ResponseCallback callback) { + ProtoIn request; + std::string str(msgReader->getNextBlockLength(), 0); + msgReader->readNextBlock(&str[0]); + CHECK(request.ParseFromString(str)); + auto pcob = [callback](const google::protobuf::MessageLite& response, + const std::vector& outputIovs) { + std::string out; + CHECK(response.SerializeToString(&out)); + std::vector iovs; + iovs.push_back({&out[0], out.size()}); + iovs.insert(iovs.end(), outputIovs.begin(), outputIovs.end()); + callback(iovs); + }; + + func(request, std::move(msgReader), pcob); + }; registerServiceFunctionImp(funcName, f); } @@ -226,24 +241,24 @@ template void ProtoServer::registerServiceFunction( const std::string& funcName, std::function func) { - auto f = - [func](std::unique_ptr msgReader, ResponseCallback callback) { - ProtoIn request; - std::string str(msgReader->getNextBlockLength(), 0); - msgReader->readNextBlock(&str[0]); - CHECK(request.ParseFromString(str)); - msgReader.reset(); - - auto pcob = [callback](const google::protobuf::MessageLite& response) { - std::string out; - CHECK(response.SerializeToString(&out)); - std::vector iovs; - iovs.push_back({&out[0], out.size()}); - callback(iovs); - }; - - func(request, pcob); - }; + auto f = [func](std::unique_ptr msgReader, + ResponseCallback callback) { + ProtoIn request; + std::string str(msgReader->getNextBlockLength(), 0); + msgReader->readNextBlock(&str[0]); + CHECK(request.ParseFromString(str)); + msgReader.reset(); + + auto pcob = [callback](const google::protobuf::MessageLite& response) { + std::string out; + CHECK(response.SerializeToString(&out)); + std::vector iovs; + iovs.push_back({&out[0], out.size()}); + callback(iovs); + }; + + func(request, pcob); + }; registerServiceFunctionImp(funcName, f); } diff --git a/paddle/pserver/RDMANetwork.h b/paddle/pserver/RDMANetwork.h index 05b845b68a150cb36fa4ba09150bc8f41e3922c8..4e492a3afd120462ac6e056b9df850063c503a53 100644 --- a/paddle/pserver/RDMANetwork.h +++ b/paddle/pserver/RDMANetwork.h @@ -76,7 +76,7 @@ inline sxi_sock* accept(sxi_socket* s) { inline sockaddr_in* getSourceAddress(sxi_sock* sock) { #ifndef PADDLE_DISABLE_RDMA - return reinterpret_cast(&sock->sa); + return reinterpret_cast(&sock->sa); #else PROMPT_ERR(); #endif @@ -98,7 +98,6 @@ inline int close(sxi_sock* sock) { #endif } - inline void init() { #ifndef PADDLE_DISABLE_RDMA sxi_module_init(); @@ -155,6 +154,5 @@ inline sxi_sock* connect(sxi_socket* socket, const char* url) { #endif } - } // namespace rdma } // namespace paddle diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp index 20295d7cdc22b5dba6380a0792eafef9feec257a..4ebc47d32659d82f32b9da529aec7ec3f46f77a9 100644 --- a/paddle/pserver/SocketChannel.cpp +++ 
b/paddle/pserver/SocketChannel.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "SocketChannel.h" #include @@ -35,7 +34,6 @@ namespace paddle { #define UIO_MAXIOV 512 #endif - SocketChannel::~SocketChannel() { if (tcpRdma_ == F_TCP) close(tcpSocket_); @@ -81,8 +79,12 @@ size_t SocketChannel::write(const void* buf, size_t size) { } template -static size_t readwritev(IOFunc iofunc, SocketType socket, iovec* iovs, - int iovcnt, int maxiovs, const std::string& peerName) { +static size_t readwritev(IOFunc iofunc, + SocketType socket, + iovec* iovs, + int iovcnt, + int maxiovs, + const std::string& peerName) { int curIov = 0; size_t total = 0; @@ -123,25 +125,40 @@ static size_t readwritev(IOFunc iofunc, SocketType socket, iovec* iovs, return size; } - /// rdma::readv and rdma::writev can take advantage of RDMA blocking offload /// transfering size_t SocketChannel::writev(const std::vector& iovs) { if (tcpRdma_ == F_TCP) - return readwritev(::writev, tcpSocket_, const_cast(&iovs[0]), - iovs.size(), UIO_MAXIOV, peerName_); + return readwritev(::writev, + tcpSocket_, + const_cast(&iovs[0]), + iovs.size(), + UIO_MAXIOV, + peerName_); else - return readwritev(rdma::writev, rdmaSocket_, const_cast(&iovs[0]), - iovs.size(), MAX_VEC_SIZE, peerName_); + return readwritev(rdma::writev, + rdmaSocket_, + const_cast(&iovs[0]), + iovs.size(), + MAX_VEC_SIZE, + peerName_); } size_t SocketChannel::readv(std::vector* iovs) { if (tcpRdma_ == F_TCP) - return readwritev(::readv, tcpSocket_, const_cast(&(*iovs)[0]), - iovs->size(), UIO_MAXIOV, peerName_); + return readwritev(::readv, + tcpSocket_, + const_cast(&(*iovs)[0]), + iovs->size(), + UIO_MAXIOV, + peerName_); else - return readwritev(rdma::readv, rdmaSocket_, const_cast(&(*iovs)[0]), - iovs->size(), MAX_VEC_SIZE, peerName_); + return readwritev(rdma::readv, + rdmaSocket_, + const_cast(&(*iovs)[0]), + iovs->size(), + MAX_VEC_SIZE, + peerName_); } void SocketChannel::writeMessage(const std::vector& userIovs) { @@ -157,8 +174,8 @@ void SocketChannel::writeMessage(const std::vector& userIovs) { std::vector iovs; iovs.reserve(userIovs.size() + 2); iovs.push_back({&header, sizeof(header)}); - iovs.push_back({&iovLengths[0], static_cast( - sizeof(iovLengths[0]) * header.numIovs)}); + iovs.push_back({&iovLengths[0], + static_cast(sizeof(iovLengths[0]) * header.numIovs)}); iovs.insert(iovs.end(), userIovs.begin(), userIovs.end()); header.totalLength = 0; diff --git a/paddle/pserver/SocketChannel.h b/paddle/pserver/SocketChannel.h index fb9ac2e1dc23d9921777427540fb482e9bb0bd08..472b37a12283ca1c358034427d491804af765171 100644 --- a/paddle/pserver/SocketChannel.h +++ b/paddle/pserver/SocketChannel.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" diff --git a/paddle/pserver/SparseParameterDistribution.cpp b/paddle/pserver/SparseParameterDistribution.cpp index 31682c158e8006e071d681b29322b6000a9d1329..2085b22a95138fa8caf474a081fb46229688966f 100644 --- a/paddle/pserver/SparseParameterDistribution.cpp +++ b/paddle/pserver/SparseParameterDistribution.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
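writeMessage above frames a request as a fixed header (block count and total length), then one length per block, then the block payloads, and hands all of it to writev() as a single scatter-gather write so no payload gets copied; the readwritev helper then absorbs partial writes by walking the iov list until every byte is flushed. A Python sketch of the same framing (the <II little-endian header layout here is illustrative, not the real wire format):

    import struct

    def write_message(sock, blocks):
        # Frame: header(numIovs, totalLength), per-block lengths, payloads.
        lengths = [len(b) for b in blocks]
        header = struct.pack("<II", len(blocks), sum(lengths))
        parts = [header, struct.pack("<%dI" % len(blocks), *lengths)]
        parts.extend(blocks)
        # sendmsg (Python 3, Unix) performs the gather write, like writev.
        sock.sendmsg(parts)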
See the License for the specific language governing permissions and limitations under the License. */ - #include #include "paddle/utils/Logging.h" @@ -21,19 +20,24 @@ limitations under the License. */ #include "SparseParameterDistribution.h" -P_DEFINE_bool(check_sparse_distribution_in_pserver, false, +P_DEFINE_bool(check_sparse_distribution_in_pserver, + false, "check whether sparse parameter exhibits balanced distribution at " "all pservers"); -P_DEFINE_bool(show_check_sparse_distribution_log, false, +P_DEFINE_bool(show_check_sparse_distribution_log, + false, "show log details for sparse parameter distribution in pserver"); -P_DEFINE_int32(check_sparse_distribution_batches, 100, +P_DEFINE_int32(check_sparse_distribution_batches, + 100, "run sparse parameter distribution check for N batches"); P_DEFINE_double( - check_sparse_distribution_ratio, 0.6, + check_sparse_distribution_ratio, + 0.6, "if parameters dispatched to different pservers exhibit unbalanced " " distribution for check_sparse_distribution_ratio * " " check_sparse_distribution_batches times, crash program"); -P_DEFINE_double(check_sparse_distribution_unbalance_degree, 2.0, +P_DEFINE_double(check_sparse_distribution_unbalance_degree, + 2.0, "the ratio of maximum data size and minimum data size for " "different pserver"); diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp index 260aed0083c5d19ba6a766a70b51e30042389e38..24c90f10785a6f5870ab291a5c5e6c13fbc0d49f 100644 --- a/paddle/pserver/test/SocketTest.cpp +++ b/paddle/pserver/test/SocketTest.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include @@ -184,7 +183,8 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) { bzero((char*)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; - bcopy((char*)server->h_addr, (char*)&serv_addr.sin_addr.s_addr, + bcopy((char*)server->h_addr, + (char*)&serv_addr.sin_addr.s_addr, server->h_length); serv_addr.sin_port = htons(serverPort); diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp index c9722f1212ae9b7cab15c5ae314c604ffa8f0647..eb813e92d6d696db6c2ced543a00594b69c7f5af 100644 --- a/paddle/pserver/test/test_ParameterServer2.cpp +++ b/paddle/pserver/test/test_ParameterServer2.cpp @@ -27,7 +27,9 @@ P_DEFINE_int32(server_cpu, 0, "assign server cpu"); class ParameterServer2Tester : public ParameterServer2 { public: - ParameterServer2Tester(std::string serverAddr, int port, int rdmaCpu = -1, + ParameterServer2Tester(std::string serverAddr, + int port, + int rdmaCpu = -1, bool sepSendAndRecv = false) : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {} virtual ~ParameterServer2Tester() {} @@ -63,7 +65,7 @@ public: } size_t id = 0; - for (auto &para : parameters_) { + for (auto& para : parameters_) { para->setID(id++); } @@ -560,8 +562,8 @@ TEST(ParameterServer2, sendData) { std::unique_ptr<ParameterServer2Tester> g_server2; std::unique_ptr<ParameterServer2Tester> g_server3; if (FLAGS_rdma_tcp == "rdma") { - g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port, - FLAGS_server_cpu)); + g_server1.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); g_server1->start(); g_server2.reset(new ParameterServer2Tester( FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1)); @@ -604,8 +606,8 @@ int main(int argc, char** argv) {
FLAGS_num_gradient_servers = 2; if (FLAGS_rdma_tcp == "rdma") { - g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port, - FLAGS_server_cpu)); + g_server.reset(new ParameterServer2Tester( + FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu)); } else { g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port)); } diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp index 065d6b3396be2287ee14226b4cf9b07be32e63e0..79d1f2743a1c2e6050afe48d6cf86a1084a4500c 100644 --- a/paddle/pserver/test/test_ProtoServer.cpp +++ b/paddle/pserver/test/test_ProtoServer.cpp @@ -126,9 +126,11 @@ TEST(ProtoServer, extended) { GetStatusResponse response; { REGISTER_TIMER("sendAndRecv"); - auto msgReader = client->sendAndRecv( - "getStatusEx", request, {{cpuGrad.getData(), (size_t)dataSize}}, - &response); + auto msgReader = + client->sendAndRecv("getStatusEx", + request, + {{cpuGrad.getData(), (size_t)dataSize}}, + &response); EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1); EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize); diff --git a/paddle/py_paddle/__init__.py b/paddle/py_paddle/__init__.py index f372068942ea36a05c1433b482731bf112bfa51e..f8399f9c63d81f5a52bf2b277789c26d809f0153 100644 --- a/paddle/py_paddle/__init__.py +++ b/paddle/py_paddle/__init__.py @@ -15,9 +15,10 @@ from util import DataProviderWrapperConverter from dataprovider_converter import DataProviderConverter -__all__ = ['paddle', - 'DataProviderConverter', - 'DataProviderWrapperConverter', # for deprecated usage. - 'loadParameterFile'] +__all__ = [ + 'paddle', + 'DataProviderConverter', + 'DataProviderWrapperConverter', # for deprecated usage. + 'loadParameterFile' +] util.monkeypatches() - diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index 0366bb636c704a83f09a15a65a1bb721060e45d1..d64c7b20cb65a4b8dfebfc516cfc2c3fdc247114 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -45,10 +45,8 @@ class DenseScanner(IScanner): def finish_scan(self, argument): assert isinstance(argument, swig_paddle.Arguments) assert isinstance(self.input_type, dp2.InputType) - m = swig_paddle.Matrix.createDense(self.__mat__, - self.__height__, - self.input_type.dim, - False) + m = swig_paddle.Matrix.createDense(self.__mat__, self.__height__, + self.input_type.dim, False) argument.setSlotValue(self.pos, m) @@ -63,7 +61,8 @@ class SparseBinaryScanner(IScanner): def scan(self, dat): self.extend_cols(dat) - self.__rows__.append(len(dat)) + self.__rows__.append(len(self.__cols__)) + self.__height__ += 1 def extend_cols(self, dat): self.__cols__.extend(dat) @@ -140,8 +139,10 @@ class DataProviderConverter(object): assert isinstance(argument, swig_paddle.Arguments) argument.resize(len(self.input_types)) - scanners = [DataProviderConverter.create_scanner(i, each_type) - for i, each_type in enumerate(self.input_types)] + scanners = [ + DataProviderConverter.create_scanner(i, each_type) + for i, each_type in enumerate(self.input_types) + ] for each_sample in dat: for each_step, scanner in zip(each_sample, scanners): @@ -170,11 +171,14 @@ class DataProviderConverter(object): assert retv is not None if each.seq_type == dp2.SequenceType.SUB_SEQUENCE: - retv = SequenceScanner(each, i, retv, lambda a, p, seq: - a.setSlotSubSequenceStartPositions(p, seq)) - - if each.seq_type in [dp2.SequenceType.SUB_SEQUENCE, - dp2.SequenceType.SEQUENCE]: - retv = SequenceScanner(each, i, retv, lambda a, p, 
seq: - a.setSlotSequenceStartPositions(p, seq)) + retv = SequenceScanner( + each, i, retv, + lambda a, p, seq: a.setSlotSubSequenceStartPositions(p, seq)) + + if each.seq_type in [ + dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE + ]: + retv = SequenceScanner( + each, i, retv, + lambda a, p, seq: a.setSlotSequenceStartPositions(p, seq)) return retv diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py index e6cf2710ef523fa494ddfb0917dbf35ecb49d685..e1f310580f95cfb210ba89589bab668433818b23 100644 --- a/paddle/py_paddle/util.py +++ b/paddle/py_paddle/util.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ Some Useful method for py_paddle. """ @@ -80,6 +79,19 @@ class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback): return __ParameterCallbackWrapper__(callback).__disown__() +def __arguments_to_numpy__(i, arg): + assert isinstance(arg, swig_paddle.Arguments) + value = arg.getSlotValue(i) + ids = arg.getSlotIds(i) + if value is not None: + assert isinstance(value, swig_paddle.Matrix) + value = value.copyToNumpyMat() + if ids is not None: + assert isinstance(ids, swig_paddle.IVector) + ids = ids.copyToNumpyArray() + return {"value": value, "id": ids} + + def __monkeypatch_gradient_machine__(): """ Add some class methods to GradientMachine. @@ -88,21 +100,6 @@ def __monkeypatch_gradient_machine__(): swig_paddle.GradientMachine.loadFromConfigFile = \ staticmethod(loadGradientMachine) - def __arguments_to_numpy__(i, arg): - assert isinstance(arg, swig_paddle.Arguments) - value = arg.getSlotValue(i) - if value is not None: - assert isinstance(value, swig_paddle.Matrix) - value = value.copyToNumpyMat() - ids = arg.getSlotIds(i) - if ids is not None: - assert isinstance(ids, swig_paddle.IVector) - ids = ids.copyToNumpyArray() - return { - "value": value, - "id": ids - } - def __matrix_to_numpy__(m): if isinstance(m, swig_paddle.Matrix): return m.copyToNumpyMat() @@ -113,9 +110,11 @@ def __monkeypatch_gradient_machine__(): def createFromConfigProto(protoObj, createMode=swig_paddle.CREATE_MODE_NORMAL, - paramTypes=[swig_paddle.PARAMETER_VALUE, - swig_paddle.PARAMETER_GRADIENT, - swig_paddle.PARAMETER_MOMENTUM]): + paramTypes=[ + swig_paddle.PARAMETER_VALUE, + swig_paddle.PARAMETER_GRADIENT, + swig_paddle.PARAMETER_MOMENTUM + ]): """ Create Gradient Machine From Proto object. 
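The SparseBinaryScanner fix earlier in this patch is CSR (compressed sparse row) bookkeeping: __rows__ must hold the cumulative offset into __cols__ at which each row ends, so appending len(dat) (the row's own length) instead of len(self.__cols__) produced broken row boundaries, and the height counter also has to advance once per sample. A standalone illustration of the corrected behaviour (assuming, as in CSR, that the offset list starts at 0):

    rows, cols = [0], []

    def scan(dat):
        # dat is the list of active column ids for one sample.
        cols.extend(dat)
        rows.append(len(cols))   # cumulative offset: the fixed behaviour
        # rows.append(len(dat))  # the old bug: per-row length, not an offset

    for sample in ([1, 4], [2], [0, 3, 5]):
        scan(sample)

    assert rows == [0, 2, 3, 6]   # row i spans cols[rows[i]:rows[i+1]]
    assert cols == [1, 4, 2, 0, 3, 5]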
:param protoObj: Model config @@ -126,7 +125,7 @@ def __monkeypatch_gradient_machine__(): :type paramTypes: list of int :return: paddle.GradientMachine """ - assert isinstance(protoObj, paddle.proto.ModelConfig_pb2.ModelConfig) + assert isinstance(protoObj, paddle.proto.ModelConfig) return swig_paddle.GradientMachine.createByConfigProtoStr( protoObj.SerializeToString(), createMode, paramTypes) @@ -145,8 +144,10 @@ def __monkeypatch_gradient_machine__(): """ outArgs = swig_paddle.Arguments.createArguments(0) self.forward(inArgs, outArgs, swig_paddle.PASS_TEST) - return [__arguments_to_numpy__(i, outArgs) for i in xrange( - outArgs.getSlotNum())] + return [ + __arguments_to_numpy__(i, outArgs) + for i in xrange(outArgs.getSlotNum()) + ] swig_paddle.GradientMachine.forwardTest = forwardTest @@ -167,7 +168,10 @@ def __monkeypatch_gradient_machine__(): swig_paddle.GradientMachine.__forwardBackward__ = \ swig_paddle.GradientMachine.forwardBackward - def forwardBackward(self, inArgs, outArgs, passType, + def forwardBackward(self, + inArgs, + outArgs, + passType, callback=swig_paddle.UpdateCallback()): """ GradientMachine forward backward. @@ -315,9 +319,8 @@ class DataProviderWrapperConverter(object): self.cols += other def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createSparse(len(self.indices) - 1, - self.dim, - len(self.cols), True) + mat = swig_paddle.Matrix.createSparse( + len(self.indices) - 1, self.dim, len(self.cols), True) assert isinstance(mat, swig_paddle.Matrix) mat.sparseCopyFrom(self.indices, self.cols) self.putIntoArg(slot_idx, arg, mat) @@ -341,9 +344,8 @@ class DataProviderWrapperConverter(object): self.values += map(lambda x: x[1], other) def __call__(self, slot_idx, arg): - mat = swig_paddle.Matrix.createSparse(len(self.indices) - 1, - self.dim, - len(self.cols), False) + mat = swig_paddle.Matrix.createSparse( + len(self.indices) - 1, self.dim, len(self.cols), False) assert isinstance(mat, swig_paddle.Matrix) mat.sparseCopyFrom(self.indices, self.cols, self.values) self.putIntoArg(slot_idx, arg, mat) @@ -352,8 +354,9 @@ class DataProviderWrapperConverter(object): paddle.trainer.PyDataProviderWrapper.DenseSlot: DenseValueConverter, paddle.trainer.PyDataProviderWrapper.IndexSlot: IdValueConverter, paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot: - SparseNonValueConverter, - paddle.trainer.PyDataProviderWrapper.SparseValueSlot: SparseValueConverter + SparseNonValueConverter, + paddle.trainer.PyDataProviderWrapper.SparseValueSlot: + SparseValueConverter } def __init__(self, use_seq, header): @@ -381,10 +384,9 @@ class DataProviderWrapperConverter(object): assert isinstance(argument, swig_paddle.Arguments) argument.resize(len(self.__header__)) - values = map(lambda x: - DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[ - x.__class__](x), - self.__header__) + values = map( + lambda x: DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[x.__class__](x), + self.__header__) if self.__use_seq__: seq_dim = [[] for _ in xrange(self.__header__.__len__())] @@ -394,14 +396,13 @@ class DataProviderWrapperConverter(object): for slot_idx, sequence in enumerate(each_sample): for raw_data in sequence: values[slot_idx].append(raw_data) - seq_start_pos[slot_idx].append( - seq_start_pos[slot_idx][-1] + len(sequence)) + seq_start_pos[slot_idx].append(seq_start_pos[slot_idx][-1] + + len(sequence)) seq_dim[slot_idx].append(len(sequence)) for slot_idx in xrange(len(self.__header__)): - argument.setSlotSequenceDim(slot_idx, - swig_paddle.IVector.create( - 
seq_dim[slot_idx])) + argument.setSlotSequenceDim( + slot_idx, swig_paddle.IVector.create(seq_dim[slot_idx])) argument.setSlotSequenceStartPositions( slot_idx, swig_paddle.IVector.create(seq_start_pos[slot_idx])) @@ -422,7 +423,6 @@ class DataProviderWrapperConverter(object): return self.convert(wrapper_data, argument) - def __monkey_patch_protobuf_objects__(): def ParameterConfig_toProto(self): """ @@ -459,14 +459,28 @@ def __monkey_patch_protobuf_objects__(): :return: paddle.OptimizationConfig """ - assert isinstance(protoObj, - paddle.proto.TrainerConfig_pb2.OptimizationConfig) + assert isinstance(protoObj, paddle.proto.OptimizationConfig) return swig_paddle.OptimizationConfig.createFromProtoString( protoObj.SerializeToString()) swig_paddle.OptimizationConfig.createFromProto = staticmethod( OptimizationConfig_createFromProto) + def TrainerConfig_createFromProto(protoObj): + """ + Create a new paddle.TrainerConfig from + proto.OptimizationConfig + + :param protoObj: proto.TrainerConfig + :return: paddle.TrainerConfig + """ + assert isinstance(protoObj, paddle.proto.TrainerConfig) + return swig_paddle.TrainerConfig.createFromProtoString( + protoObj.SerializeToString()) + + swig_paddle.TrainerConfig.createFromProto = staticmethod( + TrainerConfig_createFromProto) + def __monkey_patch_parameter__(): def getBufs(self): @@ -483,9 +497,72 @@ def __monkey_patch_parameter__(): swig_paddle.Parameter.getBufs = getBufs +def __monkey_patch_trainer__(): + swig_paddle.Trainer.__create__ = staticmethod(swig_paddle.Trainer.create) + + def Trainer_create(config, model=None): + """ + Create a trainer for model with TrainerCOnfig trainer_config + trainer_config.model_config will be ignored when model is supplied. + Trainer.trainOneBatch() and Trainer.forwardOneBatch() can be used only + when trainer_config.data_config is set. + + A typical usage for Trainer is: + .. code-block:: python + trainer = Trainer.create(trainer_config, model) + for p in xrange(num_passes) + while True: + data = get_next_batch(batch_size) + if not data: + break + trainer.trainOneDataBatch(batch_size, data) + trainer.finishTrainPass() + trainer.finishTrain() + + The trainer will take care of logging, model saving, distributed + training, etc. + + :param config: trainer configuration + :type config: paddle.proto.TrainerConfig + :param model: the model to be trained + :type model: swig_paddle.GradientMachine + :return: a trainer + :rtype swig_paddle.Trainer + + """ + assert isinstance(config, paddle.proto.TrainerConfig) + if model is not None: + assert isinstance(model, swig_paddle.GradientMachine) + return swig_paddle.Trainer.__create__( + swig_paddle.TrainerConfig.createFromProto(config), model) + + swig_paddle.Trainer.create = staticmethod(Trainer_create) + + swig_paddle.Trainer.__getForwardOutput__ = \ + swig_paddle.Trainer.getForwardOutput + + def getForwardOutput(self): + """ + Get the netword outputs from the previous trainOneBatch(), + trainOneDataBatch(), testOneDataPatch(), or forwardOneBatch() call. + + :return: list of dictionary with keys ['id', 'value'], each value is a + numpy.ndarray. 
+ """ + outArgs = self.__getForwardOutput__() + return [ + __arguments_to_numpy__(i, outArgs) + for i in xrange(outArgs.getSlotNum()) + ] + + swig_paddle.Trainer.getForwardOutput = getForwardOutput + + def monkeypatches(): - patches = [__monkeypatch_init_paddle__, __monkeypatch_gradient_machine__, - __monkey_patch_protobuf_objects__, - __monkey_patch_parameter__] + patches = [ + __monkeypatch_init_paddle__, __monkeypatch_gradient_machine__, + __monkey_patch_protobuf_objects__, __monkey_patch_parameter__, + __monkey_patch_trainer__ + ] for patch in patches: patch() diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt index dee46055c5a4db8c77b4248de70573c02a60e631..1bae396a18688cd53e164774df07660ccc2451d7 100644 --- a/paddle/scripts/CMakeLists.txt +++ b/paddle/scripts/CMakeLists.txt @@ -6,4 +6,4 @@ configure_file(submit_local.sh.in install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ - RENAME paddle) \ No newline at end of file + RENAME paddle) diff --git a/paddle/scripts/cluster_train/conf.py b/paddle/scripts/cluster_train/conf.py index c8fd360e7552ed7c0f11aaa06574a11344c44aba..f1114a59201b9e57a14b739a327b622327c515f7 100644 --- a/paddle/scripts/cluster_train/conf.py +++ b/paddle/scripts/cluster_train/conf.py @@ -13,17 +13,14 @@ # limitations under the License. HOSTS = [ - "root@192.168.100.17", - "root@192.168.100.18", - ] - + "root@192.168.100.17", + "root@192.168.100.18", +] ''' workspace configuration ''' #root dir for workspace, can be set as any director with real user account ROOT_DIR = "/home/paddle" - - ''' network configuration ''' @@ -37,4 +34,4 @@ PADDLE_PORTS_NUM = 2 PADDLE_PORTS_NUM_FOR_SPARSE = 2 #environments setting for all processes in cluster job -LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64" +LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64" diff --git a/paddle/scripts/cluster_train/paddle.py b/paddle/scripts/cluster_train/paddle.py index 79698c72e619fa48c42d91d41abab61e2a5902ee..7343a600c1bf5522ac8b0cd90a38f8a362ba7ae6 100644 --- a/paddle/scripts/cluster_train/paddle.py +++ b/paddle/scripts/cluster_train/paddle.py @@ -12,8 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - """ module for launching cluster job """ import os @@ -23,13 +21,13 @@ import copy import time import signal - from fabric.api import run, put, settings, env, prefix from fabric.tasks import execute #configuration for cluster import conf + def refine_unknown_args(cmd_args): ''' refine unknown parameters to handle some special parameters @@ -37,7 +35,7 @@ def refine_unknown_args(cmd_args): new_args = [] for arg in cmd_args: if arg.startswith("--") and arg.find("=") != -1: - equal_pos = arg.find("=") #find first = pos + equal_pos = arg.find("=") #find first = pos arglist = list(arg) arglist[equal_pos] = " " arg = "".join(arglist) @@ -50,6 +48,7 @@ def refine_unknown_args(cmd_args): new_args.append(arg) return new_args + def kill_process(): ''' kill comments threads @@ -60,6 +59,7 @@ def kill_process(): | awk '{print $2}' \ | xargs kill > /dev/null 2>&1") + def job_prepare(jobdir, data=None): ''' prepare job related workspace data @@ -70,6 +70,7 @@ def job_prepare(jobdir, data=None): This function just prepare all related model and other resources needed at runtime. 
''' + def job_create_workspace(jobdir, data=None): ''' prepare job workspace, common file, etc. @@ -94,7 +95,8 @@ def job_prepare(jobdir, data=None): execute(set_nodefile, i, hosts=conf.HOSTS[i]) #clean rubbish caused by exception with settings(warn_only=True): - execute(kill_process, hosts=conf.HOSTS) + execute(kill_process, hosts=conf.HOSTS) + def job_pserver(jobdir, pids=None): ''' @@ -124,9 +126,8 @@ def job_pserver(jobdir, pids=None): execute(start_pserver, jobdir, pargs, hosts=conf.HOSTS) -def job_trainer(jobdir, - train_args_dict, - pids=None): + +def job_trainer(jobdir, train_args_dict, pids=None): ''' start paddle trainer ''' @@ -171,9 +172,8 @@ def job_trainer(jobdir, train_args += " --trainer_id=" + str(i) execute(start_trainer, jobdir, train_args, hosts=conf.HOSTS[i]) -def job_all(job_package, - jobdir=None, - train_args_dict=None): + +def job_all(job_package, jobdir=None, train_args_dict=None): ''' param job_package param train_args_dict @@ -183,41 +183,52 @@ def job_all(job_package, jobdir = conf.ROOT_DIR + "/JOB" + timestamp job_prepare(jobdir, job_package) job_pserver(jobdir) - time.sleep(5) #wait until pservers completely start + time.sleep(5) #wait until pservers completely start job_trainer(jobdir, train_args_dict) job_clean() + def job_clean(): ''' if starting job failed from paddle internal, the framework always is launched successfully since these process are daemon processes. so this job_clean can alway clean job rubbish process with ctrl+c. ''' + def signal_handler(signal, frame): ''' SIGINT handler ''' + def kill_process(): - run("ps aux \ + run("ps aux \ | grep paddle_process_by_paddle \ | grep -v grep \ | awk '{print $2}' \ | xargs kill > /dev/null 2>&1") + with settings(warn_only=True): - execute(kill_process, hosts=conf.HOSTS) + execute(kill_process, hosts=conf.HOSTS) signal.signal(signal.SIGINT, signal_handler) signal.pause() + if __name__ == '__main__': - parser = argparse.ArgumentParser(prog="paddle.py", - description='simple tool for cluster training') - parser.add_argument('-j', '--job_workspace', - required=False, default=None, - help='job workspace') - parser.add_argument('-p', '--job_dispatch_package', - required=False, default=None, - help='job package for dispatching to all other nodes') + parser = argparse.ArgumentParser( + prog="paddle.py", description='simple tool for cluster training') + parser.add_argument( + '-j', + '--job_workspace', + required=False, + default=None, + help='job workspace') + parser.add_argument( + '-p', + '--job_dispatch_package', + required=False, + default=None, + help='job package for dispatching to all other nodes') args, train_args_list = parser.parse_known_args() train_args = refine_unknown_args(train_args_list) @@ -227,14 +238,10 @@ if __name__ == '__main__': #if assigned workspace, do not need to dispatch data, #so job_local_package should be None assert args.job_dispatch_package is None - job_all(None, - args.job_workspace, - train_args_dict) + job_all(None, args.job_workspace, train_args_dict) elif args.job_dispatch_package is not None: assert args.job_workspace is None assert os.path.isdir(args.job_dispatch_package) - job_all(args.job_dispatch_package, - None, - train_args_dict) + job_all(args.job_dispatch_package, None, train_args_dict) else: print "--job_workspace or --job_dispatch_package should be set" diff --git a/paddle/scripts/cpplint.py b/paddle/scripts/cpplint.py index 5e905b865fc5167a2bddcf4ca0ab8313d17af4a3..157ce7b44ac3cfe3a8ca5eda78e959cf7be4cc5b 100644 --- a/paddle/scripts/cpplint.py +++ 
b/paddle/scripts/cpplint.py @@ -27,7 +27,6 @@ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - """Does google-lint on c++ files. The goal of this script is to identify places in the code that *may* @@ -55,7 +54,6 @@ import string import sys import unicodedata - _USAGE = """ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] [--counting=total|toplevel|detailed] [--root=subdir] @@ -242,13 +240,11 @@ _ERROR_CATEGORIES = [ 'whitespace/semicolon', 'whitespace/tab', 'whitespace/todo', - ] +] # These error categories are no longer enforced by cpplint, but for backwards- # compatibility they may still appear in NOLINT comments. -_LEGACY_ERROR_CATEGORIES = [ - 'readability/streams', - ] +_LEGACY_ERROR_CATEGORIES = ['readability/streams', ] # The default state of the category filter. This is overridden by the --filter= # flag. By default all errors are on, so only add here categories that should be @@ -394,8 +390,7 @@ _CPP_HEADERS = frozenset([ 'cuchar', 'cwchar', 'cwctype', - ]) - +]) # These headers are excluded from [build/include] and [build/include_order] # checks: @@ -405,38 +400,40 @@ _CPP_HEADERS = frozenset([ _THIRD_PARTY_HEADERS_PATTERN = re.compile( r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$') - # Assertion macros. These are defined in base/logging.h and # testing/base/gunit.h. Note that the _M versions need to come first # for substring matching to work. _CHECK_MACROS = [ - 'DCHECK', 'CHECK', - 'EXPECT_TRUE_M', 'EXPECT_TRUE', - 'ASSERT_TRUE_M', 'ASSERT_TRUE', - 'EXPECT_FALSE_M', 'EXPECT_FALSE', - 'ASSERT_FALSE_M', 'ASSERT_FALSE', - ] + 'DCHECK', + 'CHECK', + 'EXPECT_TRUE_M', + 'EXPECT_TRUE', + 'ASSERT_TRUE_M', + 'ASSERT_TRUE', + 'EXPECT_FALSE_M', + 'EXPECT_FALSE', + 'ASSERT_FALSE_M', + 'ASSERT_FALSE', +] # Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE _CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS]) -for op, replacement in [('==', 'EQ'), ('!=', 'NE'), - ('>=', 'GE'), ('>', 'GT'), +for op, replacement in [('==', 'EQ'), ('!=', 'NE'), ('>=', 'GE'), ('>', 'GT'), ('<=', 'LE'), ('<', 'LT')]: - _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement - _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement - -for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), - ('>=', 'LT'), ('>', 'LE'), - ('<=', 'GT'), ('<', 'GE')]: - _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement - _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement + _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement + +for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), ('>=', 'LT'), + ('>', 
'LE'), ('<=', 'GT'), ('<', 'GE')]: + _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement + _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement # Alternative tokens and their replacements. For full list, see section 2.5 # Alternative tokens [lex.digraph] in the C++ standard. @@ -455,16 +452,15 @@ _ALT_TOKEN_REPLACEMENT = { 'xor_eq': '^=', 'not': '!', 'not_eq': '!=' - } +} # Compile regular expression that matches all the above keywords. The "[ =()]" # bit is meant to avoid matching these keywords outside of boolean expressions. # # False positives include C-style multi-line comments and multi-line strings # but those have always been troublesome for cpplint. -_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( - r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') - +_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(r'[ =()](' + ('|'.join( + _ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') # These constants define types of headers for use with # _IncludeState.CheckNextIncludeOrder(). @@ -475,17 +471,16 @@ _POSSIBLE_MY_HEADER = 4 _OTHER_HEADER = 5 # These constants define the current inline assembly state -_NO_ASM = 0 # Outside of inline assembly block -_INSIDE_ASM = 1 # Inside inline assembly block -_END_ASM = 2 # Last line of inline assembly block -_BLOCK_ASM = 3 # The whole block is an inline assembly block +_NO_ASM = 0 # Outside of inline assembly block +_INSIDE_ASM = 1 # Inside inline assembly block +_END_ASM = 2 # Last line of inline assembly block +_BLOCK_ASM = 3 # The whole block is an inline assembly block # Match start of assembly blocks _MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' r'(?:\s+(volatile|__volatile__))?' r'\s*[{(]') - _regexp_compile_cache = {} # {str, set(int)}: a map from error categories to sets of linenumbers @@ -504,8 +499,9 @@ _line_length = 80 # This is set by --extensions flag. _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) + def ParseNolintSuppressions(filename, raw_line, linenum, error): - """Updates the global list of error-suppressions. + """Updates the global list of error-suppressions. Parses any NOLINT comments on the current line, updating the global error_suppressions store. Reports an error if the NOLINT comment @@ -517,45 +513,47 @@ def ParseNolintSuppressions(filename, raw_line, linenum, error): linenum: int, the number of the current line. error: function, an error handler. 
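ParseNolintSuppressions, whose reindented body follows, recognizes four comment forms through a single regex: bare NOLINT and NOLINT(category) suppress on the current line, NOLINTNEXTLINE(...) on the next line, and NOLINTNEXTLINES_<n> on the next n lines (group 2 carries the S_<n> suffix that the code slices with lines[2:]). A quick demonstration of what the capture groups yield:

    import re

    nolint = re.compile(r'\bNOLINT(NEXTLINE(S_\d+)?)?\b(\([^)]+\))?')

    # Bare NOLINT: no category, suppress everything on this line.
    assert nolint.search('int a; // NOLINT').group(3) is None
    # Category in group 3, parentheses included.
    assert nolint.search('// NOLINT(whitespace/tab)').group(3) == '(whitespace/tab)'
    # NOLINTNEXTLINES_3 covers the next three lines.
    m = nolint.search('// NOLINTNEXTLINES_3')
    assert m.group(1) == 'NEXTLINES_3' and m.group(2) == 'S_3'
    assert int(m.group(2)[2:]) == 3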
""" - matched = Search(r'\bNOLINT(NEXTLINE(S_\d+)?)?\b(\([^)]+\))?', raw_line) - if matched: - if matched.group(1): - lines = matched.group(2) - if lines : - lines=int(lines[2:]) - suppressed_line = [ linenum + i for i in xrange(lines) ] - else: - suppressed_line = linenum + 1 - else: - suppressed_line = linenum - category = matched.group(3) - if category in (None, '(*)'): # => "suppress all" - if isinstance(suppressed_line, int): - _error_suppressions.setdefault(None, set()).add(suppressed_line) - else: - for _line in suppressed_line: - _error_suppressions.setdefault(None, set()).add(_line) - else: - if category.startswith('(') and category.endswith(')'): - category = category[1:-1] - if category in _ERROR_CATEGORIES: - if isinstance(suppressed_line, int): - _error_suppressions.setdefault(category, set()).add(suppressed_line) - else: - for _line in suppressed_line: - _error_suppressions.setdefault(category, set()).add(_line) - elif category not in _LEGACY_ERROR_CATEGORIES: - error(filename, linenum, 'readability/nolint', 5, - 'Unknown NOLINT error category: %s' % category) + matched = Search(r'\bNOLINT(NEXTLINE(S_\d+)?)?\b(\([^)]+\))?', raw_line) + if matched: + if matched.group(1): + lines = matched.group(2) + if lines: + lines = int(lines[2:]) + suppressed_line = [linenum + i for i in xrange(lines)] + else: + suppressed_line = linenum + 1 + else: + suppressed_line = linenum + category = matched.group(3) + if category in (None, '(*)'): # => "suppress all" + if isinstance(suppressed_line, int): + _error_suppressions.setdefault(None, set()).add(suppressed_line) + else: + for _line in suppressed_line: + _error_suppressions.setdefault(None, set()).add(_line) + else: + if category.startswith('(') and category.endswith(')'): + category = category[1:-1] + if category in _ERROR_CATEGORIES: + if isinstance(suppressed_line, int): + _error_suppressions.setdefault( + category, set()).add(suppressed_line) + else: + for _line in suppressed_line: + _error_suppressions.setdefault(category, + set()).add(_line) + elif category not in _LEGACY_ERROR_CATEGORIES: + error(filename, linenum, 'readability/nolint', 5, + 'Unknown NOLINT error category: %s' % category) def ResetNolintSuppressions(): - """Resets the set of NOLINT suppressions to empty.""" - _error_suppressions.clear() + """Resets the set of NOLINT suppressions to empty.""" + _error_suppressions.clear() def IsErrorSuppressedByNolint(category, linenum): - """Returns true if the specified error category is suppressed on this line. + """Returns true if the specified error category is suppressed on this line. Consults the global error_suppressions map populated by ParseNolintSuppressions/ResetNolintSuppressions. @@ -566,22 +564,22 @@ def IsErrorSuppressedByNolint(category, linenum): Returns: bool, True iff the error should be suppressed due to a NOLINT comment. """ - return (linenum in _error_suppressions.get(category, set()) or - linenum in _error_suppressions.get(None, set())) + return (linenum in _error_suppressions.get(category, set()) or + linenum in _error_suppressions.get(None, set())) def Match(pattern, s): - """Matches the string with the pattern, caching the compiled regexp.""" - # The regexp compilation caching is inlined in both Match and Search for - # performance reasons; factoring it out into a separate function turns out - # to be noticeably expensive. 
- if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].match(s) + """Matches the string with the pattern, caching the compiled regexp.""" + # The regexp compilation caching is inlined in both Match and Search for + # performance reasons; factoring it out into a separate function turns out + # to be noticeably expensive. + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].match(s) def ReplaceAll(pattern, rep, s): - """Replaces instances of pattern in a string with a replacement. + """Replaces instances of pattern in a string with a replacement. The compiled regex is kept in a cache shared by Match and Search. @@ -593,20 +591,20 @@ def ReplaceAll(pattern, rep, s): Returns: string with replacements made (or original string if no replacements) """ - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].sub(rep, s) + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].sub(rep, s) def Search(pattern, s): - """Searches the string for the pattern, caching the compiled regexp.""" - if pattern not in _regexp_compile_cache: - _regexp_compile_cache[pattern] = sre_compile.compile(pattern) - return _regexp_compile_cache[pattern].search(s) + """Searches the string for the pattern, caching the compiled regexp.""" + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].search(s) class _IncludeState(object): - """Tracks line numbers for includes, and the order in which includes appear. + """Tracks line numbers for includes, and the order in which includes appear. include_list contains list of lists of (header, line number) pairs. It's a lists of lists rather than just one flat list to make it @@ -617,35 +615,35 @@ class _IncludeState(object): raise an _IncludeError with an appropriate error message. """ - # self._section will move monotonically through this set. If it ever - # needs to move backwards, CheckNextIncludeOrder will raise an error. - _INITIAL_SECTION = 0 - _MY_H_SECTION = 1 - _C_SECTION = 2 - _CPP_SECTION = 3 - _OTHER_H_SECTION = 4 - - _TYPE_NAMES = { - _C_SYS_HEADER: 'C system header', - _CPP_SYS_HEADER: 'C++ system header', - _LIKELY_MY_HEADER: 'header this file implements', - _POSSIBLE_MY_HEADER: 'header this file may implement', - _OTHER_HEADER: 'other header', - } - _SECTION_NAMES = { - _INITIAL_SECTION: "... nothing. (This can't be an error.)", - _MY_H_SECTION: 'a header this file implements', - _C_SECTION: 'C system header', - _CPP_SECTION: 'C++ system header', - _OTHER_H_SECTION: 'other header', - } - - def __init__(self): - self.include_list = [[]] - self.ResetSection('') - - def FindHeader(self, header): - """Check if a header has already been included. + # self._section will move monotonically through this set. If it ever + # needs to move backwards, CheckNextIncludeOrder will raise an error. 
+ _INITIAL_SECTION = 0 + _MY_H_SECTION = 1 + _C_SECTION = 2 + _CPP_SECTION = 3 + _OTHER_H_SECTION = 4 + + _TYPE_NAMES = { + _C_SYS_HEADER: 'C system header', + _CPP_SYS_HEADER: 'C++ system header', + _LIKELY_MY_HEADER: 'header this file implements', + _POSSIBLE_MY_HEADER: 'header this file may implement', + _OTHER_HEADER: 'other header', + } + _SECTION_NAMES = { + _INITIAL_SECTION: "... nothing. (This can't be an error.)", + _MY_H_SECTION: 'a header this file implements', + _C_SECTION: 'C system header', + _CPP_SECTION: 'C++ system header', + _OTHER_H_SECTION: 'other header', + } + + def __init__(self): + self.include_list = [[]] + self.ResetSection('') + + def FindHeader(self, header): + """Check if a header has already been included. Args: header: header to check. @@ -653,35 +651,35 @@ class _IncludeState(object): Line number of previous occurrence, or -1 if the header has not been seen before. """ - for section_list in self.include_list: - for f in section_list: - if f[0] == header: - return f[1] - return -1 + for section_list in self.include_list: + for f in section_list: + if f[0] == header: + return f[1] + return -1 - def ResetSection(self, directive): - """Reset section checking for preprocessor directive. + def ResetSection(self, directive): + """Reset section checking for preprocessor directive. Args: directive: preprocessor directive (e.g. "if", "else"). """ - # The name of the current section. - self._section = self._INITIAL_SECTION - # The path of last found header. - self._last_header = '' + # The name of the current section. + self._section = self._INITIAL_SECTION + # The path of last found header. + self._last_header = '' - # Update list of includes. Note that we never pop from the - # include list. - if directive in ('if', 'ifdef', 'ifndef'): - self.include_list.append([]) - elif directive in ('else', 'elif'): - self.include_list[-1] = [] + # Update list of includes. Note that we never pop from the + # include list. + if directive in ('if', 'ifdef', 'ifndef'): + self.include_list.append([]) + elif directive in ('else', 'elif'): + self.include_list[-1] = [] - def SetLastHeader(self, header_path): - self._last_header = header_path + def SetLastHeader(self, header_path): + self._last_header = header_path - def CanonicalizeAlphabeticalOrder(self, header_path): - """Returns a path canonicalized for alphabetical comparison. + def CanonicalizeAlphabeticalOrder(self, header_path): + """Returns a path canonicalized for alphabetical comparison. - replaces "-" with "_" so they both cmp the same. - removes '-inl' since we don't require them to be after the main header. @@ -693,10 +691,10 @@ class _IncludeState(object): Returns: Canonicalized path. """ - return header_path.replace('-inl.h', '.h').replace('-', '_').lower() + return header_path.replace('-inl.h', '.h').replace('-', '_').lower() - def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): - """Check if a header is in alphabetical order with the previous header. + def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): + """Check if a header is in alphabetical order with the previous header. Args: clean_lines: A CleansedLines instance containing the file. @@ -706,18 +704,18 @@ class _IncludeState(object): Returns: Returns true if the header is in alphabetical order. """ - # If previous section is different from current section, _last_header will - # be reset to empty string, so it's always less than current header. 
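CanonicalizeAlphabeticalOrder, reformatted in the hunk above, reduces a header path to a comparison key under which '-inl.h' headers sort with their main header and '-' compares equal to '_'. A quick sketch of that mapping, with made-up inputs:

    def canonicalize(header_path):
        # '-inl.h' collapses onto the main header; '-' and '_' compare equal.
        return header_path.replace('-inl.h', '.h').replace('-', '_').lower()

    print(canonicalize('Foo-Bar-inl.h'))  # foo_bar.h
    print(canonicalize('foo_bar.h'))      # foo_bar.h
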
- # - # If previous line was a blank line, assume that the headers are - # intentionally sorted the way they are. - if (self._last_header > header_path and - Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])): - return False - return True + # If previous section is different from current section, _last_header will + # be reset to empty string, so it's always less than current header. + # + # If previous line was a blank line, assume that the headers are + # intentionally sorted the way they are. + if (self._last_header > header_path and + Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])): + return False + return True - def CheckNextIncludeOrder(self, header_type): - """Returns a non-empty error message if the next header is out of order. + def CheckNextIncludeOrder(self, header_type): + """Returns a non-empty error message if the next header is out of order. This function also updates the internal state to be ready to check the next include. @@ -730,80 +728,79 @@ class _IncludeState(object): error message describing what's wrong. """ - error_message = ('Found %s after %s' % - (self._TYPE_NAMES[header_type], - self._SECTION_NAMES[self._section])) - - last_section = self._section - - if header_type == _C_SYS_HEADER: - if self._section <= self._C_SECTION: - self._section = self._C_SECTION - else: - self._last_header = '' - return error_message - elif header_type == _CPP_SYS_HEADER: - if self._section <= self._CPP_SECTION: - self._section = self._CPP_SECTION - else: - self._last_header = '' - return error_message - elif header_type == _LIKELY_MY_HEADER: - if self._section <= self._MY_H_SECTION: - self._section = self._MY_H_SECTION - else: - self._section = self._OTHER_H_SECTION - elif header_type == _POSSIBLE_MY_HEADER: - if self._section <= self._MY_H_SECTION: - self._section = self._MY_H_SECTION - else: - # This will always be the fallback because we're not sure - # enough that the header is associated with this file. - self._section = self._OTHER_H_SECTION - else: - assert header_type == _OTHER_HEADER - self._section = self._OTHER_H_SECTION + error_message = ('Found %s after %s' % ( + self._TYPE_NAMES[header_type], self._SECTION_NAMES[self._section])) + + last_section = self._section + + if header_type == _C_SYS_HEADER: + if self._section <= self._C_SECTION: + self._section = self._C_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _CPP_SYS_HEADER: + if self._section <= self._CPP_SECTION: + self._section = self._CPP_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _LIKELY_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + self._section = self._OTHER_H_SECTION + elif header_type == _POSSIBLE_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + # This will always be the fallback because we're not sure + # enough that the header is associated with this file. + self._section = self._OTHER_H_SECTION + else: + assert header_type == _OTHER_HEADER + self._section = self._OTHER_H_SECTION - if last_section != self._section: - self._last_header = '' + if last_section != self._section: + self._last_header = '' - return '' + return '' class _CppLintState(object): - """Maintains module-wide state..""" - - def __init__(self): - self.verbose_level = 1 # global setting. 
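CheckNextIncludeOrder, re-indented just above, only ever moves self._section forward; a C system header seen after the C++ section is what produces "Found C system header after C++ system header". A stripped-down sketch of that monotonic rule (section constants abbreviated; the real method also resets _last_header and special-cases the file's own header):

    _C_SECTION, _CPP_SECTION = 2, 3

    def advance(section, target):
        # The section index may only increase; a smaller target is out of order.
        if target >= section:
            return target, ''
        return section, 'out of order'

    section, msg = advance(0, _CPP_SECTION)      # C++ system headers first
    section, msg = advance(section, _C_SECTION)  # then a C header: too late
    print(msg)  # out of order
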
- self.error_count = 0 # global count of reported errors - # filters to apply when emitting error messages - self.filters = _DEFAULT_FILTERS[:] - # backup of filter list. Used to restore the state after each file. - self._filters_backup = self.filters[:] - self.counting = 'total' # In what way are we counting errors? - self.errors_by_category = {} # string to int dict storing error counts - - # output format: - # "emacs" - format that emacs can parse (default) - # "vs7" - format that Microsoft Visual Studio 7 can parse - self.output_format = 'emacs' - - def SetOutputFormat(self, output_format): - """Sets the output format for errors.""" - self.output_format = output_format - - def SetVerboseLevel(self, level): - """Sets the module's verbosity, and returns the previous setting.""" - last_verbose_level = self.verbose_level - self.verbose_level = level - return last_verbose_level - - def SetCountingStyle(self, counting_style): - """Sets the module's counting options.""" - self.counting = counting_style - - def SetFilters(self, filters): - """Sets the error-message filters. + """Maintains module-wide state..""" + + def __init__(self): + self.verbose_level = 1 # global setting. + self.error_count = 0 # global count of reported errors + # filters to apply when emitting error messages + self.filters = _DEFAULT_FILTERS[:] + # backup of filter list. Used to restore the state after each file. + self._filters_backup = self.filters[:] + self.counting = 'total' # In what way are we counting errors? + self.errors_by_category = {} # string to int dict storing error counts + + # output format: + # "emacs" - format that emacs can parse (default) + # "vs7" - format that Microsoft Visual Studio 7 can parse + self.output_format = 'emacs' + + def SetOutputFormat(self, output_format): + """Sets the output format for errors.""" + self.output_format = output_format + + def SetVerboseLevel(self, level): + """Sets the module's verbosity, and returns the previous setting.""" + last_verbose_level = self.verbose_level + self.verbose_level = level + return last_verbose_level + + def SetCountingStyle(self, counting_style): + """Sets the module's counting options.""" + self.counting = counting_style + + def SetFilters(self, filters): + """Sets the error-message filters. These filters are applied when deciding whether to emit a given error message. @@ -816,86 +813,88 @@ class _CppLintState(object): ValueError: The comma-separated filters did not all start with '+' or '-'. E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" """ - # Default filters always have less priority than the flag ones. - self.filters = _DEFAULT_FILTERS[:] - self.AddFilters(filters) - - def AddFilters(self, filters): - """ Adds more filters to the existing list of error-message filters. """ - for filt in filters.split(','): - clean_filt = filt.strip() - if clean_filt: - self.filters.append(clean_filt) - for filt in self.filters: - if not (filt.startswith('+') or filt.startswith('-')): - raise ValueError('Every filter in --filters must start with + or -' - ' (%s does not)' % filt) - - def BackupFilters(self): - """ Saves the current filter list to backup storage.""" - self._filters_backup = self.filters[:] + # Default filters always have less priority than the flag ones. + self.filters = _DEFAULT_FILTERS[:] + self.AddFilters(filters) + + def AddFilters(self, filters): + """ Adds more filters to the existing list of error-message filters. 
""" + for filt in filters.split(','): + clean_filt = filt.strip() + if clean_filt: + self.filters.append(clean_filt) + for filt in self.filters: + if not (filt.startswith('+') or filt.startswith('-')): + raise ValueError( + 'Every filter in --filters must start with + or -' + ' (%s does not)' % filt) + + def BackupFilters(self): + """ Saves the current filter list to backup storage.""" + self._filters_backup = self.filters[:] + + def RestoreFilters(self): + """ Restores filters previously backed up.""" + self.filters = self._filters_backup[:] + + def ResetErrorCounts(self): + """Sets the module's error statistic back to zero.""" + self.error_count = 0 + self.errors_by_category = {} + + def IncrementErrorCount(self, category): + """Bumps the module's error statistic.""" + self.error_count += 1 + if self.counting in ('toplevel', 'detailed'): + if self.counting != 'detailed': + category = category.split('/')[0] + if category not in self.errors_by_category: + self.errors_by_category[category] = 0 + self.errors_by_category[category] += 1 + + def PrintErrorCounts(self): + """Print a summary of errors by category, and the total.""" + for category, count in self.errors_by_category.iteritems(): + sys.stdout.write('Category \'%s\' errors found: %d\n' % + (category, count)) + sys.stdout.write('Total errors found: %d\n' % self.error_count) - def RestoreFilters(self): - """ Restores filters previously backed up.""" - self.filters = self._filters_backup[:] - - def ResetErrorCounts(self): - """Sets the module's error statistic back to zero.""" - self.error_count = 0 - self.errors_by_category = {} - - def IncrementErrorCount(self, category): - """Bumps the module's error statistic.""" - self.error_count += 1 - if self.counting in ('toplevel', 'detailed'): - if self.counting != 'detailed': - category = category.split('/')[0] - if category not in self.errors_by_category: - self.errors_by_category[category] = 0 - self.errors_by_category[category] += 1 - - def PrintErrorCounts(self): - """Print a summary of errors by category, and the total.""" - for category, count in self.errors_by_category.iteritems(): - sys.stdout.write('Category \'%s\' errors found: %d\n' % - (category, count)) - sys.stdout.write('Total errors found: %d\n' % self.error_count) _cpplint_state = _CppLintState() def _OutputFormat(): - """Gets the module's output format.""" - return _cpplint_state.output_format + """Gets the module's output format.""" + return _cpplint_state.output_format def _SetOutputFormat(output_format): - """Sets the module's output format.""" - _cpplint_state.SetOutputFormat(output_format) + """Sets the module's output format.""" + _cpplint_state.SetOutputFormat(output_format) def _VerboseLevel(): - """Returns the module's verbosity setting.""" - return _cpplint_state.verbose_level + """Returns the module's verbosity setting.""" + return _cpplint_state.verbose_level def _SetVerboseLevel(level): - """Sets the module's verbosity, and returns the previous setting.""" - return _cpplint_state.SetVerboseLevel(level) + """Sets the module's verbosity, and returns the previous setting.""" + return _cpplint_state.SetVerboseLevel(level) def _SetCountingStyle(level): - """Sets the module's counting options.""" - _cpplint_state.SetCountingStyle(level) + """Sets the module's counting options.""" + _cpplint_state.SetCountingStyle(level) def _Filters(): - """Returns the module's list of output filters, as a list.""" - return _cpplint_state.filters + """Returns the module's list of output filters, as a list.""" + return 
_cpplint_state.filters def _SetFilters(filters): - """Sets the module's error-message filters. + """Sets the module's error-message filters. These filters are applied when deciding whether to emit a given error message. @@ -904,10 +903,11 @@ def _SetFilters(filters): filters: A string of comma-separated filters (eg "whitespace/indent"). Each filter should start with + or -; else we die. """ - _cpplint_state.SetFilters(filters) + _cpplint_state.SetFilters(filters) + def _AddFilters(filters): - """Adds more filter overrides. + """Adds more filter overrides. Unlike _SetFilters, this function does not reset the current list of filters available. @@ -916,93 +916,97 @@ def _AddFilters(filters): filters: A string of comma-separated filters (eg "whitespace/indent"). Each filter should start with + or -; else we die. """ - _cpplint_state.AddFilters(filters) + _cpplint_state.AddFilters(filters) + def _BackupFilters(): - """ Saves the current filter list to backup storage.""" - _cpplint_state.BackupFilters() + """ Saves the current filter list to backup storage.""" + _cpplint_state.BackupFilters() + def _RestoreFilters(): - """ Restores filters previously backed up.""" - _cpplint_state.RestoreFilters() + """ Restores filters previously backed up.""" + _cpplint_state.RestoreFilters() + class _FunctionState(object): - """Tracks current function name and the number of lines in its body.""" + """Tracks current function name and the number of lines in its body.""" - _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. - _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. + _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. + _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. - def __init__(self): - self.in_a_function = False - self.lines_in_function = 0 - self.current_function = '' + def __init__(self): + self.in_a_function = False + self.lines_in_function = 0 + self.current_function = '' - def Begin(self, function_name): - """Start analyzing function body. + def Begin(self, function_name): + """Start analyzing function body. Args: function_name: The name of the function being tracked. """ - self.in_a_function = True - self.lines_in_function = 0 - self.current_function = function_name + self.in_a_function = True + self.lines_in_function = 0 + self.current_function = function_name - def Count(self): - """Count line in current function body.""" - if self.in_a_function: - self.lines_in_function += 1 + def Count(self): + """Count line in current function body.""" + if self.in_a_function: + self.lines_in_function += 1 - def Check(self, error, filename, linenum): - """Report if too many lines in function body. + def Check(self, error, filename, linenum): + """Report if too many lines in function body. Args: error: The function to call with any errors found. filename: The name of the current file. linenum: The number of the line to check. """ - if Match(r'T(EST|est)', self.current_function): - base_trigger = self._TEST_TRIGGER - else: - base_trigger = self._NORMAL_TRIGGER - trigger = base_trigger * 2**_VerboseLevel() - - if self.lines_in_function > trigger: - error_level = int(math.log(self.lines_in_function / base_trigger, 2)) - # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... - if error_level > 5: - error_level = 5 - error(filename, linenum, 'readability/fn_size', error_level, - 'Small and focused functions are preferred:' - ' %s has %d non-comment lines' - ' (error triggered by exceeding %d lines).' 
% ( - self.current_function, self.lines_in_function, trigger)) - - def End(self): - """Stop analyzing function body.""" - self.in_a_function = False + if Match(r'T(EST|est)', self.current_function): + base_trigger = self._TEST_TRIGGER + else: + base_trigger = self._NORMAL_TRIGGER + trigger = base_trigger * 2**_VerboseLevel() + + if self.lines_in_function > trigger: + error_level = int( + math.log(self.lines_in_function / base_trigger, 2)) + # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... + if error_level > 5: + error_level = 5 + error(filename, linenum, 'readability/fn_size', error_level, + 'Small and focused functions are preferred:' + ' %s has %d non-comment lines' + ' (error triggered by exceeding %d lines).' % ( + self.current_function, self.lines_in_function, trigger)) + + def End(self): + """Stop analyzing function body.""" + self.in_a_function = False class _IncludeError(Exception): - """Indicates a problem with the include order in a file.""" - pass + """Indicates a problem with the include order in a file.""" + pass class FileInfo(object): - """Provides utility functions for filenames. + """Provides utility functions for filenames. FileInfo provides easy access to the components of a file's path relative to the project root. """ - def __init__(self, filename): - self._filename = filename + def __init__(self, filename): + self._filename = filename - def FullName(self): - """Make Windows paths like Unix.""" - return os.path.abspath(self._filename).replace('\\', '/') + def FullName(self): + """Make Windows paths like Unix.""" + return os.path.abspath(self._filename).replace('\\', '/') - def RepositoryName(self): - """FullName after removing the local path to the repository. + def RepositoryName(self): + """FullName after removing the local path to the repository. If we have a real absolute path name here we can try to do something smart: detecting the root of the checkout and truncating /path/to/checkout from @@ -1011,43 +1015,43 @@ class FileInfo(object): people on different computers who have checked the source out to different locations won't see bogus errors. """ - fullname = self.FullName() - - if os.path.exists(fullname): - project_dir = os.path.dirname(fullname) - - if os.path.exists(os.path.join(project_dir, ".svn")): - # If there's a .svn file in the current directory, we recursively look - # up the directory tree for the top of the SVN checkout - root_dir = project_dir - one_up_dir = os.path.dirname(root_dir) - while os.path.exists(os.path.join(one_up_dir, ".svn")): - root_dir = os.path.dirname(root_dir) - one_up_dir = os.path.dirname(one_up_dir) - - prefix = os.path.commonprefix([root_dir, project_dir]) - return fullname[len(prefix) + 1:] - - # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by - # searching up from the current path. - root_dir = os.path.dirname(fullname) - while (root_dir != os.path.dirname(root_dir) and - not os.path.exists(os.path.join(root_dir, ".git")) and - not os.path.exists(os.path.join(root_dir, ".hg")) and - not os.path.exists(os.path.join(root_dir, ".svn"))): - root_dir = os.path.dirname(root_dir) - - if (os.path.exists(os.path.join(root_dir, ".git")) or - os.path.exists(os.path.join(root_dir, ".hg")) or - os.path.exists(os.path.join(root_dir, ".svn"))): - prefix = os.path.commonprefix([root_dir, project_dir]) - return fullname[len(prefix) + 1:] - - # Don't know what to do; header guard warnings may be wrong... 
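The _FunctionState.Check hunks above encode the size thresholds: trigger = base * 2**verbose_level, and the error level is log2(lines / base), capped at 5. A worked check of that arithmetic with illustrative numbers:

    import math

    base_trigger = 250       # _NORMAL_TRIGGER at --v=0
    verbose_level = 0
    lines_in_function = 1100

    trigger = base_trigger * 2 ** verbose_level
    if lines_in_function > trigger:
        error_level = min(5, int(math.log(lines_in_function / base_trigger, 2)))
        print(trigger, error_level)  # 250 2
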
- return fullname - - def Split(self): - """Splits the file into the directory, basename, and extension. + fullname = self.FullName() + + if os.path.exists(fullname): + project_dir = os.path.dirname(fullname) + + if os.path.exists(os.path.join(project_dir, ".svn")): + # If there's a .svn file in the current directory, we recursively look + # up the directory tree for the top of the SVN checkout + root_dir = project_dir + one_up_dir = os.path.dirname(root_dir) + while os.path.exists(os.path.join(one_up_dir, ".svn")): + root_dir = os.path.dirname(root_dir) + one_up_dir = os.path.dirname(one_up_dir) + + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by + # searching up from the current path. + root_dir = os.path.dirname(fullname) + while (root_dir != os.path.dirname(root_dir) and + not os.path.exists(os.path.join(root_dir, ".git")) and + not os.path.exists(os.path.join(root_dir, ".hg")) and + not os.path.exists(os.path.join(root_dir, ".svn"))): + root_dir = os.path.dirname(root_dir) + + if (os.path.exists(os.path.join(root_dir, ".git")) or + os.path.exists(os.path.join(root_dir, ".hg")) or + os.path.exists(os.path.join(root_dir, ".svn"))): + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Don't know what to do; header guard warnings may be wrong... + return fullname + + def Split(self): + """Splits the file into the directory, basename, and extension. For 'chrome/browser/browser.cc', Split() would return ('chrome/browser', 'browser', '.cc') @@ -1056,57 +1060,57 @@ class FileInfo(object): A tuple of (directory, basename, extension). """ - googlename = self.RepositoryName() - project, rest = os.path.split(googlename) - return (project,) + os.path.splitext(rest) + googlename = self.RepositoryName() + project, rest = os.path.split(googlename) + return (project, ) + os.path.splitext(rest) - def BaseName(self): - """File base name - text after the final slash, before the final period.""" - return self.Split()[1] + def BaseName(self): + """File base name - text after the final slash, before the final period.""" + return self.Split()[1] - def Extension(self): - """File extension - text following the final period.""" - return self.Split()[2] + def Extension(self): + """File extension - text following the final period.""" + return self.Split()[2] - def NoExtension(self): - """File has no source file extension.""" - return '/'.join(self.Split()[0:2]) + def NoExtension(self): + """File has no source file extension.""" + return '/'.join(self.Split()[0:2]) - def IsSource(self): - """File has a source file extension.""" - return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + def IsSource(self): + """File has a source file extension.""" + return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') def _ShouldPrintError(category, confidence, linenum): - """If confidence >= verbose, category passes filter and is not suppressed.""" + """If confidence >= verbose, category passes filter and is not suppressed.""" - # There are three ways we might decide not to print an error message: - # a "NOLINT(category)" comment appears in the source, - # the verbosity level isn't high enough, or the filters filter it out. 
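FileInfo.RepositoryName, reformatted above, walks parent directories until it finds a .git, .hg, or .svn entry so that errors are reported relative to the checkout root regardless of where the tree lives. A standalone sketch of that upward walk (find_repo_root is an illustrative name, not a function in the patch):

    import os

    def find_repo_root(path):
        d = os.path.dirname(os.path.abspath(path))
        # dirname(d) == d only at the filesystem root, so this terminates.
        while d != os.path.dirname(d):
            if any(os.path.exists(os.path.join(d, vcs))
                   for vcs in ('.git', '.hg', '.svn')):
                return d
            d = os.path.dirname(d)
        return None  # no VCS marker; the caller falls back to the full path
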
- if IsErrorSuppressedByNolint(category, linenum): - return False + # There are three ways we might decide not to print an error message: + # a "NOLINT(category)" comment appears in the source, + # the verbosity level isn't high enough, or the filters filter it out. + if IsErrorSuppressedByNolint(category, linenum): + return False - if confidence < _cpplint_state.verbose_level: - return False + if confidence < _cpplint_state.verbose_level: + return False - is_filtered = False - for one_filter in _Filters(): - if one_filter.startswith('-'): - if category.startswith(one_filter[1:]): - is_filtered = True - elif one_filter.startswith('+'): - if category.startswith(one_filter[1:]): - is_filtered = False - else: - assert False # should have been checked for in SetFilter. - if is_filtered: - return False + is_filtered = False + for one_filter in _Filters(): + if one_filter.startswith('-'): + if category.startswith(one_filter[1:]): + is_filtered = True + elif one_filter.startswith('+'): + if category.startswith(one_filter[1:]): + is_filtered = False + else: + assert False # should have been checked for in SetFilter. + if is_filtered: + return False - return True + return True def Error(filename, linenum, category, confidence, message): - """Logs the fact we've found a lint error. + """Logs the fact we've found a lint error. We log where the error was found, and also our confidence in the error, that is, how certain we are this is a legitimate style regression, and @@ -1127,17 +1131,17 @@ def Error(filename, linenum, category, confidence, message): and 1 meaning that it could be a legitimate construct. message: The error message. """ - if _ShouldPrintError(category, confidence, linenum): - _cpplint_state.IncrementErrorCount(category) - if _cpplint_state.output_format == 'vs7': - sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) - elif _cpplint_state.output_format == 'eclipse': - sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) - else: - sys.stderr.write('%s:%s: %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) + if _ShouldPrintError(category, confidence, linenum): + _cpplint_state.IncrementErrorCount(category) + if _cpplint_state.output_format == 'vs7': + sys.stderr.write('%s(%s): %s [%s] [%d]\n' % + (filename, linenum, message, category, confidence)) + elif _cpplint_state.output_format == 'eclipse': + sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % + (filename, linenum, message, category, confidence)) + else: + sys.stderr.write('%s:%s: %s [%s] [%d]\n' % + (filename, linenum, message, category, confidence)) # Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. @@ -1154,14 +1158,13 @@ _RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/' # if this doesn't work we try on left side but only if there's a non-character # on the right. _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile( - r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' + - _RE_PATTERN_C_COMMENTS + r'\s+|' + - r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' + + r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' + _RE_PATTERN_C_COMMENTS + + r'\s+|' + r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' + _RE_PATTERN_C_COMMENTS + r')') def IsCppString(line): - """Does line terminate so, that the next symbol is in string constant. + """Does line terminate so, that the next symbol is in string constant. This function does not consider single-line nor multi-line comments. 
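_ShouldPrintError, reformatted above, applies the +/- filters in order, so later entries win: "-whitespace,+whitespace/braces" silences every whitespace check except whitespace/braces. A compact sketch of that loop (filters assumed pre-validated, as SetFilters enforces):

    def is_filtered(category, filters):
        filtered = False
        for f in filters:
            # Last matching prefix wins: '-' suppresses, '+' re-enables.
            if f.startswith('-') and category.startswith(f[1:]):
                filtered = True
            elif f.startswith('+') and category.startswith(f[1:]):
                filtered = False
        return filtered

    flt = ['-whitespace', '+whitespace/braces']
    print(is_filtered('whitespace/indent', flt))  # True  (suppressed)
    print(is_filtered('whitespace/braces', flt))  # False (re-enabled)
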
@@ -1173,12 +1176,12 @@ def IsCppString(line): string constant. """ - line = line.replace(r'\\', 'XX') # after this, \\" does not match to \" - return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1 + line = line.replace(r'\\', 'XX') # after this, \\" does not match to \" + return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1 def CleanseRawStrings(raw_lines): - """Removes C++11 raw strings from lines. + """Removes C++11 raw strings from lines. Before: static const char kData[] = R"( @@ -1197,98 +1200,100 @@ def CleanseRawStrings(raw_lines): list of lines with C++11 raw strings replaced by empty strings. """ - delimiter = None - lines_without_raw_strings = [] - for line in raw_lines: - if delimiter: - # Inside a raw string, look for the end - end = line.find(delimiter) - if end >= 0: - # Found the end of the string, match leading space for this - # line and resume copying the original lines, and also insert - # a "" on the last line. - leading_space = Match(r'^(\s*)\S', line) - line = leading_space.group(1) + '""' + line[end + len(delimiter):] - delimiter = None - else: - # Haven't found the end yet, append a blank line. - line = '""' - - # Look for beginning of a raw string, and replace them with - # empty strings. This is done in a loop to handle multiple raw - # strings on the same line. - while delimiter is None: - # Look for beginning of a raw string. - # See 2.14.15 [lex.string] for syntax. - matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) - if matched: - delimiter = ')' + matched.group(2) + '"' - - end = matched.group(3).find(delimiter) - if end >= 0: - # Raw string ended on same line - line = (matched.group(1) + '""' + - matched.group(3)[end + len(delimiter):]) - delimiter = None - else: - # Start of a multi-line raw string - line = matched.group(1) + '""' - else: - break - - lines_without_raw_strings.append(line) - - # TODO(unknown): if delimiter is not None here, we might want to - # emit a warning for unterminated string. - return lines_without_raw_strings + delimiter = None + lines_without_raw_strings = [] + for line in raw_lines: + if delimiter: + # Inside a raw string, look for the end + end = line.find(delimiter) + if end >= 0: + # Found the end of the string, match leading space for this + # line and resume copying the original lines, and also insert + # a "" on the last line. + leading_space = Match(r'^(\s*)\S', line) + line = leading_space.group(1) + '""' + line[end + len( + delimiter):] + delimiter = None + else: + # Haven't found the end yet, append a blank line. + line = '""' + + # Look for beginning of a raw string, and replace them with + # empty strings. This is done in a loop to handle multiple raw + # strings on the same line. + while delimiter is None: + # Look for beginning of a raw string. + # See 2.14.15 [lex.string] for syntax. + matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', + line) + if matched: + delimiter = ')' + matched.group(2) + '"' + + end = matched.group(3).find(delimiter) + if end >= 0: + # Raw string ended on same line + line = (matched.group(1) + '""' + + matched.group(3)[end + len(delimiter):]) + delimiter = None + else: + # Start of a multi-line raw string + line = matched.group(1) + '""' + else: + break + + lines_without_raw_strings.append(line) + + # TODO(unknown): if delimiter is not None here, we might want to + # emit a warning for unterminated string. 
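IsCppString, reformatted at the top of the hunk above, decides whether a line ends inside a string constant by counting unescaped double quotes: an odd count means the string is still open. The same parity test in isolation (Python 3 syntax; the behavior mirrors the diffed code):

    def is_cpp_string(line):
        line = line.replace(r'\\', 'XX')  # so \\" no longer looks like \"
        return ((line.count('"') - line.count(r'\"')
                 - line.count("'\"'")) & 1) == 1

    print(is_cpp_string('x = "abc'))    # True:  one quote, string still open
    print(is_cpp_string('x = "abc";'))  # False: quotes balanced
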
+ return lines_without_raw_strings def FindNextMultiLineCommentStart(lines, lineix): - """Find the beginning marker for a multiline comment.""" - while lineix < len(lines): - if lines[lineix].strip().startswith('/*'): - # Only return this marker if the comment goes beyond this line - if lines[lineix].strip().find('*/', 2) < 0: - return lineix - lineix += 1 - return len(lines) + """Find the beginning marker for a multiline comment.""" + while lineix < len(lines): + if lines[lineix].strip().startswith('/*'): + # Only return this marker if the comment goes beyond this line + if lines[lineix].strip().find('*/', 2) < 0: + return lineix + lineix += 1 + return len(lines) def FindNextMultiLineCommentEnd(lines, lineix): - """We are inside a comment, find the end marker.""" - while lineix < len(lines): - if lines[lineix].strip().endswith('*/'): - return lineix - lineix += 1 - return len(lines) + """We are inside a comment, find the end marker.""" + while lineix < len(lines): + if lines[lineix].strip().endswith('*/'): + return lineix + lineix += 1 + return len(lines) def RemoveMultiLineCommentsFromRange(lines, begin, end): - """Clears a range of lines for multi-line comments.""" - # Having // dummy comments makes the lines non-empty, so we will not get - # unnecessary blank line warnings later in the code. - for i in range(begin, end): - lines[i] = '/**/' + """Clears a range of lines for multi-line comments.""" + # Having // dummy comments makes the lines non-empty, so we will not get + # unnecessary blank line warnings later in the code. + for i in range(begin, end): + lines[i] = '/**/' def RemoveMultiLineComments(filename, lines, error): - """Removes multiline (c-style) comments from lines.""" - lineix = 0 - while lineix < len(lines): - lineix_begin = FindNextMultiLineCommentStart(lines, lineix) - if lineix_begin >= len(lines): - return - lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin) - if lineix_end >= len(lines): - error(filename, lineix_begin + 1, 'readability/multiline_comment', 5, - 'Could not find end of multi-line comment') - return - RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1) - lineix = lineix_end + 1 + """Removes multiline (c-style) comments from lines.""" + lineix = 0 + while lineix < len(lines): + lineix_begin = FindNextMultiLineCommentStart(lines, lineix) + if lineix_begin >= len(lines): + return + lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin) + if lineix_end >= len(lines): + error(filename, lineix_begin + 1, 'readability/multiline_comment', + 5, 'Could not find end of multi-line comment') + return + RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1) + lineix = lineix_end + 1 def CleanseComments(line): - """Removes //-comments and single-line C-style /* */ comments. + """Removes //-comments and single-line C-style /* */ comments. Args: line: A line of C++ source. @@ -1296,15 +1301,15 @@ def CleanseComments(line): Returns: The line with single-line comments removed. """ - commentpos = line.find('//') - if commentpos != -1 and not IsCppString(line[:commentpos]): - line = line[:commentpos].rstrip() - # get rid of /* ... */ - return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line) + commentpos = line.find('//') + if commentpos != -1 and not IsCppString(line[:commentpos]): + line = line[:commentpos].rstrip() + # get rid of /* ... */ + return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line) class CleansedLines(object): - """Holds 4 copies of all lines with different preprocessing applied to them. 
+ """Holds 4 copies of all lines with different preprocessing applied to them. 1) elided member contains lines without strings and comments. 2) lines member contains lines without comments. @@ -1314,25 +1319,26 @@ class CleansedLines(object): All these members are of , and of the same length. """ - def __init__(self, lines): - self.elided = [] - self.lines = [] - self.raw_lines = lines - self.num_lines = len(lines) - self.lines_without_raw_strings = CleanseRawStrings(lines) - for linenum in range(len(self.lines_without_raw_strings)): - self.lines.append(CleanseComments( - self.lines_without_raw_strings[linenum])) - elided = self._CollapseStrings(self.lines_without_raw_strings[linenum]) - self.elided.append(CleanseComments(elided)) - - def NumLines(self): - """Returns the number of lines represented.""" - return self.num_lines - - @staticmethod - def _CollapseStrings(elided): - """Collapses strings and chars on a line to simple "" or '' blocks. + def __init__(self, lines): + self.elided = [] + self.lines = [] + self.raw_lines = lines + self.num_lines = len(lines) + self.lines_without_raw_strings = CleanseRawStrings(lines) + for linenum in range(len(self.lines_without_raw_strings)): + self.lines.append( + CleanseComments(self.lines_without_raw_strings[linenum])) + elided = self._CollapseStrings(self.lines_without_raw_strings[ + linenum]) + self.elided.append(CleanseComments(elided)) + + def NumLines(self): + """Returns the number of lines represented.""" + return self.num_lines + + @staticmethod + def _CollapseStrings(elided): + """Collapses strings and chars on a line to simple "" or '' blocks. We nix strings first so we're not fooled by text like '"http://"' @@ -1342,64 +1348,65 @@ class CleansedLines(object): Returns: The line with collapsed strings. """ - if _RE_PATTERN_INCLUDE.match(elided): - return elided - - # Remove escaped characters first to make quote/single quote collapsing - # basic. Things that look like escaped characters shouldn't occur - # outside of strings and chars. - elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) - - # Replace quoted strings and digit separators. Both single quotes - # and double quotes are processed in the same loop, otherwise - # nested quotes wouldn't work. - collapsed = '' - while True: - # Find the first quote character - match = Match(r'^([^\'"]*)([\'"])(.*)$', elided) - if not match: - collapsed += elided - break - head, quote, tail = match.groups() - - if quote == '"': - # Collapse double quoted strings - second_quote = tail.find('"') - if second_quote >= 0: - collapsed += head + '""' - elided = tail[second_quote + 1:] - else: - # Unmatched double quote, don't bother processing the rest - # of the line since this is probably a multiline string. - collapsed += elided - break - else: - # Found single quote, check nearby text to eliminate digit separators. - # - # There is no special handling for floating point here, because - # the integer/fractional/exponent parts would all be parsed - # correctly as long as there are digits on both sides of the - # separator. So we are fine as long as we don't see something - # like "0.'3" (gcc 4.9.0 will not allow this literal). 
- if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head): - match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail) - collapsed += head + match_literal.group(1).replace("'", '') - elided = match_literal.group(2) - else: - second_quote = tail.find('\'') - if second_quote >= 0: - collapsed += head + "''" - elided = tail[second_quote + 1:] - else: - # Unmatched single quote - collapsed += elided - break - - return collapsed + if _RE_PATTERN_INCLUDE.match(elided): + return elided + + # Remove escaped characters first to make quote/single quote collapsing + # basic. Things that look like escaped characters shouldn't occur + # outside of strings and chars. + elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) + + # Replace quoted strings and digit separators. Both single quotes + # and double quotes are processed in the same loop, otherwise + # nested quotes wouldn't work. + collapsed = '' + while True: + # Find the first quote character + match = Match(r'^([^\'"]*)([\'"])(.*)$', elided) + if not match: + collapsed += elided + break + head, quote, tail = match.groups() + + if quote == '"': + # Collapse double quoted strings + second_quote = tail.find('"') + if second_quote >= 0: + collapsed += head + '""' + elided = tail[second_quote + 1:] + else: + # Unmatched double quote, don't bother processing the rest + # of the line since this is probably a multiline string. + collapsed += elided + break + else: + # Found single quote, check nearby text to eliminate digit separators. + # + # There is no special handling for floating point here, because + # the integer/fractional/exponent parts would all be parsed + # correctly as long as there are digits on both sides of the + # separator. So we are fine as long as we don't see something + # like "0.'3" (gcc 4.9.0 will not allow this literal). + if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head): + match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', + "'" + tail) + collapsed += head + match_literal.group(1).replace("'", '') + elided = match_literal.group(2) + else: + second_quote = tail.find('\'') + if second_quote >= 0: + collapsed += head + "''" + elided = tail[second_quote + 1:] + else: + # Unmatched single quote + collapsed += elided + break + + return collapsed def FindEndOfExpressionInLine(line, startpos, stack): - """Find the position just after the end of current parenthesized expression. + """Find the position just after the end of current parenthesized expression. Args: line: a CleansedLines line. @@ -1411,73 +1418,73 @@ def FindEndOfExpressionInLine(line, startpos, stack): On finding an unclosed expression: (-1, None) Otherwise: (-1, new stack at end of this line) """ - for i in xrange(startpos, len(line)): - char = line[i] - if char in '([{': - # Found start of parenthesized expression, push to expression stack - stack.append(char) - elif char == '<': - # Found potential start of template argument list - if i > 0 and line[i - 1] == '<': - # Left shift operator - if stack and stack[-1] == '<': - stack.pop() - if not stack: - return (-1, None) - elif i > 0 and Search(r'\boperator\s*$', line[0:i]): - # operator<, don't add to stack - continue - else: - # Tentative start of template argument list - stack.append('<') - elif char in ')]}': - # Found end of parenthesized expression. - # - # If we are currently expecting a matching '>', the pending '<' - # must have been an operator. Remove them from expression stack. 
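FindEndOfExpressionInLine, whose new body follows, tracks nesting with an explicit stack; the delicate case is '<', which may open a template argument list or be a comparison or shift operator, which is why '>' only pops when a tentative '<' is already on the stack. A reduced forward scan for plain brackets only (the template disambiguation is deliberately omitted):

    def find_end(line, startpos):
        pairs = {'(': ')', '[': ']', '{': '}'}
        stack = []
        for i in range(startpos, len(line)):
            ch = line[i]
            if ch in pairs:
                stack.append(pairs[ch])      # expect this closer later
            elif stack and ch == stack[-1]:
                stack.pop()
                if not stack:
                    return i + 1             # position just past the closer
        return -1                            # unbalanced on this line

    print(find_end('foo(bar(1, 2), baz[3])', 3))  # 22
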
- while stack and stack[-1] == '<': - stack.pop() - if not stack: - return (-1, None) - if ((stack[-1] == '(' and char == ')') or - (stack[-1] == '[' and char == ']') or - (stack[-1] == '{' and char == '}')): - stack.pop() - if not stack: - return (i + 1, None) - else: - # Mismatched parentheses - return (-1, None) - elif char == '>': - # Found potential end of template argument list. - - # Ignore "->" and operator functions - if (i > 0 and - (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))): - continue - - # Pop the stack if there is a matching '<'. Otherwise, ignore - # this '>' since it must be an operator. - if stack: - if stack[-1] == '<': - stack.pop() - if not stack: - return (i + 1, None) - elif char == ';': - # Found something that look like end of statements. If we are currently - # expecting a '>', the matching '<' must have been an operator, since - # template argument list should not contain statements. - while stack and stack[-1] == '<': - stack.pop() - if not stack: - return (-1, None) - - # Did not find end of expression or unbalanced parentheses on this line - return (-1, stack) + for i in xrange(startpos, len(line)): + char = line[i] + if char in '([{': + # Found start of parenthesized expression, push to expression stack + stack.append(char) + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + if stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + elif i > 0 and Search(r'\boperator\s*$', line[0:i]): + # operator<, don't add to stack + continue + else: + # Tentative start of template argument list + stack.append('<') + elif char in ')]}': + # Found end of parenthesized expression. + # + # If we are currently expecting a matching '>', the pending '<' + # must have been an operator. Remove them from expression stack. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + if ((stack[-1] == '(' and char == ')') or + (stack[-1] == '[' and char == ']') or + (stack[-1] == '{' and char == '}')): + stack.pop() + if not stack: + return (i + 1, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == '>': + # Found potential end of template argument list. + + # Ignore "->" and operator functions + if (i > 0 and (line[i - 1] == '-' or Search(r'\boperator\s*$', + line[0:i - 1]))): + continue + + # Pop the stack if there is a matching '<'. Otherwise, ignore + # this '>' since it must be an operator. + if stack: + if stack[-1] == '<': + stack.pop() + if not stack: + return (i + 1, None) + elif char == ';': + # Found something that look like end of statements. If we are currently + # expecting a '>', the matching '<' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + + # Did not find end of expression or unbalanced parentheses on this line + return (-1, stack) def CloseExpression(clean_lines, linenum, pos): - """If input points to ( or { or [ or <, finds the position that closes it. + """If input points to ( or { or [ or <, finds the position that closes it. If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the linenum/pos that correspond to the closing of the expression. @@ -1499,29 +1506,29 @@ def CloseExpression(clean_lines, linenum, pos): 'cleansed' line at linenum. 
""" - line = clean_lines.elided[linenum] - if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]): - return (line, clean_lines.NumLines(), -1) - - # Check first line - (end_pos, stack) = FindEndOfExpressionInLine(line, pos, []) - if end_pos > -1: - return (line, linenum, end_pos) - - # Continue scanning forward - while stack and linenum < clean_lines.NumLines() - 1: - linenum += 1 line = clean_lines.elided[linenum] - (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack) + if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]): + return (line, clean_lines.NumLines(), -1) + + # Check first line + (end_pos, stack) = FindEndOfExpressionInLine(line, pos, []) if end_pos > -1: - return (line, linenum, end_pos) + return (line, linenum, end_pos) + + # Continue scanning forward + while stack and linenum < clean_lines.NumLines() - 1: + linenum += 1 + line = clean_lines.elided[linenum] + (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack) + if end_pos > -1: + return (line, linenum, end_pos) - # Did not find end of expression before end of file, give up - return (line, clean_lines.NumLines(), -1) + # Did not find end of expression before end of file, give up + return (line, clean_lines.NumLines(), -1) def FindStartOfExpressionInLine(line, endpos, stack): - """Find position at the matching start of current expression. + """Find position at the matching start of current expression. This is almost the reverse of FindEndOfExpressionInLine, but note that the input position and returned position differs by 1. @@ -1536,69 +1543,68 @@ def FindStartOfExpressionInLine(line, endpos, stack): On finding an unclosed expression: (-1, None) Otherwise: (-1, new stack at beginning of this line) """ - i = endpos - while i >= 0: - char = line[i] - if char in ')]}': - # Found end of expression, push to expression stack - stack.append(char) - elif char == '>': - # Found potential end of template argument list. - # - # Ignore it if it's a "->" or ">=" or "operator>" - if (i > 0 and - (line[i - 1] == '-' or - Match(r'\s>=\s', line[i - 1:]) or - Search(r'\boperator\s*$', line[0:i]))): - i -= 1 - else: - stack.append('>') - elif char == '<': - # Found potential start of template argument list - if i > 0 and line[i - 1] == '<': - # Left shift operator + i = endpos + while i >= 0: + char = line[i] + if char in ')]}': + # Found end of expression, push to expression stack + stack.append(char) + elif char == '>': + # Found potential end of template argument list. + # + # Ignore it if it's a "->" or ">=" or "operator>" + if (i > 0 and + (line[i - 1] == '-' or Match(r'\s>=\s', line[i - 1:]) or + Search(r'\boperator\s*$', line[0:i]))): + i -= 1 + else: + stack.append('>') + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + i -= 1 + else: + # If there is a matching '>', we can pop the expression stack. + # Otherwise, ignore this '<' since it must be an operator. + if stack and stack[-1] == '>': + stack.pop() + if not stack: + return (i, None) + elif char in '([{': + # Found start of expression. + # + # If there are any unmatched '>' on the stack, they must be + # operators. Remove those. 
+ while stack and stack[-1] == '>': + stack.pop() + if not stack: + return (-1, None) + if ((char == '(' and stack[-1] == ')') or + (char == '[' and stack[-1] == ']') or + (char == '{' and stack[-1] == '}')): + stack.pop() + if not stack: + return (i, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == ';': + # Found something that look like end of statements. If we are currently + # expecting a '<', the matching '>' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '>': + stack.pop() + if not stack: + return (-1, None) + i -= 1 - else: - # If there is a matching '>', we can pop the expression stack. - # Otherwise, ignore this '<' since it must be an operator. - if stack and stack[-1] == '>': - stack.pop() - if not stack: - return (i, None) - elif char in '([{': - # Found start of expression. - # - # If there are any unmatched '>' on the stack, they must be - # operators. Remove those. - while stack and stack[-1] == '>': - stack.pop() - if not stack: - return (-1, None) - if ((char == '(' and stack[-1] == ')') or - (char == '[' and stack[-1] == ']') or - (char == '{' and stack[-1] == '}')): - stack.pop() - if not stack: - return (i, None) - else: - # Mismatched parentheses - return (-1, None) - elif char == ';': - # Found something that look like end of statements. If we are currently - # expecting a '<', the matching '>' must have been an operator, since - # template argument list should not contain statements. - while stack and stack[-1] == '>': - stack.pop() - if not stack: - return (-1, None) - - i -= 1 - - return (-1, stack) + + return (-1, stack) def ReverseCloseExpression(clean_lines, linenum, pos): - """If input points to ) or } or ] or >, finds the position that opens it. + """If input points to ) or } or ] or >, finds the position that opens it. If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the linenum/pos that correspond to the opening of the expression. @@ -1614,42 +1620,42 @@ def ReverseCloseExpression(clean_lines, linenum, pos): we ignore strings and comments when matching; and the line we return is the 'cleansed' line at linenum. 
""" - line = clean_lines.elided[linenum] - if line[pos] not in ')}]>': - return (line, 0, -1) - - # Check last line - (start_pos, stack) = FindStartOfExpressionInLine(line, pos, []) - if start_pos > -1: - return (line, linenum, start_pos) - - # Continue scanning backward - while stack and linenum > 0: - linenum -= 1 line = clean_lines.elided[linenum] - (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack) - if start_pos > -1: - return (line, linenum, start_pos) + if line[pos] not in ')}]>': + return (line, 0, -1) - # Did not find start of expression before beginning of file, give up - return (line, 0, -1) + # Check last line + (start_pos, stack) = FindStartOfExpressionInLine(line, pos, []) + if start_pos > -1: + return (line, linenum, start_pos) + + # Continue scanning backward + while stack and linenum > 0: + linenum -= 1 + line = clean_lines.elided[linenum] + (start_pos, stack) = FindStartOfExpressionInLine(line, + len(line) - 1, stack) + if start_pos > -1: + return (line, linenum, start_pos) + + # Did not find start of expression before beginning of file, give up + return (line, 0, -1) def CheckForCopyright(filename, lines, error): - """Logs an error if no Copyright message appears at the top of the file.""" + """Logs an error if no Copyright message appears at the top of the file.""" - # We'll say it should occur by line 10. Don't forget there's a - # dummy line at the front. - for line in xrange(1, min(len(lines), 11)): - if re.search(r'Copyright', lines[line], re.I): break - else: # means no copyright line was found - error(filename, 0, 'legal/copyright', 5, - 'No copyright message found. ' - 'You should have a line: "Copyright [year] "') + # We'll say it should occur by line 10. Don't forget there's a + # dummy line at the front. + for line in xrange(1, min(len(lines), 11)): + if re.search(r'Copyright', lines[line], re.I): break + else: # means no copyright line was found + error(filename, 0, 'legal/copyright', 5, 'No copyright message found. ' + 'You should have a line: "Copyright [year] "') def GetIndentLevel(line): - """Return the number of leading spaces in line. + """Return the number of leading spaces in line. Args: line: A string to check. @@ -1657,15 +1663,15 @@ def GetIndentLevel(line): Returns: An integer count of leading spaces, possibly zero. """ - indent = Match(r'^( *)\S', line) - if indent: - return len(indent.group(1)) - else: - return 0 + indent = Match(r'^( *)\S', line) + if indent: + return len(indent.group(1)) + else: + return 0 def GetHeaderGuardCPPVariable(filename): - """Returns the CPP variable that should be used as a header guard. + """Returns the CPP variable that should be used as a header guard. Args: filename: The name of a C++ header file. @@ -1675,12 +1681,12 @@ def GetHeaderGuardCPPVariable(filename): named file. """ - filename = os.path.basename(filename) - return re.sub(r'[^a-zA-Z0-9]', '_', filename).upper() + '_' + filename = os.path.basename(filename) + return re.sub(r'[^a-zA-Z0-9]', '_', filename).upper() + '_' def CheckForHeaderGuard(filename, clean_lines, error): - """Checks that the file contains a header guard. + """Checks that the file contains a header guard. Logs an error if no #ifndef header guard is present. For other headers, checks that the full pathname is used. @@ -1691,123 +1697,124 @@ def CheckForHeaderGuard(filename, clean_lines, error): error: The function to call with any errors found. """ - # Don't check for header guards if there are error suppression - # comments somewhere in this file. 
- # - # Because this is silencing a warning for a nonexistent line, we - # only support the very specific NOLINT(build/header_guard) syntax, - # and not the general NOLINT or NOLINT(*) syntax. - raw_lines = clean_lines.lines_without_raw_strings - for i in raw_lines: - if Search(r'//\s*NOLINT\(build/header_guard\)', i): - return - - cppvar = GetHeaderGuardCPPVariable(filename) - - ifndef = '' - ifndef_linenum = 0 - define = '' - endif = '' - endif_linenum = 0 - pragma_linenum = -1 - for linenum, line in enumerate(raw_lines): - linesplit = line.split() - if len(linesplit) >= 2: - if linesplit[0] == '#pragma' and linesplit[1] == 'once': - pragma_linenum = linenum - # find the first occurrence of #ifndef and #define, save arg - if not ifndef and linesplit[0] == '#ifndef': - # set ifndef to the header guard presented on the #ifndef line. - ifndef = linesplit[1] - ifndef_linenum = linenum - if not define and linesplit[0] == '#define': - define = linesplit[1] - # find the last occurrence of #endif, save entire line - if line.startswith('#endif'): - endif = line - endif_linenum = linenum - if pragma_linenum != -1: - return # short path for pragma once - if not ifndef or not define or ifndef != define: - error(filename, 0, 'build/header_guard', 5, - 'No #ifndef header guard found, suggested CPP variable is: %s' % - cppvar) - return - - # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ - # for backward compatibility. - if ifndef != cppvar: - error_level = 0 - if ifndef != cppvar + '_': - error_level = 5 - - ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum, + # Don't check for header guards if there are error suppression + # comments somewhere in this file. + # + # Because this is silencing a warning for a nonexistent line, we + # only support the very specific NOLINT(build/header_guard) syntax, + # and not the general NOLINT or NOLINT(*) syntax. + raw_lines = clean_lines.lines_without_raw_strings + for i in raw_lines: + if Search(r'//\s*NOLINT\(build/header_guard\)', i): + return + + cppvar = GetHeaderGuardCPPVariable(filename) + + ifndef = '' + ifndef_linenum = 0 + define = '' + endif = '' + endif_linenum = 0 + pragma_linenum = -1 + for linenum, line in enumerate(raw_lines): + linesplit = line.split() + if len(linesplit) >= 2: + if linesplit[0] == '#pragma' and linesplit[1] == 'once': + pragma_linenum = linenum + # find the first occurrence of #ifndef and #define, save arg + if not ifndef and linesplit[0] == '#ifndef': + # set ifndef to the header guard presented on the #ifndef line. + ifndef = linesplit[1] + ifndef_linenum = linenum + if not define and linesplit[0] == '#define': + define = linesplit[1] + # find the last occurrence of #endif, save entire line + if line.startswith('#endif'): + endif = line + endif_linenum = linenum + if pragma_linenum != -1: + return # short path for pragma once + if not ifndef or not define or ifndef != define: + error(filename, 0, 'build/header_guard', 5, + 'No #ifndef header guard found, suggested CPP variable is: %s' % + cppvar) + return + + # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ + # for backward compatibility. + if ifndef != cppvar: + error_level = 0 + if ifndef != cppvar + '_': + error_level = 5 + + ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], + ifndef_linenum, error) + error(filename, ifndef_linenum, 'build/header_guard', error_level, + '#ifndef header guard has wrong style, please use: %s' % cppvar) + + # Check for "//" comments on endif line. 
+ ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum, error) - error(filename, ifndef_linenum, 'build/header_guard', error_level, - '#ifndef header guard has wrong style, please use: %s' % cppvar) - - # Check for "//" comments on endif line. - ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum, - error) - match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif) - if match: - if match.group(1) == '_': - # Issue low severity warning for deprecated double trailing underscore - error(filename, endif_linenum, 'build/header_guard', 0, - '#endif line should be "#endif // %s"' % cppvar) - return - - # Didn't find the corresponding "//" comment. If this file does not - # contain any "//" comments at all, it could be that the compiler - # only wants "/**/" comments, look for those instead. - no_single_line_comments = True - for i in xrange(1, len(raw_lines) - 1): - line = raw_lines[i] - if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line): - no_single_line_comments = False - break - - if no_single_line_comments: - match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif) + match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif) if match: - if match.group(1) == '_': - # Low severity warning for double trailing underscore - error(filename, endif_linenum, 'build/header_guard', 0, - '#endif line should be "#endif /* %s */"' % cppvar) - return + if match.group(1) == '_': + # Issue low severity warning for deprecated double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif // %s"' % cppvar) + return - # Didn't find anything - error(filename, endif_linenum, 'build/header_guard', 5, - '#endif line should be "#endif // %s"' % cppvar) + # Didn't find the corresponding "//" comment. If this file does not + # contain any "//" comments at all, it could be that the compiler + # only wants "/**/" comments, look for those instead. 
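
The '#endif' comment check in this hunk accepts both the current style and the deprecated double-trailing-underscore form (the latter only at low severity); a standalone sketch of the same regex:

import re

cppvar = 'MATRIX_H_'
endif_pat = r'#endif\s*//\s*' + cppvar + r'(_)?\b'
assert re.match(endif_pat, '#endif  // MATRIX_H_')
# Deprecated double trailing underscore still matches; group(1) flags it:
m = re.match(endif_pat, '#endif  // MATRIX_H__')
assert m and m.group(1) == '_'
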
+ no_single_line_comments = True + for i in xrange(1, len(raw_lines) - 1): + line = raw_lines[i] + if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', + line): + no_single_line_comments = False + break + + if no_single_line_comments: + match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif) + if match: + if match.group(1) == '_': + # Low severity warning for double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif /* %s */"' % cppvar) + return + + # Didn't find anything + error(filename, endif_linenum, 'build/header_guard', 5, + '#endif line should be "#endif // %s"' % cppvar) def CheckHeaderFileIncluded(filename, include_state, error): - """Logs an error if a .cc file does not include its header.""" - - # Do not check test files - if filename.endswith('_test.cc') or filename.endswith('_unittest.cc'): - return - - fileinfo = FileInfo(filename) - headerfile = filename[0:len(filename) - 2] + 'h' - if not os.path.exists(headerfile): - return - headername = FileInfo(headerfile).RepositoryName() - first_include = 0 - for section_list in include_state.include_list: - for f in section_list: - if headername in f[0] or f[0] in headername: + """Logs an error if a .cc file does not include its header.""" + + # Do not check test files + if filename.endswith('_test.cc') or filename.endswith('_unittest.cc'): + return + + fileinfo = FileInfo(filename) + headerfile = filename[0:len(filename) - 2] + 'h' + if not os.path.exists(headerfile): return - if not first_include: - first_include = f[1] + headername = FileInfo(headerfile).RepositoryName() + first_include = 0 + for section_list in include_state.include_list: + for f in section_list: + if headername in f[0] or f[0] in headername: + return + if not first_include: + first_include = f[1] - error(filename, first_include, 'build/include', 5, - '%s should include its header file %s' % (fileinfo.RepositoryName(), - headername)) + error(filename, first_include, 'build/include', 5, + '%s should include its header file %s' % (fileinfo.RepositoryName(), + headername)) def CheckForBadCharacters(filename, lines, error): - """Logs an error for each line containing bad characters. + """Logs an error for each line containing bad characters. Two kinds of bad characters: @@ -1823,16 +1830,19 @@ def CheckForBadCharacters(filename, lines, error): lines: An array of strings, each representing a line of the file. error: The function to call with any errors found. """ - for linenum, line in enumerate(lines): - if u'\ufffd' in line: - error(filename, linenum, 'readability/utf8', 5, - 'Line contains invalid UTF-8 (or Unicode replacement character).') - if '\0' in line: - error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.') + for linenum, line in enumerate(lines): + if u'\ufffd' in line: + error( + filename, linenum, 'readability/utf8', 5, + 'Line contains invalid UTF-8 (or Unicode replacement character).' + ) + if '\0' in line: + error(filename, linenum, 'readability/nul', 5, + 'Line contains NUL byte.') def CheckForNewlineAtEOF(filename, lines, error): - """Logs an error if there is no newline char at the end of the file. + """Logs an error if there is no newline char at the end of the file. Args: filename: The name of the current file. @@ -1840,17 +1850,18 @@ def CheckForNewlineAtEOF(filename, lines, error): error: The function to call with any errors found. 
""" - # The array lines() was created by adding two newlines to the - # original file (go figure), then splitting on \n. - # To verify that the file ends in \n, we just have to make sure the - # last-but-two element of lines() exists and is empty. - if len(lines) < 3 or lines[-2]: - error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, - 'Could not find a newline character at the end of the file.') + # The array lines() was created by adding two newlines to the + # original file (go figure), then splitting on \n. + # To verify that the file ends in \n, we just have to make sure the + # last-but-two element of lines() exists and is empty. + if len(lines) < 3 or lines[-2]: + error(filename, + len(lines) - 2, 'whitespace/ending_newline', 5, + 'Could not find a newline character at the end of the file.') def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): - """Logs an error if we see /* ... */ or "..." that extend past one line. + """Logs an error if we see /* ... */ or "..." that extend past one line. /* ... */ comments are legit inside macros, for one line. Otherwise, we prefer // comments, so it's ok to warn about the @@ -1866,25 +1877,25 @@ def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] + line = clean_lines.elided[linenum] - # Remove all \\ (escaped backslashes) from the line. They are OK, and the - # second (escaped) slash may trigger later \" detection erroneously. - line = line.replace('\\\\', '') + # Remove all \\ (escaped backslashes) from the line. They are OK, and the + # second (escaped) slash may trigger later \" detection erroneously. + line = line.replace('\\\\', '') - if line.count('/*') > line.count('*/'): - error(filename, linenum, 'readability/multiline_comment', 5, - 'Complex multi-line /*...*/-style comment found. ' - 'Lint may give bogus warnings. ' - 'Consider replacing these with //-style comments, ' - 'with #if 0...#endif, ' - 'or with more clearly structured multi-line comments.') + if line.count('/*') > line.count('*/'): + error(filename, linenum, 'readability/multiline_comment', 5, + 'Complex multi-line /*...*/-style comment found. ' + 'Lint may give bogus warnings. ' + 'Consider replacing these with //-style comments, ' + 'with #if 0...#endif, ' + 'or with more clearly structured multi-line comments.') - if (line.count('"') - line.count('\\"')) % 2: - error(filename, linenum, 'readability/multiline_string', 5, - 'Multi-line string ("...") found. This lint script doesn\'t ' - 'do well with such strings, and may give bogus warnings. ' - 'Use C++11 raw strings or concatenation instead.') + if (line.count('"') - line.count('\\"')) % 2: + error(filename, linenum, 'readability/multiline_string', 5, + 'Multi-line string ("...") found. This lint script doesn\'t ' + 'do well with such strings, and may give bogus warnings. 
' + 'Use C++11 raw strings or concatenation instead.') # (non-threadsafe name, thread-safe alternative, validation pattern) @@ -1911,14 +1922,12 @@ _THREADING_LIST = ( ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'), ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'), ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'), - ('strtok(', 'strtok_r(', - _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'), - ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'), - ) + ('strtok(', 'strtok_r(', _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'), + ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'), ) def CheckPosixThreading(filename, clean_lines, linenum, error): - """Checks for calls to thread-unsafe functions. + """Checks for calls to thread-unsafe functions. Much code has been originally written without consideration of multi-threading. Also, engineers are relying on their old experience; @@ -1932,19 +1941,18 @@ def CheckPosixThreading(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST: - # Additional pattern matching check to confirm that this is the - # function we are looking for - if Search(pattern, line): - error(filename, linenum, 'runtime/threadsafe_fn', 2, - 'Consider using ' + multithread_safe_func + - '...) instead of ' + single_thread_func + - '...) for improved thread safety.') + line = clean_lines.elided[linenum] + for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST: + # Additional pattern matching check to confirm that this is the + # function we are looking for + if Search(pattern, line): + error(filename, linenum, 'runtime/threadsafe_fn', 2, + 'Consider using ' + multithread_safe_func + '...) instead of ' + + single_thread_func + '...) for improved thread safety.') def CheckVlogArguments(filename, clean_lines, linenum, error): - """Checks that VLOG() is only used for defining a logging level. + """Checks that VLOG() is only used for defining a logging level. For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and VLOG(FATAL) are not. @@ -1955,20 +1963,20 @@ def CheckVlogArguments(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): - error(filename, linenum, 'runtime/vlog', 5, - 'VLOG() should be used with numeric verbosity level. ' - 'Use LOG() if you want symbolic severity levels.') + line = clean_lines.elided[linenum] + if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): + error(filename, linenum, 'runtime/vlog', 5, + 'VLOG() should be used with numeric verbosity level. ' + 'Use LOG() if you want symbolic severity levels.') + # Matches invalid increment: *count++, which moves pointer instead of # incrementing a value. -_RE_PATTERN_INVALID_INCREMENT = re.compile( - r'^\s*\*\w+(\+\+|--);') +_RE_PATTERN_INVALID_INCREMENT = re.compile(r'^\s*\*\w+(\+\+|--);') def CheckInvalidIncrement(filename, clean_lines, linenum, error): - """Checks for invalid increment *count++. + """Checks for invalid increment *count++. 
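
(A quick standalone check of what the pattern compiled just above does and does not flag; the variable name is arbitrary:)

import re

_RE_PATTERN_INVALID_INCREMENT = re.compile(r'^\s*\*\w+(\+\+|--);')
assert _RE_PATTERN_INVALID_INCREMENT.match('*count++;')        # flagged
assert not _RE_PATTERN_INVALID_INCREMENT.match('(*count)++;')  # correct form
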
For example following function: void increment_counter(int* count) { @@ -1983,37 +1991,38 @@ def CheckInvalidIncrement(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - if _RE_PATTERN_INVALID_INCREMENT.match(line): - error(filename, linenum, 'runtime/invalid_increment', 5, - 'Changing pointer instead of value (or unused value of operator*).') + line = clean_lines.elided[linenum] + if _RE_PATTERN_INVALID_INCREMENT.match(line): + error( + filename, linenum, 'runtime/invalid_increment', 5, + 'Changing pointer instead of value (or unused value of operator*).') def IsMacroDefinition(clean_lines, linenum): - if Search(r'^#define', clean_lines[linenum]): - return True + if Search(r'^#define', clean_lines[linenum]): + return True - if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]): - return True + if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]): + return True - return False + return False def IsForwardClassDeclaration(clean_lines, linenum): - return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum]) + return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum]) class _BlockInfo(object): - """Stores information about a generic block of code.""" + """Stores information about a generic block of code.""" - def __init__(self, seen_open_brace): - self.seen_open_brace = seen_open_brace - self.open_parentheses = 0 - self.inline_asm = _NO_ASM - self.check_namespace_indentation = False + def __init__(self, seen_open_brace): + self.seen_open_brace = seen_open_brace + self.open_parentheses = 0 + self.inline_asm = _NO_ASM + self.check_namespace_indentation = False - def CheckBegin(self, filename, clean_lines, linenum, error): - """Run checks that applies to text up to the opening brace. + def CheckBegin(self, filename, clean_lines, linenum, error): + """Run checks that applies to text up to the opening brace. This is mostly for checking the text after the class identifier and the "{", usually where the base class is specified. For other @@ -2025,10 +2034,10 @@ class _BlockInfo(object): linenum: The number of the line to check. error: The function to call with any errors found. """ - pass + pass - def CheckEnd(self, filename, clean_lines, linenum, error): - """Run checks that applies to text after the closing brace. + def CheckEnd(self, filename, clean_lines, linenum, error): + """Run checks that applies to text after the closing brace. This is mostly used for checking end of namespace comments. @@ -2038,10 +2047,10 @@ class _BlockInfo(object): linenum: The number of the line to check. error: The function to call with any errors found. """ - pass + pass - def IsBlockInfo(self): - """Returns true if this block is a _BlockInfo. + def IsBlockInfo(self): + """Returns true if this block is a _BlockInfo. This is convenient for verifying that an object is an instance of a _BlockInfo, but not an instance of any of the derived classes. @@ -2049,231 +2058,235 @@ class _BlockInfo(object): Returns: True for this class, False for derived classes. 
""" - return self.__class__ == _BlockInfo + return self.__class__ == _BlockInfo class _ExternCInfo(_BlockInfo): - """Stores information about an 'extern "C"' block.""" + """Stores information about an 'extern "C"' block.""" - def __init__(self): - _BlockInfo.__init__(self, True) + def __init__(self): + _BlockInfo.__init__(self, True) class _ClassInfo(_BlockInfo): - """Stores information about a class.""" - - def __init__(self, name, class_or_struct, clean_lines, linenum): - _BlockInfo.__init__(self, False) - self.name = name - self.starting_linenum = linenum - self.is_derived = False - self.check_namespace_indentation = True - if class_or_struct == 'struct': - self.access = 'public' - self.is_struct = True - else: - self.access = 'private' - self.is_struct = False + """Stores information about a class.""" + + def __init__(self, name, class_or_struct, clean_lines, linenum): + _BlockInfo.__init__(self, False) + self.name = name + self.starting_linenum = linenum + self.is_derived = False + self.check_namespace_indentation = True + if class_or_struct == 'struct': + self.access = 'public' + self.is_struct = True + else: + self.access = 'private' + self.is_struct = False - # Remember initial indentation level for this class. Using raw_lines here - # instead of elided to account for leading comments. - self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum]) + # Remember initial indentation level for this class. Using raw_lines here + # instead of elided to account for leading comments. + self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum]) - # Try to find the end of the class. This will be confused by things like: - # class A { - # } *x = { ... - # - # But it's still good enough for CheckSectionSpacing. - self.last_line = 0 - depth = 0 - for i in range(linenum, clean_lines.NumLines()): - line = clean_lines.elided[i] - depth += line.count('{') - line.count('}') - if not depth: - self.last_line = i - break - - def CheckBegin(self, filename, clean_lines, linenum, error): - # Look for a bare ':' - if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): - self.is_derived = True - - def CheckEnd(self, filename, clean_lines, linenum, error): - # If there is a DISALLOW macro, it should appear near the end of - # the class. - seen_last_thing_in_class = False - for i in xrange(linenum - 1, self.starting_linenum, -1): - match = Search( - r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' + - self.name + r'\)', - clean_lines.elided[i]) - if match: - if seen_last_thing_in_class: - error(filename, i, 'readability/constructors', 3, - match.group(1) + ' should be the last thing in the class') - break - - if not Match(r'^\s*$', clean_lines.elided[i]): - seen_last_thing_in_class = True - - # Check that closing brace is aligned with beginning of the class. - # Only do this if the closing brace is indented by only whitespaces. - # This means we will not check single-line class definitions. - indent = Match(r'^( *)\}', clean_lines.elided[linenum]) - if indent and len(indent.group(1)) != self.class_indent: - if self.is_struct: - parent = 'struct ' + self.name - else: - parent = 'class ' + self.name - error(filename, linenum, 'whitespace/indent', 3, - 'Closing brace should be aligned with beginning of %s' % parent) + # Try to find the end of the class. This will be confused by things like: + # class A { + # } *x = { ... + # + # But it's still good enough for CheckSectionSpacing. 
+ self.last_line = 0 + depth = 0 + for i in range(linenum, clean_lines.NumLines()): + line = clean_lines.elided[i] + depth += line.count('{') - line.count('}') + if not depth: + self.last_line = i + break + + def CheckBegin(self, filename, clean_lines, linenum, error): + # Look for a bare ':' + if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): + self.is_derived = True + + def CheckEnd(self, filename, clean_lines, linenum, error): + # If there is a DISALLOW macro, it should appear near the end of + # the class. + seen_last_thing_in_class = False + for i in xrange(linenum - 1, self.starting_linenum, -1): + match = Search( + r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' + + self.name + r'\)', clean_lines.elided[i]) + if match: + if seen_last_thing_in_class: + error(filename, i, 'readability/constructors', 3, + match.group(1) + + ' should be the last thing in the class') + break + + if not Match(r'^\s*$', clean_lines.elided[i]): + seen_last_thing_in_class = True + + # Check that closing brace is aligned with beginning of the class. + # Only do this if the closing brace is indented by only whitespaces. + # This means we will not check single-line class definitions. + indent = Match(r'^( *)\}', clean_lines.elided[linenum]) + if indent and len(indent.group(1)) != self.class_indent: + if self.is_struct: + parent = 'struct ' + self.name + else: + parent = 'class ' + self.name + error(filename, linenum, 'whitespace/indent', 3, + 'Closing brace should be aligned with beginning of %s' % + parent) class _NamespaceInfo(_BlockInfo): - """Stores information about a namespace.""" - - def __init__(self, name, linenum): - _BlockInfo.__init__(self, False) - self.name = name or '' - self.starting_linenum = linenum - self.check_namespace_indentation = True - - def CheckEnd(self, filename, clean_lines, linenum, error): - """Check end of namespace comments.""" - line = clean_lines.raw_lines[linenum] - - # Check how many lines is enclosed in this namespace. Don't issue - # warning for missing namespace comments if there aren't enough - # lines. However, do apply checks if there is already an end of - # namespace comment and it's incorrect. - # - # TODO(unknown): We always want to check end of namespace comments - # if a namespace is large, but sometimes we also want to apply the - # check if a short namespace contained nontrivial things (something - # other than forward declarations). There is currently no logic on - # deciding what these nontrivial things are, so this check is - # triggered by namespace size only, which works most of the time. - if (linenum - self.starting_linenum < 10 - and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): - return - - # Look for matching comment at end of namespace. - # - # Note that we accept C style "/* */" comments for terminating - # namespaces, so that code that terminate namespaces inside - # preprocessor macros can be cpplint clean. - # - # We also accept stuff like "// end of namespace ." with the - # period at the end. - # - # Besides these, we don't accept anything else, otherwise we might - # get false negatives when existing comment is a substring of the - # expected namespace. 
- if self.name: - # Named namespace - if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + - r'[\*/\.\\\s]*$'), - line): - error(filename, linenum, 'readability/namespace', 5, - 'Namespace should be terminated with "// namespace %s"' % - self.name) - else: - # Anonymous namespace - if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): - # If "// namespace anonymous" or "// anonymous namespace (more text)", - # mention "// anonymous namespace" as an acceptable form - if Match(r'}.*\b(namespace anonymous|anonymous namespace)\b', line): - error(filename, linenum, 'readability/namespace', 5, - 'Anonymous namespace should be terminated with "// namespace"' - ' or "// anonymous namespace"') + """Stores information about a namespace.""" + + def __init__(self, name, linenum): + _BlockInfo.__init__(self, False) + self.name = name or '' + self.starting_linenum = linenum + self.check_namespace_indentation = True + + def CheckEnd(self, filename, clean_lines, linenum, error): + """Check end of namespace comments.""" + line = clean_lines.raw_lines[linenum] + + # Check how many lines is enclosed in this namespace. Don't issue + # warning for missing namespace comments if there aren't enough + # lines. However, do apply checks if there is already an end of + # namespace comment and it's incorrect. + # + # TODO(unknown): We always want to check end of namespace comments + # if a namespace is large, but sometimes we also want to apply the + # check if a short namespace contained nontrivial things (something + # other than forward declarations). There is currently no logic on + # deciding what these nontrivial things are, so this check is + # triggered by namespace size only, which works most of the time. + if (linenum - self.starting_linenum < 10 and + not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): + return + + # Look for matching comment at end of namespace. + # + # Note that we accept C style "/* */" comments for terminating + # namespaces, so that code that terminate namespaces inside + # preprocessor macros can be cpplint clean. + # + # We also accept stuff like "// end of namespace ." with the + # period at the end. + # + # Besides these, we don't accept anything else, otherwise we might + # get false negatives when existing comment is a substring of the + # expected namespace. 
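
The accepted terminator forms enumerated in that comment can be checked directly against the named-namespace regex used just below; a minimal sketch (the namespace name is arbitrary):

import re

name = 'paddle'
end_pat = (r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(name) +
           r'[\*/\.\\\s]*$')
assert re.match(end_pat, '}  // namespace paddle')
assert re.match(end_pat, '}  /* end of namespace paddle. */')  # C-style OK
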
+ if self.name: + # Named namespace + if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + + re.escape(self.name) + r'[\*/\.\\\s]*$'), line): + error(filename, linenum, 'readability/namespace', 5, + 'Namespace should be terminated with "// namespace %s"' % + self.name) else: - error(filename, linenum, 'readability/namespace', 5, - 'Anonymous namespace should be terminated with "// namespace"') + # Anonymous namespace + if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): + # If "// namespace anonymous" or "// anonymous namespace (more text)", + # mention "// anonymous namespace" as an acceptable form + if Match(r'}.*\b(namespace anonymous|anonymous namespace)\b', + line): + error( + filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"' + ' or "// anonymous namespace"') + else: + error( + filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"' + ) class _PreprocessorInfo(object): - """Stores checkpoints of nesting stacks when #if/#else is seen.""" + """Stores checkpoints of nesting stacks when #if/#else is seen.""" - def __init__(self, stack_before_if): - # The entire nesting stack before #if - self.stack_before_if = stack_before_if + def __init__(self, stack_before_if): + # The entire nesting stack before #if + self.stack_before_if = stack_before_if - # The entire nesting stack up to #else - self.stack_before_else = [] + # The entire nesting stack up to #else + self.stack_before_else = [] - # Whether we have already seen #else or #elif - self.seen_else = False + # Whether we have already seen #else or #elif + self.seen_else = False class NestingState(object): - """Holds states related to parsing braces.""" - - def __init__(self): - # Stack for tracking all braces. An object is pushed whenever we - # see a "{", and popped when we see a "}". Only 3 types of - # objects are possible: - # - _ClassInfo: a class or struct. - # - _NamespaceInfo: a namespace. - # - _BlockInfo: some other type of block. - self.stack = [] - - # Top of the previous stack before each Update(). - # - # Because the nesting_stack is updated at the end of each line, we - # had to do some convoluted checks to find out what is the current - # scope at the beginning of the line. This check is simplified by - # saving the previous top of nesting stack. - # - # We could save the full stack, but we only need the top. Copying - # the full nesting stack would slow down cpplint by ~10%. - self.previous_stack_top = [] + """Holds states related to parsing braces.""" + + def __init__(self): + # Stack for tracking all braces. An object is pushed whenever we + # see a "{", and popped when we see a "}". Only 3 types of + # objects are possible: + # - _ClassInfo: a class or struct. + # - _NamespaceInfo: a namespace. + # - _BlockInfo: some other type of block. + self.stack = [] + + # Top of the previous stack before each Update(). + # + # Because the nesting_stack is updated at the end of each line, we + # had to do some convoluted checks to find out what is the current + # scope at the beginning of the line. This check is simplified by + # saving the previous top of nesting stack. + # + # We could save the full stack, but we only need the top. Copying + # the full nesting stack would slow down cpplint by ~10%. + self.previous_stack_top = [] - # Stack of _PreprocessorInfo objects. - self.pp_stack = [] + # Stack of _PreprocessorInfo objects. 
+ self.pp_stack = [] - def SeenOpenBrace(self): - """Check if we have seen the opening brace for the innermost block. + def SeenOpenBrace(self): + """Check if we have seen the opening brace for the innermost block. Returns: True if we have seen the opening brace, False if the innermost block is still expecting an opening brace. """ - return (not self.stack) or self.stack[-1].seen_open_brace + return (not self.stack) or self.stack[-1].seen_open_brace - def InNamespaceBody(self): - """Check if we are currently one level inside a namespace body. + def InNamespaceBody(self): + """Check if we are currently one level inside a namespace body. Returns: True if top of the stack is a namespace block, False otherwise. """ - return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + return self.stack and isinstance(self.stack[-1], _NamespaceInfo) - def InExternC(self): - """Check if we are currently one level inside an 'extern "C"' block. + def InExternC(self): + """Check if we are currently one level inside an 'extern "C"' block. Returns: True if top of the stack is an extern block, False otherwise. """ - return self.stack and isinstance(self.stack[-1], _ExternCInfo) + return self.stack and isinstance(self.stack[-1], _ExternCInfo) - def InClassDeclaration(self): - """Check if we are currently one level inside a class or struct declaration. + def InClassDeclaration(self): + """Check if we are currently one level inside a class or struct declaration. Returns: True if top of the stack is a class/struct, False otherwise. """ - return self.stack and isinstance(self.stack[-1], _ClassInfo) + return self.stack and isinstance(self.stack[-1], _ClassInfo) - def InAsmBlock(self): - """Check if we are currently one level inside an inline ASM block. + def InAsmBlock(self): + """Check if we are currently one level inside an inline ASM block. Returns: True if the top of the stack is a block containing inline ASM. """ - return self.stack and self.stack[-1].inline_asm != _NO_ASM + return self.stack and self.stack[-1].inline_asm != _NO_ASM - def InTemplateArgumentList(self, clean_lines, linenum, pos): - """Check if current position is inside template argument list. + def InTemplateArgumentList(self, clean_lines, linenum, pos): + """Check if current position is inside template argument list. Args: clean_lines: A CleansedLines instance containing the file. @@ -2282,50 +2295,51 @@ class NestingState(object): Returns: True if (linenum, pos) is inside template arguments. """ - while linenum < clean_lines.NumLines(): - # Find the earliest character that might indicate a template argument - line = clean_lines.elided[linenum] - match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:]) - if not match: - linenum += 1 - pos = 0 - continue - token = match.group(1) - pos += len(match.group(0)) - - # These things do not look like template argument list: - # class Suspect { - # class Suspect x; } - if token in ('{', '}', ';'): return False - - # These things look like template argument list: - # template - # template - # template - # template - if token in ('>', '=', '[', ']', '.'): return True - - # Check if token is an unmatched '<'. - # If not, move on to the next character. - if token != '<': - pos += 1 - if pos >= len(line): - linenum += 1 - pos = 0 - continue - - # We can't be sure if we just find a single '<', and need to - # find the matching '>'. 
- (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1) - if end_pos < 0: - # Not sure if template argument list or syntax error in file + while linenum < clean_lines.NumLines(): + # Find the earliest character that might indicate a template argument + line = clean_lines.elided[linenum] + match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:]) + if not match: + linenum += 1 + pos = 0 + continue + token = match.group(1) + pos += len(match.group(0)) + + # These things do not look like template argument list: + # class Suspect { + # class Suspect x; } + if token in ('{', '}', ';'): return False + + # These things look like template argument list: + # template + # template + # template + # template + if token in ('>', '=', '[', ']', '.'): return True + + # Check if token is an unmatched '<'. + # If not, move on to the next character. + if token != '<': + pos += 1 + if pos >= len(line): + linenum += 1 + pos = 0 + continue + + # We can't be sure if we just find a single '<', and need to + # find the matching '>'. + (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, + pos - 1) + if end_pos < 0: + # Not sure if template argument list or syntax error in file + return False + linenum = end_line + pos = end_pos return False - linenum = end_line - pos = end_pos - return False - def UpdatePreprocessor(self, line): - """Update preprocessor stack. + def UpdatePreprocessor(self, line): + """Update preprocessor stack. We need to handle preprocessors due to classes like this: #ifdef SWIG @@ -2345,44 +2359,45 @@ class NestingState(object): Args: line: current line to check. """ - if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): - # Beginning of #if block, save the nesting stack here. The saved - # stack will allow us to restore the parsing state in the #else case. - self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) - elif Match(r'^\s*#\s*(else|elif)\b', line): - # Beginning of #else block - if self.pp_stack: - if not self.pp_stack[-1].seen_else: - # This is the first #else or #elif block. Remember the - # whole nesting stack up to this point. This is what we - # keep after the #endif. - self.pp_stack[-1].seen_else = True - self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) - - # Restore the stack to how it was before the #if - self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) - else: - # TODO(unknown): unexpected #else, issue warning? - pass - elif Match(r'^\s*#\s*endif\b', line): - # End of #if or #else blocks. - if self.pp_stack: - # If we saw an #else, we will need to restore the nesting - # stack to its former state before the #else, otherwise we - # will just continue from where we left off. - if self.pp_stack[-1].seen_else: - # Here we can just use a shallow copy since we are the last - # reference to it. - self.stack = self.pp_stack[-1].stack_before_else - # Drop the corresponding #if - self.pp_stack.pop() - else: - # TODO(unknown): unexpected #endif, issue warning? - pass - - # TODO(unknown): Update() is too long, but we will refactor later. - def Update(self, filename, clean_lines, linenum, error): - """Update nesting state with current line. + if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): + # Beginning of #if block, save the nesting stack here. The saved + # stack will allow us to restore the parsing state in the #else case. 
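
The checkpoint-and-restore dance being described here is easier to see in miniature; a simplified simulation under the same idea (the real code wraps each checkpoint in a _PreprocessorInfo and tracks seen_else; the stack entries here are just placeholder strings):

import copy

pp_stack, stack = [], ['namespace paddle']
pp_stack.append(copy.deepcopy(stack))   # '#if': checkpoint the stack
stack.append('class CpuMatrix')         # block opened inside the #if branch
stack = copy.deepcopy(pp_stack[-1])     # '#else': restore pre-#if state
stack.append('class GpuMatrix')         # block opened inside the #else branch
pp_stack.pop()                          # '#endif': keep the #else-side stack
assert stack == ['namespace paddle', 'class GpuMatrix']
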
+ self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) + elif Match(r'^\s*#\s*(else|elif)\b', line): + # Beginning of #else block + if self.pp_stack: + if not self.pp_stack[-1].seen_else: + # This is the first #else or #elif block. Remember the + # whole nesting stack up to this point. This is what we + # keep after the #endif. + self.pp_stack[-1].seen_else = True + self.pp_stack[-1].stack_before_else = copy.deepcopy( + self.stack) + + # Restore the stack to how it was before the #if + self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) + else: + # TODO(unknown): unexpected #else, issue warning? + pass + elif Match(r'^\s*#\s*endif\b', line): + # End of #if or #else blocks. + if self.pp_stack: + # If we saw an #else, we will need to restore the nesting + # stack to its former state before the #else, otherwise we + # will just continue from where we left off. + if self.pp_stack[-1].seen_else: + # Here we can just use a shallow copy since we are the last + # reference to it. + self.stack = self.pp_stack[-1].stack_before_else + # Drop the corresponding #if + self.pp_stack.pop() + else: + # TODO(unknown): unexpected #endif, issue warning? + pass + + # TODO(unknown): Update() is too long, but we will refactor later. + def Update(self, filename, clean_lines, linenum, error): + """Update nesting state with current line. Args: filename: The name of the current file. @@ -2390,198 +2405,201 @@ class NestingState(object): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] + line = clean_lines.elided[linenum] - # Remember top of the previous nesting stack. - # - # The stack is always pushed/popped and not modified in place, so - # we can just do a shallow copy instead of copy.deepcopy. Using - # deepcopy would slow down cpplint by ~28%. - if self.stack: - self.previous_stack_top = self.stack[-1] - else: - self.previous_stack_top = None - - # Update pp_stack - self.UpdatePreprocessor(line) - - # Count parentheses. This is to avoid adding struct arguments to - # the nesting stack. - if self.stack: - inner_block = self.stack[-1] - depth_change = line.count('(') - line.count(')') - inner_block.open_parentheses += depth_change - - # Also check if we are starting or ending an inline assembly block. - if inner_block.inline_asm in (_NO_ASM, _END_ASM): - if (depth_change != 0 and - inner_block.open_parentheses == 1 and - _MATCH_ASM.match(line)): - # Enter assembly block - inner_block.inline_asm = _INSIDE_ASM - else: - # Not entering assembly block. If previous line was _END_ASM, - # we will now shift to _NO_ASM state. - inner_block.inline_asm = _NO_ASM - elif (inner_block.inline_asm == _INSIDE_ASM and - inner_block.open_parentheses == 0): - # Exit assembly block - inner_block.inline_asm = _END_ASM - - # Consume namespace declaration at the beginning of the line. Do - # this in a loop so that we catch same line declarations like this: - # namespace proto2 { namespace bridge { class MessageSet; } } - while True: - # Match start of namespace. The "\b\s*" below catches namespace - # declarations even if it weren't followed by a whitespace, this - # is so that we don't confuse our namespace checker. The - # missing spaces will be flagged by CheckSpacing. 
- namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line) - if not namespace_decl_match: - break - - new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum) - self.stack.append(new_namespace) - - line = namespace_decl_match.group(2) - if line.find('{') != -1: - new_namespace.seen_open_brace = True - line = line[line.find('{') + 1:] - - # Look for a class declaration in whatever is left of the line - # after parsing namespaces. The regexp accounts for decorated classes - # such as in: - # class LOCKABLE API Object { - # }; - class_decl_match = Match( - r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?' - r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))' - r'(.*)$', line) - if (class_decl_match and - (not self.stack or self.stack[-1].open_parentheses == 0)): - # We do not want to accept classes that are actually template arguments: - # template , - # template class Ignore3> - # void Function() {}; - # - # To avoid template argument cases, we scan forward and look for - # an unmatched '>'. If we see one, assume we are inside a - # template argument list. - end_declaration = len(class_decl_match.group(1)) - if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration): - self.stack.append(_ClassInfo( - class_decl_match.group(3), class_decl_match.group(2), - clean_lines, linenum)) - line = class_decl_match.group(4) - - # If we have not yet seen the opening brace for the innermost block, - # run checks here. - if not self.SeenOpenBrace(): - self.stack[-1].CheckBegin(filename, clean_lines, linenum, error) - - # Update access control if we are inside a class/struct - if self.stack and isinstance(self.stack[-1], _ClassInfo): - classinfo = self.stack[-1] - access_match = Match( - r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?' - r':(?:[^:]|$)', - line) - if access_match: - classinfo.access = access_match.group(2) - - # Check that access keywords are indented +1 space. Skip this - # check if the keywords are not preceded by whitespaces. - indent = access_match.group(1) - if (len(indent) != classinfo.class_indent + 1 and - Match(r'^\s*$', indent)): - if classinfo.is_struct: - parent = 'struct ' + classinfo.name - else: - parent = 'class ' + classinfo.name - slots = '' - if access_match.group(3): - slots = access_match.group(3) - error(filename, linenum, 'whitespace/indent', 3, - '%s%s: should be indented +1 space inside %s' % ( - access_match.group(2), slots, parent)) - - # Consume braces or semicolons from what's left of the line - while True: - # Match first brace, semicolon, or closed parenthesis. - matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line) - if not matched: - break - - token = matched.group(1) - if token == '{': - # If namespace or class hasn't seen a opening brace yet, mark - # namespace/class head as complete. Push a new block onto the - # stack otherwise. - if not self.SeenOpenBrace(): - self.stack[-1].seen_open_brace = True - elif Match(r'^extern\s*"[^"]*"\s*\{', line): - self.stack.append(_ExternCInfo()) - else: - self.stack.append(_BlockInfo(True)) - if _MATCH_ASM.match(line): - self.stack[-1].inline_asm = _BLOCK_ASM - - elif token == ';' or token == ')': - # If we haven't seen an opening brace yet, but we already saw - # a semicolon, this is probably a forward declaration. Pop - # the stack for these. + # Remember top of the previous nesting stack. 
# - # Similarly, if we haven't seen an opening brace yet, but we - # already saw a closing parenthesis, then these are probably - # function arguments with extra "class" or "struct" keywords. - # Also pop these stack for these. - if not self.SeenOpenBrace(): - self.stack.pop() - else: # token == '}' - # Perform end of block checks and pop the stack. + # The stack is always pushed/popped and not modified in place, so + # we can just do a shallow copy instead of copy.deepcopy. Using + # deepcopy would slow down cpplint by ~28%. if self.stack: - self.stack[-1].CheckEnd(filename, clean_lines, linenum, error) - self.stack.pop() - line = matched.group(2) + self.previous_stack_top = self.stack[-1] + else: + self.previous_stack_top = None + + # Update pp_stack + self.UpdatePreprocessor(line) - def InnermostClass(self): - """Get class info on the top of the stack. + # Count parentheses. This is to avoid adding struct arguments to + # the nesting stack. + if self.stack: + inner_block = self.stack[-1] + depth_change = line.count('(') - line.count(')') + inner_block.open_parentheses += depth_change + + # Also check if we are starting or ending an inline assembly block. + if inner_block.inline_asm in (_NO_ASM, _END_ASM): + if (depth_change != 0 and inner_block.open_parentheses == 1 and + _MATCH_ASM.match(line)): + # Enter assembly block + inner_block.inline_asm = _INSIDE_ASM + else: + # Not entering assembly block. If previous line was _END_ASM, + # we will now shift to _NO_ASM state. + inner_block.inline_asm = _NO_ASM + elif (inner_block.inline_asm == _INSIDE_ASM and + inner_block.open_parentheses == 0): + # Exit assembly block + inner_block.inline_asm = _END_ASM + + # Consume namespace declaration at the beginning of the line. Do + # this in a loop so that we catch same line declarations like this: + # namespace proto2 { namespace bridge { class MessageSet; } } + while True: + # Match start of namespace. The "\b\s*" below catches namespace + # declarations even if it weren't followed by a whitespace, this + # is so that we don't confuse our namespace checker. The + # missing spaces will be flagged by CheckSpacing. + namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', + line) + if not namespace_decl_match: + break + + new_namespace = _NamespaceInfo( + namespace_decl_match.group(1), linenum) + self.stack.append(new_namespace) + + line = namespace_decl_match.group(2) + if line.find('{') != -1: + new_namespace.seen_open_brace = True + line = line[line.find('{') + 1:] + + # Look for a class declaration in whatever is left of the line + # after parsing namespaces. The regexp accounts for decorated classes + # such as in: + # class LOCKABLE API Object { + # }; + class_decl_match = Match( + r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?' + r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))' + r'(.*)$', line) + if (class_decl_match and + (not self.stack or self.stack[-1].open_parentheses == 0)): + # We do not want to accept classes that are actually template arguments: + # template , + # template class Ignore3> + # void Function() {}; + # + # To avoid template argument cases, we scan forward and look for + # an unmatched '>'. If we see one, assume we are inside a + # template argument list. 
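
The decorated-class example given in the comment above can be run through that same class-declaration regexp standalone:

import re

class_pat = (r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?'
             r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))'
             r'(.*)$')
m = re.match(class_pat, 'class LOCKABLE API Object {')
# The all-caps decorators are skipped; the class name lands in group 3.
assert m.group(2) == 'class' and m.group(3) == 'Object'
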
+ end_declaration = len(class_decl_match.group(1)) + if not self.InTemplateArgumentList(clean_lines, linenum, + end_declaration): + self.stack.append( + _ClassInfo( + class_decl_match.group(3), + class_decl_match.group(2), clean_lines, linenum)) + line = class_decl_match.group(4) + + # If we have not yet seen the opening brace for the innermost block, + # run checks here. + if not self.SeenOpenBrace(): + self.stack[-1].CheckBegin(filename, clean_lines, linenum, error) + + # Update access control if we are inside a class/struct + if self.stack and isinstance(self.stack[-1], _ClassInfo): + classinfo = self.stack[-1] + access_match = Match( + r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?' + r':(?:[^:]|$)', line) + if access_match: + classinfo.access = access_match.group(2) + + # Check that access keywords are indented +1 space. Skip this + # check if the keywords are not preceded by whitespaces. + indent = access_match.group(1) + if (len(indent) != classinfo.class_indent + 1 and + Match(r'^\s*$', indent)): + if classinfo.is_struct: + parent = 'struct ' + classinfo.name + else: + parent = 'class ' + classinfo.name + slots = '' + if access_match.group(3): + slots = access_match.group(3) + error(filename, linenum, 'whitespace/indent', 3, + '%s%s: should be indented +1 space inside %s' % ( + access_match.group(2), slots, parent)) + + # Consume braces or semicolons from what's left of the line + while True: + # Match first brace, semicolon, or closed parenthesis. + matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line) + if not matched: + break + + token = matched.group(1) + if token == '{': + # If namespace or class hasn't seen a opening brace yet, mark + # namespace/class head as complete. Push a new block onto the + # stack otherwise. + if not self.SeenOpenBrace(): + self.stack[-1].seen_open_brace = True + elif Match(r'^extern\s*"[^"]*"\s*\{', line): + self.stack.append(_ExternCInfo()) + else: + self.stack.append(_BlockInfo(True)) + if _MATCH_ASM.match(line): + self.stack[-1].inline_asm = _BLOCK_ASM + + elif token == ';' or token == ')': + # If we haven't seen an opening brace yet, but we already saw + # a semicolon, this is probably a forward declaration. Pop + # the stack for these. + # + # Similarly, if we haven't seen an opening brace yet, but we + # already saw a closing parenthesis, then these are probably + # function arguments with extra "class" or "struct" keywords. + # Also pop these stack for these. + if not self.SeenOpenBrace(): + self.stack.pop() + else: # token == '}' + # Perform end of block checks and pop the stack. + if self.stack: + self.stack[-1].CheckEnd(filename, clean_lines, linenum, + error) + self.stack.pop() + line = matched.group(2) + + def InnermostClass(self): + """Get class info on the top of the stack. Returns: A _ClassInfo object if we are inside a class, or None otherwise. """ - for i in range(len(self.stack), 0, -1): - classinfo = self.stack[i - 1] - if isinstance(classinfo, _ClassInfo): - return classinfo - return None + for i in range(len(self.stack), 0, -1): + classinfo = self.stack[i - 1] + if isinstance(classinfo, _ClassInfo): + return classinfo + return None - def CheckCompletedBlocks(self, filename, error): - """Checks that all classes and namespaces have been completely parsed. + def CheckCompletedBlocks(self, filename, error): + """Checks that all classes and namespaces have been completely parsed. Call this when all lines in a file have been processed. Args: filename: The name of the current file. 
error: The function to call with any errors found. """ - # Note: This test can result in false positives if #ifdef constructs - # get in the way of brace matching. See the testBuildClass test in - # cpplint_unittest.py for an example of this. - for obj in self.stack: - if isinstance(obj, _ClassInfo): - error(filename, obj.starting_linenum, 'build/class', 5, - 'Failed to find complete declaration of class %s' % - obj.name) - elif isinstance(obj, _NamespaceInfo): - error(filename, obj.starting_linenum, 'build/namespaces', 5, - 'Failed to find complete declaration of namespace %s' % - obj.name) - - -def CheckForNonStandardConstructs(filename, clean_lines, linenum, - nesting_state, error): - r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2. + # Note: This test can result in false positives if #ifdef constructs + # get in the way of brace matching. See the testBuildClass test in + # cpplint_unittest.py for an example of this. + for obj in self.stack: + if isinstance(obj, _ClassInfo): + error(filename, obj.starting_linenum, 'build/class', 5, + 'Failed to find complete declaration of class %s' % + obj.name) + elif isinstance(obj, _NamespaceInfo): + error(filename, obj.starting_linenum, 'build/namespaces', 5, + 'Failed to find complete declaration of namespace %s' % + obj.name) + + +def CheckForNonStandardConstructs(filename, clean_lines, linenum, nesting_state, + error): + r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2. Complain about several constructs which gcc-2 accepts, but which are not standard C++. Warning about these in lint is one way to ease the @@ -2608,143 +2626,144 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, filename, line number, error level, and message """ - # Remove comments from the line, but leave in strings for now. - line = clean_lines.lines[linenum] - - if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line): - error(filename, linenum, 'runtime/printf_format', 3, - '%q in format strings is deprecated. Use %ll instead.') - - if Search(r'printf\s*\(.*".*%\d+\$', line): - error(filename, linenum, 'runtime/printf_format', 2, - '%N$ formats are unconventional. Try rewriting to avoid them.') - - # Remove escaped backslashes before looking for undefined escapes. - line = line.replace('\\\\', '') - - if Search(r'("|\').*\\(%|\[|\(|{)', line): - error(filename, linenum, 'build/printf_format', 3, - '%, [, (, and { are undefined character escapes. Unescape them.') - - # For the rest, work with both comments and strings removed. - line = clean_lines.elided[linenum] - - if Search(r'\b(const|volatile|void|char|short|int|long' - r'|float|double|signed|unsigned' - r'|schar|u?int8|u?int16|u?int32|u?int64)' - r'\s+(register|static|extern|typedef)\b', - line): - error(filename, linenum, 'build/storage_class', 5, - 'Storage class (static, extern, typedef, etc) should be first.') - - if Match(r'\s*#\s*endif\s*[^/\s]+', line): - error(filename, linenum, 'build/endif_comment', 5, - 'Uncommented text after #endif is non-standard. Use a comment.') - - if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line): - error(filename, linenum, 'build/forward_decl', 5, - 'Inner-style forward declarations are invalid. Remove this line.') - - if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?', - line): - error(filename, linenum, 'build/deprecated', 3, - '>? and ))?' - # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;' - error(filename, linenum, 'runtime/member_string_references', 2, - 'const string& members are dangerous. 
It is much better to use ' - 'alternatives, such as pointers or simple constants.') - - # Everything else in this function operates on class declarations. - # Return early if the top of the nesting stack is not a class, or if - # the class head is not completed yet. - classinfo = nesting_state.InnermostClass() - if not classinfo or not classinfo.seen_open_brace: - return - - # The class may have been declared with namespace or classname qualifiers. - # The constructor and destructor will not have those qualifiers. - base_classname = classinfo.name.split('::')[-1] - - # Look for single-argument constructors that aren't marked explicit. - # Technically a valid construct, but against style. Also look for - # non-single-argument constructors which are also technically valid, but - # strongly suggest something is wrong. - explicit_constructor_match = Match( - r'\s+(?:inline\s+)?(explicit\s+)?(?:inline\s+)?%s\s*' - r'\(((?:[^()]|\([^()]*\))*)\)' - % re.escape(base_classname), - line) - - if explicit_constructor_match: - is_marked_explicit = explicit_constructor_match.group(1) - - if not explicit_constructor_match.group(2): - constructor_args = [] - else: - constructor_args = explicit_constructor_match.group(2).split(',') - - # collapse arguments so that commas in template parameter lists and function - # argument parameter lists don't split arguments in two - i = 0 - while i < len(constructor_args): - constructor_arg = constructor_args[i] - while (constructor_arg.count('<') > constructor_arg.count('>') or - constructor_arg.count('(') > constructor_arg.count(')')): - constructor_arg += ',' + constructor_args[i + 1] - del constructor_args[i + 1] - constructor_args[i] = constructor_arg - i += 1 - - defaulted_args = [arg for arg in constructor_args if '=' in arg] - noarg_constructor = (not constructor_args or # empty arg list - # 'void' arg specifier - (len(constructor_args) == 1 and - constructor_args[0].strip() == 'void')) - onearg_constructor = ((len(constructor_args) == 1 and # exactly one arg - not noarg_constructor) or - # all but at most one arg defaulted - (len(constructor_args) >= 1 and - not noarg_constructor and - len(defaulted_args) >= len(constructor_args) - 1)) - initializer_list_constructor = bool( - onearg_constructor and - Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0])) - copy_constructor = bool( - onearg_constructor and - Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' - % re.escape(base_classname), constructor_args[0].strip())) - - if (not is_marked_explicit and - onearg_constructor and - not initializer_list_constructor and - not copy_constructor): - if defaulted_args: - error(filename, linenum, 'runtime/explicit', 5, - 'Constructors callable with one argument ' - 'should be marked explicit.') - else: - error(filename, linenum, 'runtime/explicit', 5, - 'Single-parameter constructors should be marked explicit.') - elif is_marked_explicit and not onearg_constructor: - if noarg_constructor: - error(filename, linenum, 'runtime/explicit', 5, - 'Zero-parameter constructors should not be marked explicit.') - else: - error(filename, linenum, 'runtime/explicit', 0, - 'Constructors that require multiple arguments ' - 'should not be marked explicit.') + # Remove comments from the line, but leave in strings for now. + line = clean_lines.lines[linenum] + + if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line): + error(filename, linenum, 'runtime/printf_format', 3, + '%q in format strings is deprecated. 
Use %ll instead.') + + if Search(r'printf\s*\(.*".*%\d+\$', line): + error(filename, linenum, 'runtime/printf_format', 2, + '%N$ formats are unconventional. Try rewriting to avoid them.') + + # Remove escaped backslashes before looking for undefined escapes. + line = line.replace('\\\\', '') + + if Search(r'("|\').*\\(%|\[|\(|{)', line): + error(filename, linenum, 'build/printf_format', 3, + '%, [, (, and { are undefined character escapes. Unescape them.') + + # For the rest, work with both comments and strings removed. + line = clean_lines.elided[linenum] + + if Search(r'\b(const|volatile|void|char|short|int|long' + r'|float|double|signed|unsigned' + r'|schar|u?int8|u?int16|u?int32|u?int64)' + r'\s+(register|static|extern|typedef)\b', line): + error(filename, linenum, 'build/storage_class', 5, + 'Storage class (static, extern, typedef, etc) should be first.') + + if Match(r'\s*#\s*endif\s*[^/\s]+', line): + error(filename, linenum, 'build/endif_comment', 5, + 'Uncommented text after #endif is non-standard. Use a comment.') + + if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line): + error( + filename, linenum, 'build/forward_decl', 5, + 'Inner-style forward declarations are invalid. Remove this line.') + + if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?', + line): + error( + filename, linenum, 'build/deprecated', 3, + '>? and ))?' + # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;' + error(filename, linenum, 'runtime/member_string_references', 2, + 'const string& members are dangerous. It is much better to use ' + 'alternatives, such as pointers or simple constants.') + + # Everything else in this function operates on class declarations. + # Return early if the top of the nesting stack is not a class, or if + # the class head is not completed yet. + classinfo = nesting_state.InnermostClass() + if not classinfo or not classinfo.seen_open_brace: + return + + # The class may have been declared with namespace or classname qualifiers. + # The constructor and destructor will not have those qualifiers. + base_classname = classinfo.name.split('::')[-1] + + # Look for single-argument constructors that aren't marked explicit. + # Technically a valid construct, but against style. Also look for + # non-single-argument constructors which are also technically valid, but + # strongly suggest something is wrong. 
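
The single-argument-constructor pattern built just below can be exercised on its own; a sketch with an arbitrary class name standing in for base_classname:

import re

base = 'Matrix'   # stand-in for base_classname
ctor_pat = (r'\s+(?:inline\s+)?(explicit\s+)?(?:inline\s+)?%s\s*'
            r'\(((?:[^()]|\([^()]*\))*)\)' % re.escape(base))
m = re.match(ctor_pat, '  explicit Matrix(size_t height);')
assert m and m.group(1)                  # marked explicit
assert m.group(2) == 'size_t height'     # single argument, so no warning
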
+ explicit_constructor_match = Match( + r'\s+(?:inline\s+)?(explicit\s+)?(?:inline\s+)?%s\s*' + r'\(((?:[^()]|\([^()]*\))*)\)' % re.escape(base_classname), line) + + if explicit_constructor_match: + is_marked_explicit = explicit_constructor_match.group(1) + + if not explicit_constructor_match.group(2): + constructor_args = [] + else: + constructor_args = explicit_constructor_match.group(2).split(',') + + # collapse arguments so that commas in template parameter lists and function + # argument parameter lists don't split arguments in two + i = 0 + while i < len(constructor_args): + constructor_arg = constructor_args[i] + while (constructor_arg.count('<') > constructor_arg.count('>') or + constructor_arg.count('(') > constructor_arg.count(')')): + constructor_arg += ',' + constructor_args[i + 1] + del constructor_args[i + 1] + constructor_args[i] = constructor_arg + i += 1 + + defaulted_args = [arg for arg in constructor_args if '=' in arg] + noarg_constructor = ( + not constructor_args or # empty arg list + # 'void' arg specifier + (len(constructor_args) == 1 and + constructor_args[0].strip() == 'void')) + onearg_constructor = ( + ( + len(constructor_args) == 1 and # exactly one arg + not noarg_constructor) or + # all but at most one arg defaulted + (len(constructor_args) >= 1 and not noarg_constructor and + len(defaulted_args) >= len(constructor_args) - 1)) + initializer_list_constructor = bool( + onearg_constructor and + Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0])) + copy_constructor = bool( + onearg_constructor and + Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' % + re.escape(base_classname), constructor_args[0].strip())) + + if (not is_marked_explicit and onearg_constructor and + not initializer_list_constructor and not copy_constructor): + if defaulted_args: + error(filename, linenum, 'runtime/explicit', 5, + 'Constructors callable with one argument ' + 'should be marked explicit.') + else: + error( + filename, linenum, 'runtime/explicit', 5, + 'Single-parameter constructors should be marked explicit.') + elif is_marked_explicit and not onearg_constructor: + if noarg_constructor: + error( + filename, linenum, 'runtime/explicit', 5, + 'Zero-parameter constructors should not be marked explicit.') + else: + error(filename, linenum, 'runtime/explicit', 0, + 'Constructors that require multiple arguments ' + 'should not be marked explicit.') def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): - """Checks for the correctness of various spacing around function calls. + """Checks for the correctness of various spacing around function calls. Args: filename: The name of the current file. @@ -2752,75 +2771,74 @@ def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Since function calls often occur inside if/for/while/switch - # expressions - which have their own, more liberal conventions - we - # first see if we should be looking inside such an expression for a - # function call, to which we can apply more strict standards. 
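The single-argument-constructor rule above can be exercised in isolation; this is a minimal sketch in which the class name Foo and both declarations are invented for illustration (cpplint's Match is re.match under the hood):

import re

base_classname = 'Foo'  # hypothetical class under inspection
ctor_re = (r'\s+(?:inline\s+)?(explicit\s+)?(?:inline\s+)?%s\s*'
           r'\(((?:[^()]|\([^()]*\))*)\)' % re.escape(base_classname))

flagged = re.match(ctor_re, '  Foo(int x);')          # one arg, no 'explicit'
marked = re.match(ctor_re, '  explicit Foo(int x);')  # correctly marked
assert flagged and not flagged.group(1)  # group(1) captures the 'explicit' keyword
assert marked and marked.group(1)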
- fncall = line # if there's no control flow construct, look at whole line - for pattern in (r'\bif\s*\((.*)\)\s*{', - r'\bfor\s*\((.*)\)\s*{', - r'\bwhile\s*\((.*)\)\s*[{;]', - r'\bswitch\s*\((.*)\)\s*{'): - match = Search(pattern, line) - if match: - fncall = match.group(1) # look inside the parens for function calls - break - - # Except in if/for/while/switch, there should never be space - # immediately inside parens (eg "f( 3, 4 )"). We make an exception - # for nested parens ( (a+b) + c ). Likewise, there should never be - # a space before a ( when it's a function argument. I assume it's a - # function argument when the char before the whitespace is legal in - # a function name (alnum + _) and we're not starting a macro. Also ignore - # pointers and references to arrays and functions coz they're too tricky: - # we use a very simple way to recognize these: - # " (something)(maybe-something)" or - # " (something)(maybe-something," or - # " (something)[something]" - # Note that we assume the contents of [] to be short enough that - # they'll never need to wrap. - if ( # Ignore control structures. - not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', - fncall) and - # Ignore pointers/references to functions. - not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and - # Ignore pointers/references to arrays. - not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): - if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space after ( in function call') - elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): - error(filename, linenum, 'whitespace/parens', 2, - 'Extra space after (') - if (Search(r'\w\s+\(', fncall) and - not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and - not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and - not Search(r'\bcase\s+\(', fncall)): - # TODO(unknown): Space after an operator function seem to be a common - # error, silence those for now by restricting them to highest verbosity. - if Search(r'\boperator_*\b', line): - error(filename, linenum, 'whitespace/parens', 0, - 'Extra space before ( in function call') - else: - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space before ( in function call') - # If the ) is followed only by a newline or a { + newline, assume it's - # part of a control statement (if/while/etc), and don't complain - if Search(r'[^)]\s+\)\s*[^{\s]', fncall): - # If the closing parenthesis is preceded by only whitespaces, - # try to give a more descriptive error message. - if Search(r'^\s+\)', fncall): - error(filename, linenum, 'whitespace/parens', 2, - 'Closing ) should be moved to the previous line') - else: - error(filename, linenum, 'whitespace/parens', 2, - 'Extra space before )') + line = clean_lines.elided[linenum] + + # Since function calls often occur inside if/for/while/switch + # expressions - which have their own, more liberal conventions - we + # first see if we should be looking inside such an expression for a + # function call, to which we can apply more strict standards. + fncall = line # if there's no control flow construct, look at whole line + for pattern in (r'\bif\s*\((.*)\)\s*{', r'\bfor\s*\((.*)\)\s*{', + r'\bwhile\s*\((.*)\)\s*[{;]', r'\bswitch\s*\((.*)\)\s*{'): + match = Search(pattern, line) + if match: + fncall = match.group(1) # look inside the parens for function calls + break + + # Except in if/for/while/switch, there should never be space + # immediately inside parens (eg "f( 3, 4 )"). 
We make an exception + # for nested parens ( (a+b) + c ). Likewise, there should never be + # a space before a ( when it's a function argument. I assume it's a + # function argument when the char before the whitespace is legal in + # a function name (alnum + _) and we're not starting a macro. Also ignore + # pointers and references to arrays and functions coz they're too tricky: + # we use a very simple way to recognize these: + # " (something)(maybe-something)" or + # " (something)(maybe-something," or + # " (something)[something]" + # Note that we assume the contents of [] to be short enough that + # they'll never need to wrap. + if ( # Ignore control structures. + not Search( + r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', + fncall) and + # Ignore pointers/references to functions. + not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and + # Ignore pointers/references to arrays. + not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): + if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space after ( in function call') + elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space after (') + if (Search(r'\w\s+\(', fncall) and + not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and + not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and + not Search(r'\bcase\s+\(', fncall)): + # TODO(unknown): Space after an operator function seem to be a common + # error, silence those for now by restricting them to highest verbosity. + if Search(r'\boperator_*\b', line): + error(filename, linenum, 'whitespace/parens', 0, + 'Extra space before ( in function call') + else: + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') + # If the ) is followed only by a newline or a { + newline, assume it's + # part of a control statement (if/while/etc), and don't complain + if Search(r'[^)]\s+\)\s*[^{\s]', fncall): + # If the closing parenthesis is preceded by only whitespaces, + # try to give a more descriptive error message. + if Search(r'^\s+\)', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Closing ) should be moved to the previous line') + else: + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space before )') def IsBlankLine(line): - """Returns true if the given line is blank. + """Returns true if the given line is blank. We consider a line to be blank if the line is empty or consists of only white spaces. @@ -2831,26 +2849,26 @@ def IsBlankLine(line): Returns: True, if the given line is blank. 
""" - return not line or line.isspace() + return not line or line.isspace() def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, error): - is_namespace_indent_item = ( - len(nesting_state.stack) > 1 and - nesting_state.stack[-1].check_namespace_indentation and - isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and - nesting_state.previous_stack_top == nesting_state.stack[-2]) + is_namespace_indent_item = ( + len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and + nesting_state.previous_stack_top == nesting_state.stack[-2]) - if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, - clean_lines.elided, line): - CheckItemIndentationInNamespace(filename, clean_lines.elided, - line, error) + if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + clean_lines.elided, line): + CheckItemIndentationInNamespace(filename, clean_lines.elided, line, + error) -def CheckForFunctionLengths(filename, clean_lines, linenum, - function_state, error): - """Reports for long function bodies. +def CheckForFunctionLengths(filename, clean_lines, linenum, function_state, + error): + """Reports for long function bodies. For an overview why this is done, see: http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions @@ -2871,56 +2889,57 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, function_state: Current function name and lines in body so far. error: The function to call with any errors found. """ - lines = clean_lines.lines - line = lines[linenum] - joined_line = '' - - starting_func = False - regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... - match_result = Match(regexp, line) - if match_result: - # If the name is all caps and underscores, figure it's a macro and - # ignore it, unless it's TEST or TEST_F. - function_name = match_result.group(1).split()[-1] - if function_name == 'TEST' or function_name == 'TEST_F' or ( - not Match(r'[A-Z_]+$', function_name)): - starting_func = True - - if starting_func: - body_found = False - for start_linenum in xrange(linenum, clean_lines.NumLines()): - start_line = lines[start_linenum] - joined_line += ' ' + start_line.lstrip() - if Search(r'(;|})', start_line): # Declarations and trivial functions - body_found = True - break # ... ignore - elif Search(r'{', start_line): - body_found = True - function = Search(r'((\w|:)*)\(', line).group(1) - if Match(r'TEST', function): # Handle TEST... macros - parameter_regexp = Search(r'(\(.*\))', joined_line) - if parameter_regexp: # Ignore bad syntax - function += parameter_regexp.group(1) - else: - function += '()' - function_state.Begin(function) - break - if not body_found: - # No body for the function (or evidence of a non-function) was found. - error(filename, linenum, 'readability/fn_size', 5, - 'Lint failed to find start of function body.') - elif Match(r'^\}\s*$', line): # function end - function_state.Check(error, filename, linenum) - function_state.End() - elif not Match(r'^\s*$', line): - function_state.Count() # Count non-blank/non-comment lines. + lines = clean_lines.lines + line = lines[linenum] + joined_line = '' + + starting_func = False + regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... + match_result = Match(regexp, line) + if match_result: + # If the name is all caps and underscores, figure it's a macro and + # ignore it, unless it's TEST or TEST_F. 
+ function_name = match_result.group(1).split()[-1] + if function_name == 'TEST' or function_name == 'TEST_F' or ( + not Match(r'[A-Z_]+$', function_name)): + starting_func = True + + if starting_func: + body_found = False + for start_linenum in xrange(linenum, clean_lines.NumLines()): + start_line = lines[start_linenum] + joined_line += ' ' + start_line.lstrip() + if Search(r'(;|})', + start_line): # Declarations and trivial functions + body_found = True + break # ... ignore + elif Search(r'{', start_line): + body_found = True + function = Search(r'((\w|:)*)\(', line).group(1) + if Match(r'TEST', function): # Handle TEST... macros + parameter_regexp = Search(r'(\(.*\))', joined_line) + if parameter_regexp: # Ignore bad syntax + function += parameter_regexp.group(1) + else: + function += '()' + function_state.Begin(function) + break + if not body_found: + # No body for the function (or evidence of a non-function) was found. + error(filename, linenum, 'readability/fn_size', 5, + 'Lint failed to find start of function body.') + elif Match(r'^\}\s*$', line): # function end + function_state.Check(error, filename, linenum) + function_state.End() + elif not Match(r'^\s*$', line): + function_state.Count() # Count non-blank/non-comment lines. _RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') def CheckComment(line, filename, linenum, next_line_start, error): - """Checks for common mistakes in comments. + """Checks for common mistakes in comments. Args: line: The line in question. @@ -2929,54 +2948,54 @@ def CheckComment(line, filename, linenum, next_line_start, error): next_line_start: The first non-whitespace column of the next line. error: The function to call with any errors found. """ - commentpos = line.find('//') - if commentpos != -1: - # Check if the // may be in quotes. If so, ignore it - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if (line.count('"', 0, commentpos) - - line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes - # Allow one space for new scopes, two spaces otherwise: - if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and - ((commentpos >= 1 and - line[commentpos-1] not in string.whitespace) or - (commentpos >= 2 and - line[commentpos-2] not in string.whitespace))): - error(filename, linenum, 'whitespace/comments', 2, - 'At least two spaces is best between code and comments') - - # Checks for common mistakes in TODO comments. - comment = line[commentpos:] - match = _RE_PATTERN_TODO.match(comment) - if match: - # One whitespace is correct; zero whitespace is handled elsewhere. - leading_whitespace = match.group(1) - if len(leading_whitespace) > 1: - error(filename, linenum, 'whitespace/todo', 2, - 'Too many spaces before TODO') - - username = match.group(2) - if not username: - error(filename, linenum, 'readability/todo', 2, - 'Missing username in TODO; it should look like ' - '"// TODO(my_username): Stuff."') - - middle_whitespace = match.group(3) - # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison - if middle_whitespace != ' ' and middle_whitespace != '': - error(filename, linenum, 'whitespace/todo', 2, - 'TODO(my_username) should be followed by a space') - - # If the comment contains an alphanumeric character, there - # should be a space somewhere between it and the // unless - # it's a /// or //! Doxygen comment. 
- if (Match(r'//[^ ]*\w', comment) and - not Match(r'(///|//\!)(\s+|$)', comment)): - error(filename, linenum, 'whitespace/comments', 4, - 'Should have a space between // and comment') + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if (line.count('"', 0, commentpos) - line.count('\\"', 0, commentpos) + ) % 2 == 0: # not in quotes + # Allow one space for new scopes, two spaces otherwise: + if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) + and ((commentpos >= 1 and + line[commentpos - 1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos - 2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') + + # Checks for common mistakes in TODO comments. + comment = line[commentpos:] + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + + # If the comment contains an alphanumeric character, there + # should be a space somewhere between it and the // unless + # it's a /// or //! Doxygen comment. + if (Match(r'//[^ ]*\w', comment) and + not Match(r'(///|//\!)(\s+|$)', comment)): + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') def CheckAccess(filename, clean_lines, linenum, nesting_state, error): - """Checks for improper use of DISALLOW* macros. + """Checks for improper use of DISALLOW* macros. Args: filename: The name of the current file. @@ -2986,27 +3005,27 @@ def CheckAccess(filename, clean_lines, linenum, nesting_state, error): the current stack of nested blocks being parsed. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] # get rid of comments and strings - - matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' - r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) - if not matched: - return - if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): - if nesting_state.stack[-1].access != 'private': - error(filename, linenum, 'readability/constructors', 3, - '%s must be in the private: section' % matched.group(1)) - - else: - # Found DISALLOW* macro outside a class declaration, or perhaps it - # was used inside a function when it should have been part of the - # class declaration. We could issue a warning here, but it - # probably resulted in a compiler error already. 
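The TODO-comment rules above all hang off the one compiled pattern; a minimal sketch with invented comments:

import re

todo = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')

m = todo.match('// TODO(my_username): Stuff.')
assert m and m.group(2) == '(my_username)' and m.group(3) == ' '

m = todo.match('// TODO fix this')   # no username -> 'readability/todo'
assert m and m.group(2) is None

m = todo.match('//    TODO(me): x')  # wide leading gap -> 'whitespace/todo'
assert m and len(m.group(1)) > 1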
- pass + line = clean_lines.elided[linenum] # get rid of comments and strings + + matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' + r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) + if not matched: + return + if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): + if nesting_state.stack[-1].access != 'private': + error(filename, linenum, 'readability/constructors', 3, + '%s must be in the private: section' % matched.group(1)) + + else: + # Found DISALLOW* macro outside a class declaration, or perhaps it + # was used inside a function when it should have been part of the + # class declaration. We could issue a warning here, but it + # probably resulted in a compiler error already. + pass def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): - """Checks for the correctness of various spacing issues in the code. + """Checks for the correctness of various spacing issues in the code. Things we check for: spaces around operators, spaces after if/for/while/switch, no spaces around parens in function calls, two @@ -3023,118 +3042,114 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error: The function to call with any errors found. """ - # Don't use "elided" lines here, otherwise we can't check commented lines. - # Don't want to use "raw" either, because we don't want to check inside C++11 - # raw strings, - raw = clean_lines.lines_without_raw_strings - line = raw[linenum] - - # Before nixing comments, check if the line is blank for no good - # reason. This includes the first line after a block is opened, and - # blank lines at the end of a function (ie, right before a line like '}' - # - # Skip all the blank line checks if we are immediately inside a - # namespace body. In other words, don't issue blank line warnings - # for this block: - # namespace { - # - # } - # - # A warning about missing end of namespace comments will be issued instead. - # - # Also skip blank line checks for 'extern "C"' blocks, which are formatted - # like namespaces. - if (IsBlankLine(line) and - not nesting_state.InNamespaceBody() and - not nesting_state.InExternC()): - elided = clean_lines.elided - prev_line = elided[linenum - 1] - prevbrace = prev_line.rfind('{') - # TODO(unknown): Don't complain if line before blank line, and line after, - # both start with alnums and are indented the same amount. - # This ignores whitespace at the start of a namespace block - # because those are not usually indented. - if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: - # OK, we have a blank line at the start of a code block. Before we - # complain, we check if it is an exception to the rule: The previous - # non-empty line has the parameters of a function header that are indented - # 4 spaces (because they did not fit in a 80 column line when placed on - # the same line as the function name). We also check for the case where - # the previous line is indented 6 spaces, which may happen when the - # initializers of a constructor do not fit into a 80 column line. - exception = False - if Match(r' {6}\w', prev_line): # Initializer list? - # We are looking for the opening column of initializer list, which - # should be indented 4 spaces to cause 6 space indentation afterwards. - search_position = linenum-2 - while (search_position >= 0 - and Match(r' {6}\w', elided[search_position])): - search_position -= 1 - exception = (search_position >= 0 - and elided[search_position][:5] == ' :') - else: - # Search for the function arguments or an initializer list. 
We use a - # simple heuristic here: If the line is indented 4 spaces; and we have a - # closing paren, without the opening paren, followed by an opening brace - # or colon (for initializer lists) we assume that it is the last line of - # a function header. If we have a colon indented 4 spaces, it is an - # initializer list. - exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', - prev_line) - or Match(r' {4}:', prev_line)) - - if not exception: - error(filename, linenum, 'whitespace/blank_line', 2, - 'Redundant blank line at the start of a code block ' - 'should be deleted.') - # Ignore blank lines at the end of a block in a long if-else - # chain, like this: - # if (condition1) { - # // Something followed by a blank line + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw = clean_lines.lines_without_raw_strings + line = raw[linenum] + + # Before nixing comments, check if the line is blank for no good + # reason. This includes the first line after a block is opened, and + # blank lines at the end of a function (ie, right before a line like '}' + # + # Skip all the blank line checks if we are immediately inside a + # namespace body. In other words, don't issue blank line warnings + # for this block: + # namespace { # - # } else if (condition2) { - # // Something else # } + # + # A warning about missing end of namespace comments will be issued instead. + # + # Also skip blank line checks for 'extern "C"' blocks, which are formatted + # like namespaces. + if (IsBlankLine(line) and not nesting_state.InNamespaceBody() and + not nesting_state.InExternC()): + elided = clean_lines.elided + prev_line = elided[linenum - 1] + prevbrace = prev_line.rfind('{') + # TODO(unknown): Don't complain if line before blank line, and line after, + # both start with alnums and are indented the same amount. + # This ignores whitespace at the start of a namespace block + # because those are not usually indented. + if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: + # OK, we have a blank line at the start of a code block. Before we + # complain, we check if it is an exception to the rule: The previous + # non-empty line has the parameters of a function header that are indented + # 4 spaces (because they did not fit in a 80 column line when placed on + # the same line as the function name). We also check for the case where + # the previous line is indented 6 spaces, which may happen when the + # initializers of a constructor do not fit into a 80 column line. + exception = False + if Match(r' {6}\w', prev_line): # Initializer list? + # We are looking for the opening column of initializer list, which + # should be indented 4 spaces to cause 6 space indentation afterwards. + search_position = linenum - 2 + while (search_position >= 0 and + Match(r' {6}\w', elided[search_position])): + search_position -= 1 + exception = (search_position >= 0 and + elided[search_position][:5] == ' :') + else: + # Search for the function arguments or an initializer list. We use a + # simple heuristic here: If the line is indented 4 spaces; and we have a + # closing paren, without the opening paren, followed by an opening brace + # or colon (for initializer lists) we assume that it is the last line of + # a function header. If we have a colon indented 4 spaces, it is an + # initializer list. 
+ exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', + prev_line) or Match(r' {4}:', prev_line)) + + if not exception: + error(filename, linenum, 'whitespace/blank_line', 2, + 'Redundant blank line at the start of a code block ' + 'should be deleted.') + # Ignore blank lines at the end of a block in a long if-else + # chain, like this: + # if (condition1) { + # // Something followed by a blank line + # + # } else if (condition2) { + # // Something else + # } + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + if (next_line and Match(r'\s*}', next_line) and + next_line.find('} else ') == -1): + error(filename, linenum, 'whitespace/blank_line', 3, + 'Redundant blank line at the end of a code block ' + 'should be deleted.') + + matched = Match(r'\s*(public|protected|private):', prev_line) + if matched: + error(filename, linenum, 'whitespace/blank_line', 3, + 'Do not leave a blank line after "%s:"' % matched.group(1)) + + # Next, check comments + next_line_start = 0 if linenum + 1 < clean_lines.NumLines(): - next_line = raw[linenum + 1] - if (next_line - and Match(r'\s*}', next_line) - and next_line.find('} else ') == -1): - error(filename, linenum, 'whitespace/blank_line', 3, - 'Redundant blank line at the end of a code block ' - 'should be deleted.') - - matched = Match(r'\s*(public|protected|private):', prev_line) - if matched: - error(filename, linenum, 'whitespace/blank_line', 3, - 'Do not leave a blank line after "%s:"' % matched.group(1)) - - # Next, check comments - next_line_start = 0 - if linenum + 1 < clean_lines.NumLines(): - next_line = raw[linenum + 1] - next_line_start = len(next_line) - len(next_line.lstrip()) - CheckComment(line, filename, linenum, next_line_start, error) + next_line = raw[linenum + 1] + next_line_start = len(next_line) - len(next_line.lstrip()) + CheckComment(line, filename, linenum, next_line_start, error) - # get rid of comments and strings - line = clean_lines.elided[linenum] + # get rid of comments and strings + line = clean_lines.elided[linenum] - # You shouldn't have spaces before your brackets, except maybe after - # 'delete []' or 'return []() {};' - if Search(r'\w\s+\[', line) and not Search(r'(?:delete|return)\s+\[', line): - error(filename, linenum, 'whitespace/braces', 5, - 'Extra space before [') + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []' or 'return []() {};' + if Search(r'\w\s+\[', line) and not Search(r'(?:delete|return)\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, 'Extra space before [') - # In range-based for, we wanted spaces before and after the colon, but - # not around "::" tokens that might appear. - if (Search(r'for *\(.*[^:]:[^: ]', line) or - Search(r'for *\(.*[^: ]:[^:]', line)): - error(filename, linenum, 'whitespace/forcolon', 2, - 'Missing space around colon in range-based for loop') + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search(r'for *\(.*[^:]:[^: ]', line) or + Search(r'for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') def CheckOperatorSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing around operators. + """Checks for horizontal spacing around operators. Args: filename: The name of the current file. 
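The two exceptions in the blank-line check above reduce to small indentation probes; a sketch with invented wrapped-header and initializer lines:

import re

# A previous line indented six spaces suggests a wrapped constructor
# initializer, which exempts the following blank line from the warning.
assert re.match(r' {6}\w', '      count_(0),')

# A four-space-indented line ending in ')' followed by '{' or ':' is taken
# to be the last line of a wrapped function header.
assert re.match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', '    int b) const {')
assert re.match(r' {4}:', '    : member_(0) {')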
@@ -3142,114 +3157,116 @@ def CheckOperatorSpacing(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Don't try to do spacing checks for operator methods. Do this by - # replacing the troublesome characters with something else, - # preserving column position for all other characters. - # - # The replacement is done repeatedly to avoid false positives from - # operators that call operators. - while True: - match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line) - if match: - line = match.group(1) + ('_' * len(match.group(2))) + match.group(3) - else: - break - - # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". - # Otherwise not. Note we only check for non-spaces on *both* sides; - # sometimes people put non-spaces on one side when aligning ='s among - # many lines (not that this is behavior that I approve of...) - if ((Search(r'[\w.]=', line) or - Search(r'=[\w.]', line)) - and not Search(r'\b(if|while|for) ', line) - # Operators taken from [lex.operators] in C++11 standard. - and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line) - and not Search(r'operator=', line)): - error(filename, linenum, 'whitespace/operators', 4, - 'Missing spaces around =') - - # It's ok not to have spaces around binary operators like + - * /, but if - # there's too little whitespace, we get concerned. It's hard to tell, - # though, so we punt on this one for now. TODO. - - # You should always have whitespace around binary operators. - # - # Check <= and >= first to avoid false positives with < and >, then - # check non-include lines for spacing around < and >. - # - # If the operator is followed by a comma, assume it's be used in a - # macro context and don't do any checks. This avoids false - # positives. - # - # Note that && is not included here. Those are checked separately - # in CheckRValueReference - match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line) - if match: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around %s' % match.group(1)) - elif not Match(r'#.*include', line): - # Look for < that is not surrounded by spaces. This is only - # triggered if both sides are missing spaces, even though - # technically should should flag if at least one side is missing a - # space. This is done to avoid some false positives with shifts. - match = Match(r'^(.*[^\s<])<[^\s=<,]', line) + line = clean_lines.elided[linenum] + + # Don't try to do spacing checks for operator methods. Do this by + # replacing the troublesome characters with something else, + # preserving column position for all other characters. + # + # The replacement is done repeatedly to avoid false positives from + # operators that call operators. + while True: + match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line) + if match: + line = match.group(1) + ('_' * len(match.group(2))) + match.group(3) + else: + break + + # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". + # Otherwise not. Note we only check for non-spaces on *both* sides; + # sometimes people put non-spaces on one side when aligning ='s among + # many lines (not that this is behavior that I approve of...) + if ((Search(r'[\w.]=', line) or + Search(r'=[\w.]', line)) and not Search(r'\b(if|while|for) ', line) + # Operators taken from [lex.operators] in C++11 standard. 
+ and + not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line) and + not Search(r'operator=', line)): + error(filename, linenum, 'whitespace/operators', 4, + 'Missing spaces around =') + + # It's ok not to have spaces around binary operators like + - * /, but if + # there's too little whitespace, we get concerned. It's hard to tell, + # though, so we punt on this one for now. TODO. + + # You should always have whitespace around binary operators. + # + # Check <= and >= first to avoid false positives with < and >, then + # check non-include lines for spacing around < and >. + # + # If the operator is followed by a comma, assume it's be used in a + # macro context and don't do any checks. This avoids false + # positives. + # + # Note that && is not included here. Those are checked separately + # in CheckRValueReference + match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line) if match: - (_, _, end_pos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - if end_pos <= -1: error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <') + 'Missing spaces around %s' % match.group(1)) + elif not Match(r'#.*include', line): + # Look for < that is not surrounded by spaces. This is only + # triggered if both sides are missing spaces, even though + # technically should should flag if at least one side is missing a + # space. This is done to avoid some false positives with shifts. + match = Match(r'^(.*[^\s<])<[^\s=<,]', line) + if match: + (_, _, end_pos) = CloseExpression(clean_lines, linenum, + len(match.group(1))) + if end_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <') + + # Look for > that is not surrounded by spaces. Similar to the + # above, we only trigger if both sides are missing spaces to avoid + # false positives with shifts. + match = Match(r'^(.*[^-\s>])>[^\s=>,]', line) + if match: + (_, _, start_pos) = ReverseCloseExpression(clean_lines, linenum, + len(match.group(1))) + if start_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >') + + # We allow no-spaces around << when used like this: 10<<20, but + # not otherwise (particularly, not when used as streams) + # + # We also allow operators following an opening parenthesis, since + # those tend to be macros that deal with operators. + match = Search(r'(operator|[^\s(<])(?:L|UL|ULL|l|ul|ull)?<<([^\s,=<])', + line) + if (match and + not (match.group(1).isdigit() and match.group(2).isdigit()) and + not (match.group(1) == 'operator' and match.group(2) == ';')): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <<') - # Look for > that is not surrounded by spaces. Similar to the - # above, we only trigger if both sides are missing spaces to avoid - # false positives with shifts. - match = Match(r'^(.*[^-\s>])>[^\s=>,]', line) + # We allow no-spaces around >> for almost anything. This is because + # C++11 allows ">>" to close nested templates, which accounts for + # most cases when ">>" is not followed by a space. + # + # We still warn on ">>" followed by alpha character, because that is + # likely due to ">>" being used for right shifts, e.g.: + # value >> alpha + # + # When ">>" is used to close templates, the alphanumeric letter that + # follows would be part of an identifier, and there should still be + # a space separating the template type and the identifier. 
+ # type<type<type>> alpha + match = Search(r'>>[a-zA-Z_]', line) if match: - (_, _, start_pos) = ReverseCloseExpression( - clean_lines, linenum, len(match.group(1))) - if start_pos <= -1: error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >') - - # We allow no-spaces around << when used like this: 10<<20, but - # not otherwise (particularly, not when used as streams) - # - # We also allow operators following an opening parenthesis, since - # those tend to be macros that deal with operators. - match = Search(r'(operator|[^\s(<])(?:L|UL|ULL|l|ul|ull)?<<([^\s,=<])', line) - if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and - not (match.group(1) == 'operator' and match.group(2) == ';')): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <<') - - # We allow no-spaces around >> for almost anything. This is because - # C++11 allows ">>" to close nested templates, which accounts for - # most cases when ">>" is not followed by a space. - # - # We still warn on ">>" followed by alpha character, because that is - # likely due to ">>" being used for right shifts, e.g.: - # value >> alpha - # - # When ">>" is used to close templates, the alphanumeric letter that - # follows would be part of an identifier, and there should still be - # a space separating the template type and the identifier. - # type<type<type>> alpha - match = Search(r'>>[a-zA-Z_]', line) - if match: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >>') - - # There shouldn't be space around unary operators - match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) - if match: - error(filename, linenum, 'whitespace/operators', 4, - 'Extra space for operator %s' % match.group(1)) + 'Missing spaces around >>') + + # There shouldn't be space around unary operators + match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) + if match: + error(filename, linenum, 'whitespace/operators', 4, + 'Extra space for operator %s' % match.group(1)) def CheckParenthesisSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing around parentheses. + """Checks for horizontal spacing around parentheses. Args: filename: The name of the current file. @@ -3257,37 +3274,36 @@ def CheckParenthesisSpacing(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # No spaces after an if, while, switch, or for - match = Search(r' (if\(|for\(|while\(|switch\()', line) - if match: - error(filename, linenum, 'whitespace/parens', 5, - 'Missing space before ( in %s' % match.group(1)) - - # For if/for/while/switch, the left and right parens should be - # consistent about how many spaces are inside the parens, and - # there should either be zero or one spaces inside the parens. - # We don't want: "if ( foo)" or "if ( foo )". - # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. 
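The consistency rule for spaces inside if/for/while/switch parentheses, described above, comes down to comparing two captured runs of spaces; a sketch with invented statements:

import re

paren_re = (r'\b(if|for|while|switch)\s*'
            r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$')

bad = re.search(paren_re, 'if ( foo) {')  # space after '(' but none before ')'
assert bad and len(bad.group(2)) != len(bad.group(4))

ok = re.search(paren_re, 'if (foo) {')
assert ok and len(ok.group(2)) == len(ok.group(4))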
- match = Search(r'\b(if|for|while|switch)\s*' - r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', - line) - if match: - if len(match.group(2)) != len(match.group(4)): - if not (match.group(3) == ';' and - len(match.group(2)) == 1 + len(match.group(4)) or - not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + line = clean_lines.elided[linenum] + + # No spaces after an if, while, switch, or for + match = Search(r' (if\(|for\(|while\(|switch\()', line) + if match: error(filename, linenum, 'whitespace/parens', 5, - 'Mismatching spaces inside () in %s' % match.group(1)) - if len(match.group(2)) not in [0, 1]: - error(filename, linenum, 'whitespace/parens', 5, - 'Should have zero or one spaces inside ( and ) in %s' % - match.group(1)) + 'Missing space before ( in %s' % match.group(1)) + + # For if/for/while/switch, the left and right parens should be + # consistent about how many spaces are inside the parens, and + # there should either be zero or one spaces inside the parens. + # We don't want: "if ( foo)" or "if ( foo )". + # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. + match = Search(r'\b(if|for|while|switch)\s*' + r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', line) + if match: + if len(match.group(2)) != len(match.group(4)): + if not (match.group(3) == ';' and + len(match.group(2)) == 1 + len(match.group(4)) or + not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + error(filename, linenum, 'whitespace/parens', 5, + 'Mismatching spaces inside () in %s' % match.group(1)) + if len(match.group(2)) not in [0, 1]: + error(filename, linenum, 'whitespace/parens', 5, + 'Should have zero or one spaces inside ( and ) in %s' % + match.group(1)) def CheckCommaSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing near commas and semicolons. + """Checks for horizontal spacing near commas and semicolons. Args: filename: The name of the current file. @@ -3295,35 +3311,34 @@ def CheckCommaSpacing(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - raw = clean_lines.lines_without_raw_strings - line = clean_lines.elided[linenum] - - # You should always have a space after a comma (either as fn arg or operator) - # - # This does not apply when the non-space character following the - # comma is another comma, since the only time when that happens is - # for empty macro arguments. - # - # We run this check in two passes: first pass on elided lines to - # verify that lines contain missing whitespaces, second pass on raw - # lines to confirm that those missing whitespaces are not due to - # elided comments. - if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and - Search(r',[^,\s]', raw[linenum])): - error(filename, linenum, 'whitespace/comma', 3, - 'Missing space after ,') - - # You should always have a space after a semicolon - # except for few corner cases - # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more - # space after ; - if Search(r';[^\s};\\)/]', line): - error(filename, linenum, 'whitespace/semicolon', 3, - 'Missing space after ;') + raw = clean_lines.lines_without_raw_strings + line = clean_lines.elided[linenum] + + # You should always have a space after a comma (either as fn arg or operator) + # + # This does not apply when the non-space character following the + # comma is another comma, since the only time when that happens is + # for empty macro arguments. 
+ # + # We run this check in two passes: first pass on elided lines to + # verify that lines contain missing whitespaces, second pass on raw + # lines to confirm that those missing whitespaces are not due to + # elided comments. + if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and + Search(r',[^,\s]', raw[linenum])): + error(filename, linenum, 'whitespace/comma', 3, 'Missing space after ,') + + # You should always have a space after a semicolon + # except for few corner cases + # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more + # space after ; + if Search(r';[^\s};\\)/]', line): + error(filename, linenum, 'whitespace/semicolon', 3, + 'Missing space after ;') def CheckBracesSpacing(filename, clean_lines, linenum, error): - """Checks for horizontal spacing near commas. + """Checks for horizontal spacing near commas. Args: filename: The name of the current file. @@ -3331,78 +3346,78 @@ def CheckBracesSpacing(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Except after an opening paren, or after another opening brace (in case of - # an initializer list, for instance), you should have spaces before your - # braces. And since you should never have braces at the beginning of a line, - # this is an easy test. - match = Match(r'^(.*[^ ({>]){', line) - if match: - # Try a bit harder to check for brace initialization. This - # happens in one of the following forms: - # Constructor() : initializer_list_{} { ... } - # Constructor{}.MemberFunction() - # Type variable{}; - # FunctionCall(type{}, ...); - # LastArgument(..., type{}); - # LOG(INFO) << type{} << " ..."; - # map_of_type[{...}] = ...; - # ternary = expr ? new type{} : nullptr; - # OuterTemplate<InnerTemplateConstructor<Type>{}> - # - # We check for the character following the closing brace, and - # silence the warning if it's one of those listed above, i.e. - # "{.;,)<>]:". - # - # To account for nested initializer list, we allow any number of - # closing braces up to "{;,)<". We can't simply silence the - # warning on first sight of closing brace, because that would - # cause false negatives for things that are not initializer lists. - # Silence this: But not this: - # Outer{ if (...) { - # Inner{...} if (...){ // Missing space before { - # }; } - # - # There is a false negative with this approach if people inserted - # spurious semicolons, e.g. "if (cond){};", but we will catch the - # spurious semicolon with a separate check. - (endline, endlinenum, endpos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - trailing_text = '' - if endpos > -1: - trailing_text = endline[endpos:] - for offset in xrange(endlinenum + 1, - min(endlinenum + 3, clean_lines.NumLines() - 1)): - trailing_text += clean_lines.elided[offset] - if not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text): - error(filename, linenum, 'whitespace/braces', 5, - 'Missing space before {') - - # Make sure '} else {' has spaces. - if Search(r'}else', line): - error(filename, linenum, 'whitespace/braces', 5, - 'Missing space before else') - - # You shouldn't have a space before a semicolon at the end of the line. - # There's a special case for "for" since the style guide allows space before - # the semicolon there. - if Search(r':\s*;\s*$', line): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Semicolon defining empty statement. 
Use {} instead.') - elif Search(r'^\s*;\s*$', line): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Line contains only semicolon. If this should be an empty statement, ' - 'use {} instead.') - elif (Search(r'\s+;\s*$', line) and - not Search(r'\bfor\b', line)): - error(filename, linenum, 'whitespace/semicolon', 5, - 'Extra space before last semicolon. If this should be an empty ' - 'statement, use {} instead.') + line = clean_lines.elided[linenum] + + # Except after an opening paren, or after another opening brace (in case of + # an initializer list, for instance), you should have spaces before your + # braces. And since you should never have braces at the beginning of a line, + # this is an easy test. + match = Match(r'^(.*[^ ({>]){', line) + if match: + # Try a bit harder to check for brace initialization. This + # happens in one of the following forms: + # Constructor() : initializer_list_{} { ... } + # Constructor{}.MemberFunction() + # Type variable{}; + # FunctionCall(type{}, ...); + # LastArgument(..., type{}); + # LOG(INFO) << type{} << " ..."; + # map_of_type[{...}] = ...; + # ternary = expr ? new type{} : nullptr; + # OuterTemplate<InnerTemplateConstructor<Type>{}> + # + # We check for the character following the closing brace, and + # silence the warning if it's one of those listed above, i.e. + # "{.;,)<>]:". + # + # To account for nested initializer list, we allow any number of + # closing braces up to "{;,)<". We can't simply silence the + # warning on first sight of closing brace, because that would + # cause false negatives for things that are not initializer lists. + # Silence this: But not this: + # Outer{ if (...) { + # Inner{...} if (...){ // Missing space before { + # }; } + # + # There is a false negative with this approach if people inserted + # spurious semicolons, e.g. "if (cond){};", but we will catch the + # spurious semicolon with a separate check. + (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, + len(match.group(1))) + trailing_text = '' + if endpos > -1: + trailing_text = endline[endpos:] + for offset in xrange(endlinenum + 1, + min(endlinenum + 3, clean_lines.NumLines() - 1)): + trailing_text += clean_lines.elided[offset] + if not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before {') + + # Make sure '} else {' has spaces. + if Search(r'}else', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before else') + + # You shouldn't have a space before a semicolon at the end of the line. + # There's a special case for "for" since the style guide allows space before + # the semicolon there. + if Search(r':\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Semicolon defining empty statement. Use {} instead.') + elif Search(r'^\s*;\s*$', line): + error( + filename, linenum, 'whitespace/semicolon', 5, + 'Line contains only semicolon. If this should be an empty statement, ' + 'use {} instead.') + elif (Search(r'\s+;\s*$', line) and not Search(r'\bfor\b', line)): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Extra space before last semicolon. If this should be an empty ' + 'statement, use {} instead.') def IsDecltype(clean_lines, linenum, column): - """Check if the token ending on (linenum, column) is decltype(). + """Check if the token ending on (linenum, column) is decltype(). Args: clean_lines: A CleansedLines instance containing the file. 
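The brace and semicolon rules just shown are plain searches as well; invented one-liners:

import re

assert re.search(r'}else', '}else {')     # missing space before else
assert not re.search(r'}else', '} else {')

assert re.search(r'^\s*;\s*$', '  ;')     # lone semicolon: use {} instead
# Space before a trailing semicolon is flagged only outside a for-header:
line = 'while (Poll()) ;'
assert re.search(r'\s+;\s*$', line) and not re.search(r'\bfor\b', line)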
@@ -3411,16 +3426,16 @@ def IsDecltype(clean_lines, linenum, column): Returns: True if this token is decltype() expression, False otherwise. """ - (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column) - if start_col < 0: + (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column) + if start_col < 0: + return False + if Search(r'\bdecltype\s*$', text[0:start_col]): + return True return False - if Search(r'\bdecltype\s*$', text[0:start_col]): - return True - return False def IsTemplateParameterList(clean_lines, linenum, column): - """Check if the token ending on (linenum, column) is the end of template<>. + """Check if the token ending on (linenum, column) is the end of template<>. Args: clean_lines: A CleansedLines instance containing the file. @@ -3429,16 +3444,16 @@ def IsTemplateParameterList(clean_lines, linenum, column): Returns: True if this token is end of a template parameter list, False otherwise. """ - (_, startline, startpos) = ReverseCloseExpression( - clean_lines, linenum, column) - if (startpos > -1 and - Search(r'\btemplate\s*$', clean_lines.elided[startline][0:startpos])): - return True - return False + (_, startline, startpos) = ReverseCloseExpression(clean_lines, linenum, + column) + if (startpos > -1 and Search(r'\btemplate\s*$', + clean_lines.elided[startline][0:startpos])): + return True + return False def IsRValueType(typenames, clean_lines, nesting_state, linenum, column): - """Check if the token ending on (linenum, column) is a type. + """Check if the token ending on (linenum, column) is a type. Assumes that text to the right of the column is "&&" or a function name. @@ -3453,196 +3468,198 @@ def IsRValueType(typenames, clean_lines, nesting_state, linenum, column): Returns: True if this token is a type, False if we are not sure. """ - prefix = clean_lines.elided[linenum][0:column] - - # Get one word to the left. If we failed to do so, this is most - # likely not a type, since it's unlikely that the type name and "&&" - # would be split across multiple lines. - match = Match(r'^(.*)(\b\w+|[>*)&])\s*$', prefix) - if not match: - return False + prefix = clean_lines.elided[linenum][0:column] - # Check text following the token. If it's "&&>" or "&&," or "&&...", it's - # most likely a rvalue reference used inside a template. - suffix = clean_lines.elided[linenum][column:] - if Match(r'&&\s*(?:[>,]|\.\.\.)', suffix): - return True + # Get one word to the left. If we failed to do so, this is most + # likely not a type, since it's unlikely that the type name and "&&" + # would be split across multiple lines. + match = Match(r'^(.*)(\b\w+|[>*)&])\s*$', prefix) + if not match: + return False - # Check for known types and end of templates: - # int&& variable - # vector<int>&& variable - # - # Because this function is called recursively, we also need to - # recognize pointer and reference types: - # int* Function() - # int& Function() - if (match.group(2) in typenames or - match.group(2) in ['char', 'char16_t', 'char32_t', 'wchar_t', 'bool', - 'short', 'int', 'long', 'signed', 'unsigned', - 'float', 'double', 'void', 'auto', '>', '*', '&']): - return True + # Check text following the token. If it's "&&>" or "&&," or "&&...", it's + # most likely a rvalue reference used inside a template. + suffix = clean_lines.elided[linenum][column:] + if Match(r'&&\s*(?:[>,]|\.\.\.)', suffix): + return True - # If we see a close parenthesis, look for decltype on the other side. 
- # decltype would unambiguously identify a type, anything else is - # probably a parenthesized expression and not a type. - if match.group(2) == ')': - return IsDecltype( - clean_lines, linenum, len(match.group(1)) + len(match.group(2)) - 1) - - # Check for casts and cv-qualifiers. - # match.group(1) remainder - # -------------- --------- - # const_cast< type&& - # const type&& - # type const&& - if Search(r'\b(?:const_cast\s*<|static_cast\s*<|dynamic_cast\s*<|' - r'reinterpret_cast\s*<|\w+\s)\s*$', - match.group(1)): - return True + # Check for known types and end of templates: + # int&& variable + # vector<int>&& variable + # + # Because this function is called recursively, we also need to + # recognize pointer and reference types: + # int* Function() + # int& Function() + if (match.group(2) in typenames or match.group(2) in [ + 'char', 'char16_t', 'char32_t', 'wchar_t', 'bool', 'short', 'int', + 'long', 'signed', 'unsigned', 'float', 'double', 'void', 'auto', + '>', '*', '&' + ]): + return True - # Look for a preceding symbol that might help differentiate the context. - # These are the cases that would be ambiguous: - # match.group(1) remainder - # -------------- --------- - # Call ( expression && - # Declaration ( type&& - # sizeof ( type&& - # if ( expression && - # while ( expression && - # for ( type&& - # for( ; expression && - # statement ; type&& - # block { type&& - # constructor { expression && - start = linenum - line = match.group(1) - match_symbol = None - while start >= 0: - # We want to skip over identifiers and commas to get to a symbol. - # Commas are skipped so that we can find the opening parenthesis - # for function parameter lists. - match_symbol = Match(r'^(.*)([^\w\s,])[\w\s,]*$', line) - if match_symbol: - break - start -= 1 - line = clean_lines.elided[start] - - if not match_symbol: - # Probably the first statement in the file is an rvalue reference - return True + # If we see a close parenthesis, look for decltype on the other side. + # decltype would unambiguously identify a type, anything else is + # probably a parenthesized expression and not a type. + if match.group(2) == ')': + return IsDecltype(clean_lines, linenum, + len(match.group(1)) + len(match.group(2)) - 1) + + # Check for casts and cv-qualifiers. + # match.group(1) remainder + # -------------- --------- + # const_cast< type&& + # const type&& + # type const&& + if Search(r'\b(?:const_cast\s*<|static_cast\s*<|dynamic_cast\s*<|' + r'reinterpret_cast\s*<|\w+\s)\s*$', match.group(1)): + return True - # Look for a preceding symbol that might help differentiate the context. - # These are the cases that would be ambiguous: - # match.group(1) remainder - # -------------- --------- - # Call ( expression && - # Declaration ( type&& - # sizeof ( type&& - # if ( expression && - # while ( expression && - # for ( type&& - # for( ; expression && - # statement ; type&& - # block { type&& - # constructor { expression && + start = linenum + line = match.group(1) + match_symbol = None + while start >= 0: + # We want to skip over identifiers and commas to get to a symbol. + # Commas are skipped so that we can find the opening parenthesis + # for function parameter lists. 
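The whole rvalue-reference classification above is anchored on the one-word-to-the-left regex; a sketch with invented prefixes (the text to the left of a '&&' token):

import re

left_of = r'^(.*)(\b\w+|[>*)&])\s*$'

m = re.match(left_of, 'Function(int')   # as in 'Function(int&& param)'
assert m and m.group(2) == 'int'        # a known type, so an rvalue reference

m = re.match(left_of, 'if (a ')         # as in 'if (a && b)'
assert m and m.group(2) == 'a'          # not a type, so a boolean expression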
+ match_symbol = Match(r'^(.*)([^\w\s,])[\w\s,]*$', line)
+ if match_symbol:
+ break
+ start -= 1
+ line = clean_lines.elided[start]
- if match_symbol.group(2) == ';':
- # Found semicolon, probably one of these:
- # for(; expression &&
- # statement; type&&
-
- # Look for the previous 'for(' in the previous lines.
- before_text = match_symbol.group(1)
- for i in xrange(start - 1, max(start - 6, 0), -1):
- before_text = clean_lines.elided[i] + before_text
- if Search(r'for\s*\([^{};]*$', before_text):
- # This is the condition inside a for-loop
- return False
-
- # Did not find a for-init-statement before this semicolon, so this
- # is probably a new statement and not a condition.
- return True
+ if not match_symbol:
+ # Probably the first statement in the file is an rvalue reference
+ return True
- if match_symbol.group(2) == '{':
- # Found opening brace, probably one of these:
- # block{ type&& = ... ; }
- # constructor{ expression && expression }
+ if match_symbol.group(2) == '}':
+ # Found closing brace, probably an indication of this:
+ # block{} type&&
+ return True
- # Look for a closing brace or a semicolon. If we see a semicolon
- # first, this is probably a rvalue reference.
- line = clean_lines.elided[start][0:len(match_symbol.group(1)) + 1]
- end = start
- depth = 1
- while True:
- for ch in line:
- if ch == ';':
- return True
- elif ch == '{':
- depth += 1
- elif ch == '}':
- depth -= 1
- if depth == 0:
+ if match_symbol.group(2) == ';':
+ # Found semicolon, probably one of these:
+ # for(; expression &&
+ # statement; type&&
+
+ # Look for the previous 'for(' in the previous lines.
+ before_text = match_symbol.group(1)
+ for i in xrange(start - 1, max(start - 6, 0), -1):
+ before_text = clean_lines.elided[i] + before_text
+ if Search(r'for\s*\([^{};]*$', before_text):
+ # This is the condition inside a for-loop
return False
- end += 1
- if end >= clean_lines.NumLines():
- break
- line = clean_lines.elided[end]
- # Incomplete program?
- return False
- if match_symbol.group(2) == '(':
- # Opening parenthesis. Need to check what's to the left of the
- # parenthesis. Look back one extra line for additional context.
- before_text = match_symbol.group(1)
- if linenum > 1:
- before_text = clean_lines.elided[linenum - 1] + before_text
- before_text = match_symbol.group(1)
-
- # Patterns that are likely to be types:
- # [](type&&
- # for (type&&
- # sizeof(type&&
- # operator=(type&&
- #
- if Search(r'(?:\]|\bfor|\bsizeof|\boperator\s*\S+\s*)\s*$', before_text):
- return True
-
- # Patterns that are likely to be expressions:
- # if (expression &&
- # while (expression &&
- # : initializer(expression &&
- # , initializer(expression &&
- # ( FunctionCall(expression &&
- # + FunctionCall(expression &&
- # + (expression &&
- #
- # The last '+' represents operators such as '+' and '-'.
- if Search(r'(?:\bif|\bwhile|\s[:,]|[-+=%^(<!?:,&*]\s*\(|<\s*\()\s*$',
- before_text):
- return False
-
- # Something else. Check that tokens to the left look like
- # return_type function_name
- match_func = Match(r'^(.*\S.*)\s+\w(?:\w|::)*(?:<[^<>]*>)?\s*$',
- match_symbol.group(1))
- if match_func:
- # Check for constructors, which don't have return types.
- if Search(r'\b(?:explicit|inline)$', match_func.group(1)):
- return True
- implicit_constructor = Match(r'\s*(\w+)\((?:const\s+)?(\w+)', prefix)
- if (implicit_constructor and
- implicit_constructor.group(1) == implicit_constructor.group(2)):
+ # Did not find a for-init-statement before this semicolon, so this
+ # is probably a new statement and not a condition.
return True
- return IsRValueType(typenames, clean_lines, nesting_state, linenum,
- len(match_func.group(1)))
- # Nothing before the function name. If this is inside a block scope,
- # this is probably a function call.
- return not (nesting_state.previous_stack_top and
- nesting_state.previous_stack_top.IsBlockInfo())
+ if match_symbol.group(2) == '{':
+ # Found opening brace, probably one of these:
+ # block{ type&& = ... ; }
+ # constructor{ expression && expression }
+
+ # Look for a closing brace or a semicolon. If we see a semicolon
+ # first, this is probably an rvalue reference.
+ line = clean_lines.elided[start][0:len(match_symbol.group(1)) + 1]
+ end = start
+ depth = 1
+ while True:
+ for ch in line:
+ if ch == ';':
+ return True
+ elif ch == '{':
+ depth += 1
+ elif ch == '}':
+ depth -= 1
+ if depth == 0:
+ return False
+ end += 1
+ if end >= clean_lines.NumLines():
+ break
+ line = clean_lines.elided[end]
+ # Incomplete program?
+ return False
- if match_symbol.group(2) == '>':
- # Possibly a closing bracket, check that what's on the other side
- # looks like the start of a template.
- return IsTemplateParameterList(
- clean_lines, start, len(match_symbol.group(1)))
+ if match_symbol.group(2) == '(':
+ # Opening parenthesis. Need to check what's to the left of the
+ # parenthesis. Look back one extra line for additional context.
+ before_text = match_symbol.group(1)
+ if linenum > 1:
+ before_text = clean_lines.elided[linenum - 1] + before_text
+ before_text = match_symbol.group(1)
+
+ # Patterns that are likely to be types:
+ # [](type&&
+ # for (type&&
+ # sizeof(type&&
+ # operator=(type&&
+ #
+ if Search(r'(?:\]|\bfor|\bsizeof|\boperator\s*\S+\s*)\s*$',
+ before_text):
+ return True
+
+ # Patterns that are likely to be expressions:
+ # if (expression &&
+ # while (expression &&
+ # : initializer(expression &&
+ # , initializer(expression &&
+ # ( FunctionCall(expression &&
+ # + FunctionCall(expression &&
+ # + (expression &&
+ #
+ # The last '+' represents operators such as '+' and '-'.
+ if Search(r'(?:\bif|\bwhile|\s[:,]|[-+=%^(<!?:,&*]\s*\(|<\s*\()\s*$',
+ before_text):
+ return False
+
+ # Something else. Check that tokens to the left look like
+ # return_type function_name
+ match_func = Match(r'^(.*\S.*)\s+\w(?:\w|::)*(?:<[^<>]*>)?\s*$',
+ match_symbol.group(1))
+ if match_func:
+ # Check for constructors, which don't have return types.
+ if Search(r'\b(?:explicit|inline)$', match_func.group(1)):
+ return True
+ implicit_constructor = Match(r'\s*(\w+)\((?:const\s+)?(\w+)',
+ prefix)
+ if (implicit_constructor and implicit_constructor.group(1) ==
+ implicit_constructor.group(2)):
+ return True
+ return IsRValueType(typenames, clean_lines, nesting_state, linenum,
+ len(match_func.group(1)))
+
+ # Nothing before the function name. If this is inside a block scope,
+ # this is probably a function call.
+ return not (nesting_state.previous_stack_top and
+ nesting_state.previous_stack_top.IsBlockInfo())
+
+ if match_symbol.group(2) == '>':
+ # Possibly a closing bracket, check that what's on the other side
+ # looks like the start of a template.
+ return IsTemplateParameterList(clean_lines, start,
+ len(match_symbol.group(1)))
+
+ # Some other symbol, usually something like "a=b&&c". This is most
+ # likely not a type.
+ return False
def IsDeletedOrDefault(clean_lines, linenum):
- """Check if current constructor or operator is deleted or default.
+ """Check if current constructor or operator is deleted or default.
Args:
clean_lines: A CleansedLines instance containing the file.
@@ -3650,18 +3667,18 @@ def IsDeletedOrDefault(clean_lines, linenum):
Returns:
True if this is a deleted or default constructor.
""" - open_paren = clean_lines.elided[linenum].find('(') - if open_paren < 0: - return False - (close_line, _, close_paren) = CloseExpression( - clean_lines, linenum, open_paren) - if close_paren < 0: - return False - return Match(r'\s*=\s*(?:delete|default)\b', close_line[close_paren:]) + open_paren = clean_lines.elided[linenum].find('(') + if open_paren < 0: + return False + (close_line, _, close_paren) = CloseExpression(clean_lines, linenum, + open_paren) + if close_paren < 0: + return False + return Match(r'\s*=\s*(?:delete|default)\b', close_line[close_paren:]) def IsRValueAllowed(clean_lines, linenum, typenames): - """Check if RValue reference is allowed on a particular line. + """Check if RValue reference is allowed on a particular line. Args: clean_lines: A CleansedLines instance containing the file. @@ -3670,56 +3687,57 @@ def IsRValueAllowed(clean_lines, linenum, typenames): Returns: True if line is within the region where RValue references are allowed. """ - # Allow region marked by PUSH/POP macros - for i in xrange(linenum, 0, -1): - line = clean_lines.elided[i] - if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line): - if not line.endswith('PUSH'): - return False - for j in xrange(linenum, clean_lines.NumLines(), 1): - line = clean_lines.elided[j] + # Allow region marked by PUSH/POP macros + for i in xrange(linenum, 0, -1): + line = clean_lines.elided[i] if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line): - return line.endswith('POP') - - # Allow operator= - line = clean_lines.elided[linenum] - if Search(r'\boperator\s*=\s*\(', line): - return IsDeletedOrDefault(clean_lines, linenum) - - # Allow constructors - match = Match(r'\s*(?:[\w<>]+::)*([\w<>]+)\s*::\s*([\w<>]+)\s*\(', line) - if match and match.group(1) == match.group(2): - return IsDeletedOrDefault(clean_lines, linenum) - if Search(r'\b(?:explicit|inline)\s+[\w<>]+\s*\(', line): - return IsDeletedOrDefault(clean_lines, linenum) - - if Match(r'\s*[\w<>]+\s*\(', line): - previous_line = 'ReturnType' - if linenum > 0: - previous_line = clean_lines.elided[linenum - 1] - if Match(r'^\s*$', previous_line) or Search(r'[{}:;]\s*$', previous_line): - return IsDeletedOrDefault(clean_lines, linenum) + if not line.endswith('PUSH'): + return False + for j in xrange(linenum, clean_lines.NumLines(), 1): + line = clean_lines.elided[j] + if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line): + return line.endswith('POP') - # Reject types not mentioned in template-argument-list - while line: - match = Match(r'^.*?(\w+)\s*&&(.*)$', line) - if not match: - break - if match.group(1) not in typenames: - return False - line = match.group(2) + # Allow operator= + line = clean_lines.elided[linenum] + if Search(r'\boperator\s*=\s*\(', line): + return IsDeletedOrDefault(clean_lines, linenum) + + # Allow constructors + match = Match(r'\s*(?:[\w<>]+::)*([\w<>]+)\s*::\s*([\w<>]+)\s*\(', line) + if match and match.group(1) == match.group(2): + return IsDeletedOrDefault(clean_lines, linenum) + if Search(r'\b(?:explicit|inline)\s+[\w<>]+\s*\(', line): + return IsDeletedOrDefault(clean_lines, linenum) + + if Match(r'\s*[\w<>]+\s*\(', line): + previous_line = 'ReturnType' + if linenum > 0: + previous_line = clean_lines.elided[linenum - 1] + if Match(r'^\s*$', previous_line) or Search(r'[{}:;]\s*$', + previous_line): + return IsDeletedOrDefault(clean_lines, linenum) + + # Reject types not mentioned in template-argument-list + while line: + match = Match(r'^.*?(\w+)\s*&&(.*)$', line) + if not match: + break + if 
match.group(1) not in typenames: + return False + line = match.group(2) - # All RValue types that were in template-argument-list should have - # been removed by now. Those were allowed, assuming that they will - # be forwarded. - # - # If there are no remaining RValue types left (i.e. types that were - # not found in template-argument-list), flag those as not allowed. - return line.find('&&') < 0 + # All RValue types that were in template-argument-list should have + # been removed by now. Those were allowed, assuming that they will + # be forwarded. + # + # If there are no remaining RValue types left (i.e. types that were + # not found in template-argument-list), flag those as not allowed. + return line.find('&&') < 0 def GetTemplateArgs(clean_lines, linenum): - """Find list of template arguments associated with this function declaration. + """Find list of template arguments associated with this function declaration. Args: clean_lines: A CleansedLines instance containing the file. @@ -3729,61 +3747,63 @@ def GetTemplateArgs(clean_lines, linenum): Set of type names, or empty set if this does not appear to have any template parameters. """ - # Find start of function - func_line = linenum - while func_line > 0: - line = clean_lines.elided[func_line] - if Match(r'^\s*$', line): - return set() - if line.find('(') >= 0: - break - func_line -= 1 - if func_line == 0: - return set() - - # Collapse template-argument-list into a single string - argument_list = '' - match = Match(r'^(\s*template\s*)<', clean_lines.elided[func_line]) - if match: - # template-argument-list on the same line as function name - start_col = len(match.group(1)) - _, end_line, end_col = CloseExpression(clean_lines, func_line, start_col) - if end_col > -1 and end_line == func_line: - start_col += 1 # Skip the opening bracket - argument_list = clean_lines.elided[func_line][start_col:end_col] - - elif func_line > 1: - # template-argument-list one line before function name - match = Match(r'^(.*)>\s*$', clean_lines.elided[func_line - 1]) + # Find start of function + func_line = linenum + while func_line > 0: + line = clean_lines.elided[func_line] + if Match(r'^\s*$', line): + return set() + if line.find('(') >= 0: + break + func_line -= 1 + if func_line == 0: + return set() + + # Collapse template-argument-list into a single string + argument_list = '' + match = Match(r'^(\s*template\s*)<', clean_lines.elided[func_line]) if match: - end_col = len(match.group(1)) - _, start_line, start_col = ReverseCloseExpression( - clean_lines, func_line - 1, end_col) - if start_col > -1: - start_col += 1 # Skip the opening bracket - while start_line < func_line - 1: - argument_list += clean_lines.elided[start_line][start_col:] - start_col = 0 - start_line += 1 - argument_list += clean_lines.elided[func_line - 1][start_col:end_col] - - if not argument_list: - return set() - - # Extract type names - typenames = set() - while True: - match = Match(r'^[,\s]*(?:typename|class)(?:\.\.\.)?\s+(\w+)(.*)$', - argument_list) - if not match: - break - typenames.add(match.group(1)) - argument_list = match.group(2) - return typenames + # template-argument-list on the same line as function name + start_col = len(match.group(1)) + _, end_line, end_col = CloseExpression(clean_lines, func_line, + start_col) + if end_col > -1 and end_line == func_line: + start_col += 1 # Skip the opening bracket + argument_list = clean_lines.elided[func_line][start_col:end_col] + + elif func_line > 1: + # template-argument-list one line before function name + match = 
Match(r'^(.*)>\s*$', clean_lines.elided[func_line - 1]) + if match: + end_col = len(match.group(1)) + _, start_line, start_col = ReverseCloseExpression( + clean_lines, func_line - 1, end_col) + if start_col > -1: + start_col += 1 # Skip the opening bracket + while start_line < func_line - 1: + argument_list += clean_lines.elided[start_line][start_col:] + start_col = 0 + start_line += 1 + argument_list += clean_lines.elided[func_line - 1][start_col: + end_col] + + if not argument_list: + return set() + + # Extract type names + typenames = set() + while True: + match = Match(r'^[,\s]*(?:typename|class)(?:\.\.\.)?\s+(\w+)(.*)$', + argument_list) + if not match: + break + typenames.add(match.group(1)) + argument_list = match.group(2) + return typenames def CheckRValueReference(filename, clean_lines, linenum, nesting_state, error): - """Check for rvalue references. + """Check for rvalue references. Args: filename: The name of the current file. @@ -3793,33 +3813,34 @@ def CheckRValueReference(filename, clean_lines, linenum, nesting_state, error): the current stack of nested blocks being parsed. error: The function to call with any errors found. """ - # Find lines missing spaces around &&. - # TODO(unknown): currently we don't check for rvalue references - # with spaces surrounding the && to avoid false positives with - # boolean expressions. - line = clean_lines.elided[linenum] - match = Match(r'^(.*\S)&&', line) - if not match: - match = Match(r'(.*)&&\S', line) - if (not match) or '(&&)' in line or Search(r'\boperator\s*$', match.group(1)): - return - - # Either poorly formed && or an rvalue reference, check the context - # to get a more accurate error message. Mostly we want to determine - # if what's to the left of "&&" is a type or not. - typenames = GetTemplateArgs(clean_lines, linenum) - and_pos = len(match.group(1)) - if IsRValueType(typenames, clean_lines, nesting_state, linenum, and_pos): - if not IsRValueAllowed(clean_lines, linenum, typenames): - error(filename, linenum, 'build/c++11', 3, - 'RValue references are an unapproved C++ feature.') - else: - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around &&') + # Find lines missing spaces around &&. + # TODO(unknown): currently we don't check for rvalue references + # with spaces surrounding the && to avoid false positives with + # boolean expressions. + line = clean_lines.elided[linenum] + match = Match(r'^(.*\S)&&', line) + if not match: + match = Match(r'(.*)&&\S', line) + if (not match) or '(&&)' in line or Search(r'\boperator\s*$', + match.group(1)): + return + + # Either poorly formed && or an rvalue reference, check the context + # to get a more accurate error message. Mostly we want to determine + # if what's to the left of "&&" is a type or not. + typenames = GetTemplateArgs(clean_lines, linenum) + and_pos = len(match.group(1)) + if IsRValueType(typenames, clean_lines, nesting_state, linenum, and_pos): + if not IsRValueAllowed(clean_lines, linenum, typenames): + error(filename, linenum, 'build/c++11', 3, + 'RValue references are an unapproved C++ feature.') + else: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around &&') def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): - """Checks for additional blank line issues related to sections. + """Checks for additional blank line issues related to sections. Currently the only thing checked here is blank line before protected/private. 
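For a concrete sense of the classification these hunks implement, here is a minimal standalone sketch, not part of the patch: looks_like_rvalue_ref is an invented one-line simplification of IsRValueType, and Match is redefined locally to mirror cpplint's thin wrapper over re.match.

    import re

    def Match(pattern, s):
        # Mirrors cpplint's Match helper, a thin wrapper over re.match.
        return re.match(pattern, s)

    def looks_like_rvalue_ref(line):
        # Hypothetical single-line reduction of IsRValueType: treat '&&'
        # as an rvalue reference when a known type keyword, '>', '*', or
        # '&' sits immediately to its left.
        match = Match(r'^(.*?)(\b\w+|[>*)&])\s*&&', line)
        if not match:
            return False
        return match.group(2) in ('int', 'bool', 'char', 'double', 'void',
                                  'auto', '>', '*', '&')

    print(looks_like_rvalue_ref('void Foo(int&& x);'))      # True
    print(looks_like_rvalue_ref('if (a && b) return;'))     # False
    print(looks_like_rvalue_ref('vector<string>&& names'))  # True

The real IsRValueType also walks preceding lines, recognizes casts and decltype, and consults the typenames set collected by GetTemplateArgs before deciding.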
@@ -3830,51 +3851,53 @@ def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - # Skip checks if the class is small, where small means 25 lines or less. - # 25 lines seems like a good cutoff since that's the usual height of - # terminals, and any class that can't fit in one screen can't really - # be considered "small". - # - # Also skip checks if we are on the first line. This accounts for - # classes that look like - # class Foo { public: ... }; - # - # If we didn't find the end of the class, last_line would be zero, - # and the check will be skipped by the first condition. - if (class_info.last_line - class_info.starting_linenum <= 24 or - linenum <= class_info.starting_linenum): - return - - matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) - if matched: - # Issue warning if the line before public/protected/private was - # not a blank line, but don't do this if the previous line contains - # "class" or "struct". This can happen two ways: - # - We are at the beginning of the class. - # - We are forward-declaring an inner class that is semantically - # private, but needed to be public for implementation reasons. - # Also ignores cases where the previous line ends with a backslash as can be - # common when defining classes in C macros. - prev_line = clean_lines.lines[linenum - 1] - if (not IsBlankLine(prev_line) and - not Search(r'\b(class|struct)\b', prev_line) and - not Search(r'\\$', prev_line)): - # Try a bit harder to find the beginning of the class. This is to - # account for multi-line base-specifier lists, e.g.: - # class Derived - # : public Base { - end_class_head = class_info.starting_linenum - for i in range(class_info.starting_linenum, linenum): - if Search(r'\{\s*$', clean_lines.lines[i]): - end_class_head = i - break - if end_class_head < linenum - 1: - error(filename, linenum, 'whitespace/blank_line', 3, - '"%s:" should be preceded by a blank line' % matched.group(1)) + # Skip checks if the class is small, where small means 25 lines or less. + # 25 lines seems like a good cutoff since that's the usual height of + # terminals, and any class that can't fit in one screen can't really + # be considered "small". + # + # Also skip checks if we are on the first line. This accounts for + # classes that look like + # class Foo { public: ... }; + # + # If we didn't find the end of the class, last_line would be zero, + # and the check will be skipped by the first condition. + if (class_info.last_line - class_info.starting_linenum <= 24 or + linenum <= class_info.starting_linenum): + return + + matched = Match(r'\s*(public|protected|private):', + clean_lines.lines[linenum]) + if matched: + # Issue warning if the line before public/protected/private was + # not a blank line, but don't do this if the previous line contains + # "class" or "struct". This can happen two ways: + # - We are at the beginning of the class. + # - We are forward-declaring an inner class that is semantically + # private, but needed to be public for implementation reasons. + # Also ignores cases where the previous line ends with a backslash as can be + # common when defining classes in C macros. + prev_line = clean_lines.lines[linenum - 1] + if (not IsBlankLine(prev_line) and + not Search(r'\b(class|struct)\b', prev_line) and + not Search(r'\\$', prev_line)): + # Try a bit harder to find the beginning of the class. 
This is to + # account for multi-line base-specifier lists, e.g.: + # class Derived + # : public Base { + end_class_head = class_info.starting_linenum + for i in range(class_info.starting_linenum, linenum): + if Search(r'\{\s*$', clean_lines.lines[i]): + end_class_head = i + break + if end_class_head < linenum - 1: + error(filename, linenum, 'whitespace/blank_line', 3, + '"%s:" should be preceded by a blank line' % + matched.group(1)) def GetPreviousNonBlankLine(clean_lines, linenum): - """Return the most recent non-blank line and its line number. + """Return the most recent non-blank line and its line number. Args: clean_lines: A CleansedLines instance containing the file contents. @@ -3887,17 +3910,17 @@ def GetPreviousNonBlankLine(clean_lines, linenum): if this is the first non-blank line. """ - prevlinenum = linenum - 1 - while prevlinenum >= 0: - prevline = clean_lines.elided[prevlinenum] - if not IsBlankLine(prevline): # if not a blank line... - return (prevline, prevlinenum) - prevlinenum -= 1 - return ('', -1) + prevlinenum = linenum - 1 + while prevlinenum >= 0: + prevline = clean_lines.elided[prevlinenum] + if not IsBlankLine(prevline): # if not a blank line... + return (prevline, prevlinenum) + prevlinenum -= 1 + return ('', -1) def CheckBraces(filename, clean_lines, linenum, error): - """Looks for misplaced braces (e.g. at the end of line). + """Looks for misplaced braces (e.g. at the end of line). Args: filename: The name of the current file. @@ -3906,114 +3929,123 @@ def CheckBraces(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] # get rid of comments and strings - - if Match(r'\s*{\s*$', line): - # We allow an open brace to start a line in the case where someone is using - # braces in a block to explicitly create a new scope, which is commonly used - # to control the lifetime of stack-allocated variables. Braces are also - # used for brace initializers inside function calls. We don't detect this - # perfectly: we just don't complain if the last non-whitespace character on - # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the - # previous line starts a preprocessor block. - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if (not Search(r'[,;:}{(]\s*$', prevline) and - not Match(r'\s*#', prevline)): - error(filename, linenum, 'whitespace/braces', 4, - '{ should almost always be at the end of the previous line') - - # An else clause should be on the same line as the preceding closing brace. - if Match(r'\s*else\b\s*(?:if\b|\{|$)', line): - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if Match(r'\s*}\s*$', prevline): - error(filename, linenum, 'whitespace/newline', 4, - 'An else should appear on the same line as the preceding }') - - # If braces come on one side of an else, they should be on both. - # However, we have to worry about "else if" that spans multiple lines! 
- if Search(r'else if\s*\(', line): # could be multi-line if - brace_on_left = bool(Search(r'}\s*else if\s*\(', line)) - # find the ( after the if - pos = line.find('else if') - pos = line.find('(', pos) - if pos > 0: - (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) - brace_on_right = endline[endpos:].find('{') != -1 - if brace_on_left != brace_on_right: # must be brace after if + line = clean_lines.elided[linenum] # get rid of comments and strings + + if Match(r'\s*{\s*$', line): + # We allow an open brace to start a line in the case where someone is using + # braces in a block to explicitly create a new scope, which is commonly used + # to control the lifetime of stack-allocated variables. Braces are also + # used for brace initializers inside function calls. We don't detect this + # perfectly: we just don't complain if the last non-whitespace character on + # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the + # previous line starts a preprocessor block. + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if (not Search(r'[,;:}{(]\s*$', prevline) and + not Match(r'\s*#', prevline)): + error(filename, linenum, 'whitespace/braces', 4, + '{ should almost always be at the end of the previous line') + + # An else clause should be on the same line as the preceding closing brace. + if Match(r'\s*else\b\s*(?:if\b|\{|$)', line): + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if Match(r'\s*}\s*$', prevline): + error(filename, linenum, 'whitespace/newline', 4, + 'An else should appear on the same line as the preceding }') + + # If braces come on one side of an else, they should be on both. + # However, we have to worry about "else if" that spans multiple lines! + if Search(r'else if\s*\(', line): # could be multi-line if + brace_on_left = bool(Search(r'}\s*else if\s*\(', line)) + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + brace_on_right = endline[endpos:].find('{') != -1 + if brace_on_left != brace_on_right: # must be brace after if + error( + filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both' + ) + elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): error(filename, linenum, 'readability/braces', 5, 'If an else has a brace on one side, it should have it on both') - elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') - - # Likewise, an else should never have the else clause on the same line - if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): - error(filename, linenum, 'whitespace/newline', 4, - 'Else clause should never be on same line as else (use 2 lines)') - - # In the same way, a do/while should never be on one line - if Match(r'\s*do [^\s{]', line): - error(filename, linenum, 'whitespace/newline', 4, - 'do/while clauses should not be on a single line') - - # Check single-line if/else bodies. The style guide says 'curly braces are not - # required for single-line statements'. We additionally allow multi-line, - # single statements, but we reject anything with more than one semicolon in - # it. This means that the first semicolon after the if should be at the end of - # its line, and the line after that should have an indent level equal to or - # lower than the if. 
We also check for ambiguous if/else nesting without - # braces. - if_else_match = Search(r'\b(if\s*\(|else\b)', line) - if if_else_match and not Match(r'\s*#', line): - if_indent = GetIndentLevel(line) - endline, endlinenum, endpos = line, linenum, if_else_match.end() - if_match = Search(r'\bif\s*\(', line) - if if_match: - # This could be a multiline if condition, so find the end first. - pos = if_match.end() - 1 - (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos) - # Check for an opening brace, either directly after the if or on the next - # line. If found, this isn't a single-statement conditional. - if (not Match(r'\s*{', endline[endpos:]) - and not (Match(r'\s*$', endline[endpos:]) - and endlinenum < (len(clean_lines.elided) - 1) - and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))): - while (endlinenum < len(clean_lines.elided) - and ';' not in clean_lines.elided[endlinenum][endpos:]): - endlinenum += 1 - endpos = 0 - if endlinenum < len(clean_lines.elided): - endline = clean_lines.elided[endlinenum] - # We allow a mix of whitespace and closing braces (e.g. for one-liner - # methods) and a single \ after the semicolon (for macros) - endpos = endline.find(';') - if not Match(r';[\s}]*(\\?)$', endline[endpos:]): - # Semicolon isn't the last character, there's something trailing. - # Output a warning if the semicolon is not contained inside - # a lambda expression. - if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$', - endline): - error(filename, linenum, 'readability/braces', 4, - 'If/else bodies with multiple statements require braces') - elif endlinenum < len(clean_lines.elided) - 1: - # Make sure the next line is dedented - next_line = clean_lines.elided[endlinenum + 1] - next_indent = GetIndentLevel(next_line) - # With ambiguous nested if statements, this will error out on the - # if that *doesn't* match the else, regardless of whether it's the - # inner one or outer one. - if (if_match and Match(r'\s*else\b', next_line) - and next_indent != if_indent): - error(filename, linenum, 'readability/braces', 4, - 'Else clause should be indented at the same level as if. ' - 'Ambiguous nested if/else chains require braces.') - elif next_indent > if_indent: - error(filename, linenum, 'readability/braces', 4, - 'If/else bodies with multiple statements require braces') + + # Likewise, an else should never have the else clause on the same line + if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): + error(filename, linenum, 'whitespace/newline', 4, + 'Else clause should never be on same line as else (use 2 lines)') + + # In the same way, a do/while should never be on one line + if Match(r'\s*do [^\s{]', line): + error(filename, linenum, 'whitespace/newline', 4, + 'do/while clauses should not be on a single line') + + # Check single-line if/else bodies. The style guide says 'curly braces are not + # required for single-line statements'. We additionally allow multi-line, + # single statements, but we reject anything with more than one semicolon in + # it. This means that the first semicolon after the if should be at the end of + # its line, and the line after that should have an indent level equal to or + # lower than the if. We also check for ambiguous if/else nesting without + # braces. 
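The dangling-else ambiguity this comment block describes is easy to reproduce. In the sketch below, GetIndentLevel is re-implemented to match cpplint's helper of the same name; the sample lines are invented.

    import re

    def GetIndentLevel(line):
        # Matches cpplint's GetIndentLevel: the count of leading spaces.
        indent = re.match(r'^( *)\S', line)
        return len(indent.group(1)) if indent else 0

    # Without braces this else binds to the inner 'if (b)', even though
    # its indentation suggests the author meant the outer 'if (a)'.
    outer_if = 'if (a)'
    inner_if = '  if (b)'
    dangling_else = 'else'
    print(GetIndentLevel(dangling_else) == GetIndentLevel(outer_if))  # True
    print(GetIndentLevel(dangling_else) == GetIndentLevel(inner_if))  # False

When the else's indent disagrees with the if that the scan matched, the check reports 'Ambiguous nested if/else chains require braces.'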
+ if_else_match = Search(r'\b(if\s*\(|else\b)', line) + if if_else_match and not Match(r'\s*#', line): + if_indent = GetIndentLevel(line) + endline, endlinenum, endpos = line, linenum, if_else_match.end() + if_match = Search(r'\bif\s*\(', line) + if if_match: + # This could be a multiline if condition, so find the end first. + pos = if_match.end() - 1 + (endline, endlinenum, endpos) = CloseExpression(clean_lines, + linenum, pos) + # Check for an opening brace, either directly after the if or on the next + # line. If found, this isn't a single-statement conditional. + if (not Match(r'\s*{', endline[endpos:]) and + not (Match(r'\s*$', endline[endpos:]) and endlinenum < + (len(clean_lines.elided) - 1) and + Match(r'\s*{', clean_lines.elided[endlinenum + 1]))): + while (endlinenum < len(clean_lines.elided) and + ';' not in clean_lines.elided[endlinenum][endpos:]): + endlinenum += 1 + endpos = 0 + if endlinenum < len(clean_lines.elided): + endline = clean_lines.elided[endlinenum] + # We allow a mix of whitespace and closing braces (e.g. for one-liner + # methods) and a single \ after the semicolon (for macros) + endpos = endline.find(';') + if not Match(r';[\s}]*(\\?)$', endline[endpos:]): + # Semicolon isn't the last character, there's something trailing. + # Output a warning if the semicolon is not contained inside + # a lambda expression. + if not Match( + r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$', + endline): + error( + filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces' + ) + elif endlinenum < len(clean_lines.elided) - 1: + # Make sure the next line is dedented + next_line = clean_lines.elided[endlinenum + 1] + next_indent = GetIndentLevel(next_line) + # With ambiguous nested if statements, this will error out on the + # if that *doesn't* match the else, regardless of whether it's the + # inner one or outer one. + if (if_match and Match(r'\s*else\b', next_line) and + next_indent != if_indent): + error( + filename, linenum, 'readability/braces', 4, + 'Else clause should be indented at the same level as if. ' + 'Ambiguous nested if/else chains require braces.') + elif next_indent > if_indent: + error( + filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces' + ) def CheckTrailingSemicolon(filename, clean_lines, linenum, error): - """Looks for redundant trailing semicolon. + """Looks for redundant trailing semicolon. Args: filename: The name of the current file. @@ -4022,135 +4054,133 @@ def CheckTrailingSemicolon(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Block bodies should not be followed by a semicolon. Due to C++11 - # brace initialization, there are more places where semicolons are - # required than not, so we use a whitelist approach to check these - # rather than a blacklist. These are the places where "};" should - # be replaced by just "}": - # 1. Some flavor of block following closing parenthesis: - # for (;;) {}; - # while (...) {}; - # switch (...) {}; - # Function(...) {}; - # if (...) {}; - # if (...) else if (...) {}; - # - # 2. else block: - # if (...) else {}; - # - # 3. const member function: - # Function(...) const {}; - # - # 4. Block following some statement: - # x = 42; - # {}; - # - # 5. Block at the beginning of a function: - # Function(...) 
{ - # {}; - # } - # - # Note that naively checking for the preceding "{" will also match - # braces inside multi-dimensional arrays, but this is fine since - # that expression will not contain semicolons. - # - # 6. Block following another block: - # while (true) {} - # {}; - # - # 7. End of namespaces: - # namespace {}; - # - # These semicolons seems far more common than other kinds of - # redundant semicolons, possibly due to people converting classes - # to namespaces. For now we do not warn for this case. - # - # Try matching case 1 first. - match = Match(r'^(.*\)\s*)\{', line) - if match: - # Matched closing parenthesis (case 1). Check the token before the - # matching opening parenthesis, and don't warn if it looks like a - # macro. This avoids these false positives: - # - macro that defines a base class - # - multi-line macro that defines a base class - # - macro that defines the whole class-head + line = clean_lines.elided[linenum] + + # Block bodies should not be followed by a semicolon. Due to C++11 + # brace initialization, there are more places where semicolons are + # required than not, so we use a whitelist approach to check these + # rather than a blacklist. These are the places where "};" should + # be replaced by just "}": + # 1. Some flavor of block following closing parenthesis: + # for (;;) {}; + # while (...) {}; + # switch (...) {}; + # Function(...) {}; + # if (...) {}; + # if (...) else if (...) {}; # - # But we still issue warnings for macros that we know are safe to - # warn, specifically: - # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P - # - TYPED_TEST - # - INTERFACE_DEF - # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: + # 2. else block: + # if (...) else {}; # - # We implement a whitelist of safe macros instead of a blacklist of - # unsafe macros, even though the latter appears less frequently in - # google code and would have been easier to implement. This is because - # the downside for getting the whitelist wrong means some extra - # semicolons, while the downside for getting the blacklist wrong - # would result in compile errors. + # 3. const member function: + # Function(...) const {}; # - # In addition to macros, we also don't want to warn on - # - Compound literals - # - Lambdas - # - alignas specifier with anonymous structs: - closing_brace_pos = match.group(1).rfind(')') - opening_parenthesis = ReverseCloseExpression( - clean_lines, linenum, closing_brace_pos) - if opening_parenthesis[2] > -1: - line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] - macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) - func = Match(r'^(.*\])\s*$', line_prefix) - if ((macro and - macro.group(1) not in ( - 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', - 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', - 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or - (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or - Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or - Search(r'\s+=\s*$', line_prefix)): - match = None - if (match and - opening_parenthesis[1] > 1 and - Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])): - # Multi-line lambda-expression - match = None - - else: - # Try matching cases 2-3. - match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) - if not match: - # Try matching cases 4-6. These are always matched on separate lines. 
- # - # Note that we can't simply concatenate the previous line to the - # current line and do a single match, otherwise we may output - # duplicate warnings for the blank line case: - # if (cond) { - # // blank line - # } - prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] - if prevline and Search(r'[;{}]\s*$', prevline): - match = Match(r'^(\s*)\{', line) - - # Check matching closing brace - if match: - (endline, endlinenum, endpos) = CloseExpression( - clean_lines, linenum, len(match.group(1))) - if endpos > -1 and Match(r'^\s*;', endline[endpos:]): - # Current {} pair is eligible for semicolon check, and we have found - # the redundant semicolon, output warning here. - # - # Note: because we are scanning forward for opening braces, and - # outputting warnings for the matching closing brace, if there are - # nested blocks with trailing semicolons, we will get the error - # messages in reversed order. - error(filename, endlinenum, 'readability/braces', 4, - "You don't need a ; after a }") + # 4. Block following some statement: + # x = 42; + # {}; + # + # 5. Block at the beginning of a function: + # Function(...) { + # {}; + # } + # + # Note that naively checking for the preceding "{" will also match + # braces inside multi-dimensional arrays, but this is fine since + # that expression will not contain semicolons. + # + # 6. Block following another block: + # while (true) {} + # {}; + # + # 7. End of namespaces: + # namespace {}; + # + # These semicolons seems far more common than other kinds of + # redundant semicolons, possibly due to people converting classes + # to namespaces. For now we do not warn for this case. + # + # Try matching case 1 first. + match = Match(r'^(.*\)\s*)\{', line) + if match: + # Matched closing parenthesis (case 1). Check the token before the + # matching opening parenthesis, and don't warn if it looks like a + # macro. This avoids these false positives: + # - macro that defines a base class + # - multi-line macro that defines a base class + # - macro that defines the whole class-head + # + # But we still issue warnings for macros that we know are safe to + # warn, specifically: + # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P + # - TYPED_TEST + # - INTERFACE_DEF + # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: + # + # We implement a whitelist of safe macros instead of a blacklist of + # unsafe macros, even though the latter appears less frequently in + # google code and would have been easier to implement. This is because + # the downside for getting the whitelist wrong means some extra + # semicolons, while the downside for getting the blacklist wrong + # would result in compile errors. 
+ # + # In addition to macros, we also don't want to warn on + # - Compound literals + # - Lambdas + # - alignas specifier with anonymous structs: + closing_brace_pos = match.group(1).rfind(')') + opening_parenthesis = ReverseCloseExpression(clean_lines, linenum, + closing_brace_pos) + if opening_parenthesis[2] > -1: + line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] + macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + func = Match(r'^(.*\])\s*$', line_prefix) + if ((macro and macro.group(1) not in + ('TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', + 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', + 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or + Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or + Search(r'\s+=\s*$', line_prefix)): + match = None + if (match and opening_parenthesis[1] > 1 and Search( + r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])): + # Multi-line lambda-expression + match = None + + else: + # Try matching cases 2-3. + match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) + if not match: + # Try matching cases 4-6. These are always matched on separate lines. + # + # Note that we can't simply concatenate the previous line to the + # current line and do a single match, otherwise we may output + # duplicate warnings for the blank line case: + # if (cond) { + # // blank line + # } + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if prevline and Search(r'[;{}]\s*$', prevline): + match = Match(r'^(\s*)\{', line) + + # Check matching closing brace + if match: + (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, + len(match.group(1))) + if endpos > -1 and Match(r'^\s*;', endline[endpos:]): + # Current {} pair is eligible for semicolon check, and we have found + # the redundant semicolon, output warning here. + # + # Note: because we are scanning forward for opening braces, and + # outputting warnings for the matching closing brace, if there are + # nested blocks with trailing semicolons, we will get the error + # messages in reversed order. + error(filename, endlinenum, 'readability/braces', 4, + "You don't need a ; after a }") def CheckEmptyBlockBody(filename, clean_lines, linenum, error): - """Look for empty loop/conditional body with only a single semicolon. + """Look for empty loop/conditional body with only a single semicolon. Args: filename: The name of the current file. @@ -4159,33 +4189,34 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ - # Search for loop keywords at the beginning of the line. Because only - # whitespaces are allowed before the keywords, this will also ignore most - # do-while-loops, since those lines should start with closing brace. - # - # We also check "if" blocks here, since an empty conditional block - # is likely an error. - line = clean_lines.elided[linenum] - matched = Match(r'\s*(for|while|if)\s*\(', line) - if matched: - # Find the end of the conditional expression - (end_line, end_linenum, end_pos) = CloseExpression( - clean_lines, linenum, line.find('(')) - - # Output warning if what follows the condition expression is a semicolon. - # No warning for all other cases, including whitespace or newline, since we - # have a separate check for semicolons preceded by whitespace. 
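As a pocket version of case 1 from the CheckTrailingSemicolon hunk above, assuming the whole construct fits on one line so that CloseExpression and ReverseCloseExpression are unnecessary:

    import re

    # Hypothetical single-line reduction: a block following a closing
    # parenthesis, with a redundant ';' after its '}'.
    line = 'while (true) {};'
    match = re.match(r'^(.*\)\s*)\{', line)
    if match:
        closing = line.find('}', len(match.group(1)))
        if closing != -1 and re.match(r'^\s*;', line[closing + 1:]):
            print("You don't need a ; after a }")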
- if end_pos >= 0 and Match(r';', end_line[end_pos:]): - if matched.group(1) == 'if': - error(filename, end_linenum, 'whitespace/empty_conditional_body', 5, - 'Empty conditional bodies should use {}') - else: - error(filename, end_linenum, 'whitespace/empty_loop_body', 5, - 'Empty loop bodies should use {} or continue') + # Search for loop keywords at the beginning of the line. Because only + # whitespaces are allowed before the keywords, this will also ignore most + # do-while-loops, since those lines should start with closing brace. + # + # We also check "if" blocks here, since an empty conditional block + # is likely an error. + line = clean_lines.elided[linenum] + matched = Match(r'\s*(for|while|if)\s*\(', line) + if matched: + # Find the end of the conditional expression + (end_line, end_linenum, end_pos) = CloseExpression(clean_lines, linenum, + line.find('(')) + + # Output warning if what follows the condition expression is a semicolon. + # No warning for all other cases, including whitespace or newline, since we + # have a separate check for semicolons preceded by whitespace. + if end_pos >= 0 and Match(r';', end_line[end_pos:]): + if matched.group(1) == 'if': + error(filename, end_linenum, + 'whitespace/empty_conditional_body', 5, + 'Empty conditional bodies should use {}') + else: + error(filename, end_linenum, 'whitespace/empty_loop_body', 5, + 'Empty loop bodies should use {} or continue') def FindCheckMacro(line): - """Find a replaceable CHECK-like macro. + """Find a replaceable CHECK-like macro. Args: line: line to search on. @@ -4193,22 +4224,22 @@ def FindCheckMacro(line): (macro name, start position), or (None, -1) if no replaceable macro is found. """ - for macro in _CHECK_MACROS: - i = line.find(macro) - if i >= 0: - # Find opening parenthesis. Do a regular expression match here - # to make sure that we are matching the expected CHECK macro, as - # opposed to some other macro that happens to contain the CHECK - # substring. - matched = Match(r'^(.*\b' + macro + r'\s*)\(', line) - if not matched: - continue - return (macro, len(matched.group(1))) - return (None, -1) + for macro in _CHECK_MACROS: + i = line.find(macro) + if i >= 0: + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + macro + r'\s*)\(', line) + if not matched: + continue + return (macro, len(matched.group(1))) + return (None, -1) def CheckCheck(filename, clean_lines, linenum, error): - """Checks the use of CHECK and EXPECT macros. + """Checks the use of CHECK and EXPECT macros. Args: filename: The name of the current file. @@ -4217,116 +4248,116 @@ def CheckCheck(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ - # Decide the set of replacement macros that should be suggested - lines = clean_lines.elided - (check_macro, start_pos) = FindCheckMacro(lines[linenum]) - if not check_macro: - return - - # Find end of the boolean expression by matching parentheses - (last_line, end_line, end_pos) = CloseExpression( - clean_lines, linenum, start_pos) - if end_pos < 0: - return - - # If the check macro is followed by something other than a - # semicolon, assume users will log their own custom error messages - # and don't suggest any replacements. 
- if not Match(r'\s*;', last_line[end_pos:]): - return - - if linenum == end_line: - expression = lines[linenum][start_pos + 1:end_pos - 1] - else: - expression = lines[linenum][start_pos + 1:] - for i in xrange(linenum + 1, end_line): - expression += lines[i] - expression += last_line[0:end_pos - 1] - - # Parse expression so that we can take parentheses into account. - # This avoids false positives for inputs like "CHECK((a < 4) == b)", - # which is not replaceable by CHECK_LE. - lhs = '' - rhs = '' - operator = None - while expression: - matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' - r'==|!=|>=|>|<=|<|\()(.*)$', expression) - if matched: - token = matched.group(1) - if token == '(': - # Parenthesized operand - expression = matched.group(2) - (end, _) = FindEndOfExpressionInLine(expression, 0, ['(']) - if end < 0: - return # Unmatched parenthesis - lhs += '(' + expression[0:end] - expression = expression[end:] - elif token in ('&&', '||'): - # Logical and/or operators. This means the expression - # contains more than one term, for example: - # CHECK(42 < a && a < b); - # - # These are not replaceable with CHECK_LE, so bail out early. + # Decide the set of replacement macros that should be suggested + lines = clean_lines.elided + (check_macro, start_pos) = FindCheckMacro(lines[linenum]) + if not check_macro: return - elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): - # Non-relational operator - lhs += token - expression = matched.group(2) - else: - # Relational operator - operator = token - rhs = matched.group(2) - break + + # Find end of the boolean expression by matching parentheses + (last_line, end_line, end_pos) = CloseExpression(clean_lines, linenum, + start_pos) + if end_pos < 0: + return + + # If the check macro is followed by something other than a + # semicolon, assume users will log their own custom error messages + # and don't suggest any replacements. + if not Match(r'\s*;', last_line[end_pos:]): + return + + if linenum == end_line: + expression = lines[linenum][start_pos + 1:end_pos - 1] else: - # Unparenthesized operand. Instead of appending to lhs one character - # at a time, we do another regular expression match to consume several - # characters at once if possible. Trivial benchmark shows that this - # is more efficient when the operands are longer than a single - # character, which is generally the case. - matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) - if not matched: - matched = Match(r'^(\s*\S)(.*)$', expression) - if not matched: - break - lhs += matched.group(1) - expression = matched.group(2) - - # Only apply checks if we got all parts of the boolean expression - if not (lhs and operator and rhs): - return - - # Check that rhs do not contain logical operators. We already know - # that lhs is fine since the loop above parses out && and ||. - if rhs.find('&&') > -1 or rhs.find('||') > -1: - return - - # At least one of the operands must be a constant literal. This is - # to avoid suggesting replacements for unprintable things like - # CHECK(variable != iterator) - # - # The following pattern matches decimal, hex integers, strings, and - # characters (in that order). 
- lhs = lhs.strip() - rhs = rhs.strip() - match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' - if Match(match_constant, lhs) or Match(match_constant, rhs): - # Note: since we know both lhs and rhs, we can provide a more - # descriptive error message like: - # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) - # Instead of: - # Consider using CHECK_EQ instead of CHECK(a == b) + expression = lines[linenum][start_pos + 1:] + for i in xrange(linenum + 1, end_line): + expression += lines[i] + expression += last_line[0:end_pos - 1] + + # Parse expression so that we can take parentheses into account. + # This avoids false positives for inputs like "CHECK((a < 4) == b)", + # which is not replaceable by CHECK_LE. + lhs = '' + rhs = '' + operator = None + while expression: + matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' + r'==|!=|>=|>|<=|<|\()(.*)$', expression) + if matched: + token = matched.group(1) + if token == '(': + # Parenthesized operand + expression = matched.group(2) + (end, _) = FindEndOfExpressionInLine(expression, 0, ['(']) + if end < 0: + return # Unmatched parenthesis + lhs += '(' + expression[0:end] + expression = expression[end:] + elif token in ('&&', '||'): + # Logical and/or operators. This means the expression + # contains more than one term, for example: + # CHECK(42 < a && a < b); + # + # These are not replaceable with CHECK_LE, so bail out early. + return + elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): + # Non-relational operator + lhs += token + expression = matched.group(2) + else: + # Relational operator + operator = token + rhs = matched.group(2) + break + else: + # Unparenthesized operand. Instead of appending to lhs one character + # at a time, we do another regular expression match to consume several + # characters at once if possible. Trivial benchmark shows that this + # is more efficient when the operands are longer than a single + # character, which is generally the case. + matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) + if not matched: + matched = Match(r'^(\s*\S)(.*)$', expression) + if not matched: + break + lhs += matched.group(1) + expression = matched.group(2) + + # Only apply checks if we got all parts of the boolean expression + if not (lhs and operator and rhs): + return + + # Check that rhs do not contain logical operators. We already know + # that lhs is fine since the loop above parses out && and ||. + if rhs.find('&&') > -1 or rhs.find('||') > -1: + return + + # At least one of the operands must be a constant literal. This is + # to avoid suggesting replacements for unprintable things like + # CHECK(variable != iterator) # - # We are still keeping the less descriptive message because if lhs - # or rhs gets long, the error message might become unreadable. - error(filename, linenum, 'readability/check', 2, - 'Consider using %s instead of %s(a %s b)' % ( - _CHECK_REPLACEMENT[check_macro][operator], - check_macro, operator)) + # The following pattern matches decimal, hex integers, strings, and + # characters (in that order). 
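Concretely, the lhs/operator/rhs split above is what turns CHECK(x == 42) into a CHECK_EQ suggestion. A simplified sketch follows; the flat replacement table and the suggest helper are invented stand-ins, and parenthesized operands, '<<' streams, and multi-line conditions are ignored.

    import re

    _REPLACEMENT = {'==': 'CHECK_EQ', '!=': 'CHECK_NE', '>=': 'CHECK_GE',
                    '>': 'CHECK_GT', '<=': 'CHECK_LE', '<': 'CHECK_LT'}

    def suggest(expression):
        # Hypothetical one-pass version of the parse loop above.
        matched = re.match(r'^\s*(.*?)\s*(==|!=|>=|>|<=|<)\s*(.*?)\s*$',
                           expression)
        if not matched:
            return None
        lhs, operator, rhs = matched.groups()
        # Same constant-literal pattern as above: one operand must be a
        # decimal or hex integer, a string, or a character literal.
        constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$'
        if re.match(constant, lhs) or re.match(constant, rhs):
            return 'Consider using %s(%s, %s) instead of CHECK(%s)' % (
                _REPLACEMENT[operator], lhs, rhs, expression)
        return None

    print(suggest('x == 42'))  # Consider using CHECK_EQ(x, 42) ...
    print(suggest('a == b'))   # None: neither operand is a literal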
+ lhs = lhs.strip() + rhs = rhs.strip() + match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' + if Match(match_constant, lhs) or Match(match_constant, rhs): + # Note: since we know both lhs and rhs, we can provide a more + # descriptive error message like: + # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) + # Instead of: + # Consider using CHECK_EQ instead of CHECK(a == b) + # + # We are still keeping the less descriptive message because if lhs + # or rhs gets long, the error message might become unreadable. + error(filename, linenum, 'readability/check', 2, + 'Consider using %s instead of %s(a %s b)' % + (_CHECK_REPLACEMENT[check_macro][operator], check_macro, + operator)) def CheckAltTokens(filename, clean_lines, linenum, error): - """Check alternative keywords being used in boolean expressions. + """Check alternative keywords being used in boolean expressions. Args: filename: The name of the current file. @@ -4334,31 +4365,31 @@ def CheckAltTokens(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] + line = clean_lines.elided[linenum] - # Avoid preprocessor lines - if Match(r'^\s*#', line): - return + # Avoid preprocessor lines + if Match(r'^\s*#', line): + return - # Last ditch effort to avoid multi-line comments. This will not help - # if the comment started before the current line or ended after the - # current line, but it catches most of the false positives. At least, - # it provides a way to workaround this warning for people who use - # multi-line comments in preprocessor macros. - # - # TODO(unknown): remove this once cpplint has better support for - # multi-line comments. - if line.find('/*') >= 0 or line.find('*/') >= 0: - return + # Last ditch effort to avoid multi-line comments. This will not help + # if the comment started before the current line or ended after the + # current line, but it catches most of the false positives. At least, + # it provides a way to workaround this warning for people who use + # multi-line comments in preprocessor macros. + # + # TODO(unknown): remove this once cpplint has better support for + # multi-line comments. + if line.find('/*') >= 0 or line.find('*/') >= 0: + return - for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): - error(filename, linenum, 'readability/alt_tokens', 2, - 'Use operator %s instead of %s' % ( - _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) + for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): + error(filename, linenum, 'readability/alt_tokens', 2, + 'Use operator %s instead of %s' % ( + _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) def GetLineWidth(line): - """Determines the width of the line in column positions. + """Determines the width of the line in column positions. Args: line: A string, which may be a Unicode string. @@ -4367,21 +4398,21 @@ def GetLineWidth(line): The width of the line in column positions, accounting for Unicode combining characters and wide characters. 
""" - if isinstance(line, unicode): - width = 0 - for uc in unicodedata.normalize('NFC', line): - if unicodedata.east_asian_width(uc) in ('W', 'F'): - width += 2 - elif not unicodedata.combining(uc): - width += 1 - return width - else: - return len(line) + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + else: + return len(line) def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, error): - """Checks rules from the 'C++ style rules' section of cppguide.html. + """Checks rules from the 'C++ style rules' section of cppguide.html. Most of these rules are hard to test (naming, comment style), but we do what we can. In particular we check for 2-space indents, line lengths, @@ -4397,105 +4428,105 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, error: The function to call with any errors found. """ - # Don't use "elided" lines here, otherwise we can't check commented lines. - # Don't want to use "raw" either, because we don't want to check inside C++11 - # raw strings, - raw_lines = clean_lines.lines_without_raw_strings - line = raw_lines[linenum] - - if line.find('\t') != -1: - error(filename, linenum, 'whitespace/tab', 1, - 'Tab found; better to use spaces') - - # One or three blank spaces at the beginning of the line is weird; it's - # hard to reconcile that with 2-space indents. - # NOTE: here are the conditions rob pike used for his tests. Mine aren't - # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces - # if(RLENGTH > 20) complain = 0; - # if(match($0, " +(error|private|public|protected):")) complain = 0; - # if(match(prev, "&& *$")) complain = 0; - # if(match(prev, "\\|\\| *$")) complain = 0; - # if(match(prev, "[\",=><] *$")) complain = 0; - # if(match($0, " <<")) complain = 0; - # if(match(prev, " +for \\(")) complain = 0; - # if(prevodd && match(prevprev, " +for \\(")) complain = 0; - scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$' - classinfo = nesting_state.InnermostClass() - initial_spaces = 0 - cleansed_line = clean_lines.elided[linenum] - while initial_spaces < len(line) and line[initial_spaces] == ' ': - initial_spaces += 1 - if line and line[-1].isspace(): - error(filename, linenum, 'whitespace/end_of_line', 4, - 'Line ends in whitespace. Consider deleting these extra spaces.') - # There are certain situations we allow one space, notably for - # section labels, and also lines containing multi-line raw strings. - elif ((initial_spaces == 1 or initial_spaces == 3) and - not Match(scope_or_label_pattern, cleansed_line) and - not (clean_lines.raw_lines[linenum] != line and - Match(r'^\s*""', line))): - error(filename, linenum, 'whitespace/indent', 3, - 'Weird number of spaces at line-start. ' - 'Are you using a 2-space indent?') - - # Check if the line is a header guard. - is_header_guard = False - if file_extension == 'h': - cppvar = GetHeaderGuardCPPVariable(filename) - if (line.startswith('#ifndef %s' % cppvar) or - line.startswith('#define %s' % cppvar) or - line.startswith('#endif // %s' % cppvar)): - is_header_guard = True - # #include lines and header guards can be long, since there's no clean way to - # split them. - # - # URLs can be long too. It's possible to split these, but it makes them - # harder to cut&paste. - # - # The "$Id:...$" comment may also get very long without it being the - # developers fault. 
- if (not line.startswith('#include') and not is_header_guard and - not Match(r'^\s*//.*http(s?)://\S*$', line) and - not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): - line_width = GetLineWidth(line) - extended_length = int((_line_length * 1.25)) - if line_width > extended_length: - error(filename, linenum, 'whitespace/line_length', 4, - 'Lines should very rarely be longer than %i characters' % - extended_length) - elif line_width > _line_length: - error(filename, linenum, 'whitespace/line_length', 2, - 'Lines should be <= %i characters long' % _line_length) - - if (cleansed_line.count(';') > 1 and - # for loops are allowed two ;'s (and may run over two lines). - cleansed_line.find('for') == -1 and - (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or - GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and - # It's ok to have many commands in a switch case that fits in 1 line - not ((cleansed_line.find('case ') != -1 or - cleansed_line.find('default:') != -1) and - cleansed_line.find('break;') != -1)): - error(filename, linenum, 'whitespace/newline', 0, - 'More than one command on the same line') - - # Some more style checks - CheckBraces(filename, clean_lines, linenum, error) - CheckTrailingSemicolon(filename, clean_lines, linenum, error) - CheckEmptyBlockBody(filename, clean_lines, linenum, error) - CheckAccess(filename, clean_lines, linenum, nesting_state, error) - CheckSpacing(filename, clean_lines, linenum, nesting_state, error) - CheckOperatorSpacing(filename, clean_lines, linenum, error) - CheckParenthesisSpacing(filename, clean_lines, linenum, error) - CheckCommaSpacing(filename, clean_lines, linenum, error) - CheckBracesSpacing(filename, clean_lines, linenum, error) - CheckSpacingForFunctionCall(filename, clean_lines, linenum, error) - CheckRValueReference(filename, clean_lines, linenum, nesting_state, error) - CheckCheck(filename, clean_lines, linenum, error) - CheckAltTokens(filename, clean_lines, linenum, error) - classinfo = nesting_state.InnermostClass() - if classinfo: - CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw_lines = clean_lines.lines_without_raw_strings + line = raw_lines[linenum] + + if line.find('\t') != -1: + error(filename, linenum, 'whitespace/tab', 1, + 'Tab found; better to use spaces') + + # One or three blank spaces at the beginning of the line is weird; it's + # hard to reconcile that with 2-space indents. + # NOTE: here are the conditions rob pike used for his tests. Mine aren't + # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces + # if(RLENGTH > 20) complain = 0; + # if(match($0, " +(error|private|public|protected):")) complain = 0; + # if(match(prev, "&& *$")) complain = 0; + # if(match(prev, "\\|\\| *$")) complain = 0; + # if(match(prev, "[\",=><] *$")) complain = 0; + # if(match($0, " <<")) complain = 0; + # if(match(prev, " +for \\(")) complain = 0; + # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$' + classinfo = nesting_state.InnermostClass() + initial_spaces = 0 + cleansed_line = clean_lines.elided[linenum] + while initial_spaces < len(line) and line[initial_spaces] == ' ': + initial_spaces += 1 + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. 
Consider deleting these extra spaces.') + # There are certain situations we allow one space, notably for + # section labels, and also lines containing multi-line raw strings. + elif ((initial_spaces == 1 or initial_spaces == 3) and + not Match(scope_or_label_pattern, cleansed_line) and + not (clean_lines.raw_lines[linenum] != line and + Match(r'^\s*""', line))): + error(filename, linenum, 'whitespace/indent', 3, + 'Weird number of spaces at line-start. ' + 'Are you using a 2-space indent?') + + # Check if the line is a header guard. + is_header_guard = False + if file_extension == 'h': + cppvar = GetHeaderGuardCPPVariable(filename) + if (line.startswith('#ifndef %s' % cppvar) or + line.startswith('#define %s' % cppvar) or + line.startswith('#endif // %s' % cppvar)): + is_header_guard = True + # #include lines and header guards can be long, since there's no clean way to + # split them. + # + # URLs can be long too. It's possible to split these, but it makes them + # harder to cut&paste. + # + # The "$Id:...$" comment may also get very long without it being the + # developers fault. + if (not line.startswith('#include') and not is_header_guard and + not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): + line_width = GetLineWidth(line) + extended_length = int((_line_length * 1.25)) + if line_width > extended_length: + error(filename, linenum, 'whitespace/line_length', 4, + 'Lines should very rarely be longer than %i characters' % + extended_length) + elif line_width > _line_length: + error(filename, linenum, 'whitespace/line_length', 2, + 'Lines should be <= %i characters long' % _line_length) + + if (cleansed_line.count(';') > 1 and + # for loops are allowed two ;'s (and may run over two lines). + cleansed_line.find('for') == -1 and + (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or + GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and + # It's ok to have many commands in a switch case that fits in 1 line + not ((cleansed_line.find('case ') != -1 or + cleansed_line.find('default:') != -1) and + cleansed_line.find('break;') != -1)): + error(filename, linenum, 'whitespace/newline', 0, + 'More than one command on the same line') + + # Some more style checks + CheckBraces(filename, clean_lines, linenum, error) + CheckTrailingSemicolon(filename, clean_lines, linenum, error) + CheckEmptyBlockBody(filename, clean_lines, linenum, error) + CheckAccess(filename, clean_lines, linenum, nesting_state, error) + CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckOperatorSpacing(filename, clean_lines, linenum, error) + CheckParenthesisSpacing(filename, clean_lines, linenum, error) + CheckCommaSpacing(filename, clean_lines, linenum, error) + CheckBracesSpacing(filename, clean_lines, linenum, error) + CheckSpacingForFunctionCall(filename, clean_lines, linenum, error) + CheckRValueReference(filename, clean_lines, linenum, nesting_state, error) + CheckCheck(filename, clean_lines, linenum, error) + CheckAltTokens(filename, clean_lines, linenum, error) + classinfo = nesting_state.InnermostClass() + if classinfo: + CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) _RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') @@ -4508,7 +4539,7 @@ _RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') def _DropCommonSuffixes(filename): - """Drops common suffixes like _test.cc or -inl.h from filename. + """Drops common suffixes like _test.cc or -inl.h from filename. 
For example: >>> _DropCommonSuffixes('foo/foo-inl.h') @@ -4526,16 +4557,16 @@ def _DropCommonSuffixes(filename): Returns: The filename with the common suffix removed. """ - for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', - 'inl.h', 'impl.h', 'internal.h'): - if (filename.endswith(suffix) and len(filename) > len(suffix) and - filename[-len(suffix) - 1] in ('-', '_')): - return filename[:-len(suffix) - 1] - return os.path.splitext(filename)[0] + for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', 'inl.h', 'impl.h', + 'internal.h'): + if (filename.endswith(suffix) and len(filename) > len(suffix) and + filename[-len(suffix) - 1] in ('-', '_')): + return filename[:-len(suffix) - 1] + return os.path.splitext(filename)[0] def _IsTestFilename(filename): - """Determines if the given filename has a suffix that identifies it as a test. + """Determines if the given filename has a suffix that identifies it as a test. Args: filename: The input filename. @@ -4543,16 +4574,15 @@ def _IsTestFilename(filename): Returns: True if 'filename' looks like a test, False otherwise. """ - if (filename.endswith('_test.cc') or - filename.endswith('_unittest.cc') or - filename.endswith('_regtest.cc')): - return True - else: - return False + if (filename.endswith('_test.cc') or filename.endswith('_unittest.cc') or + filename.endswith('_regtest.cc')): + return True + else: + return False def _ClassifyInclude(fileinfo, include, is_system): - """Figures out what kind of header 'include' is. + """Figures out what kind of header 'include' is. Args: fileinfo: The current file cpplint is running over. A FileInfo instance. @@ -4575,44 +4605,43 @@ def _ClassifyInclude(fileinfo, include, is_system): >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) _OTHER_HEADER """ - # This is a list of all standard c++ header files, except - # those already checked for above. - is_cpp_h = include in _CPP_HEADERS - - if is_system: - if is_cpp_h: - return _CPP_SYS_HEADER - else: - return _C_SYS_HEADER - - # If the target file and the include we're checking share a - # basename when we drop common extensions, and the include - # lives in . , then it's likely to be owned by the target file. - target_dir, target_base = ( - os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) - include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) - if target_base == include_base and ( - include_dir == target_dir or - include_dir == os.path.normpath(target_dir + '/../public')): - return _LIKELY_MY_HEADER - - # If the target and include share some initial basename - # component, it's possible the target is implementing the - # include, so it's allowed to be first, but we'll never - # complain if it's not there. - target_first_component = _RE_FIRST_COMPONENT.match(target_base) - include_first_component = _RE_FIRST_COMPONENT.match(include_base) - if (target_first_component and include_first_component and - target_first_component.group(0) == - include_first_component.group(0)): - return _POSSIBLE_MY_HEADER - - return _OTHER_HEADER + # This is a list of all standard c++ header files, except + # those already checked for above. + is_cpp_h = include in _CPP_HEADERS + if is_system: + if is_cpp_h: + return _CPP_SYS_HEADER + else: + return _C_SYS_HEADER + + # If the target file and the include we're checking share a + # basename when we drop common extensions, and the include + # lives in . , then it's likely to be owned by the target file. 
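
As a reviewer aid: the suffix-dropping rule re-indented above is easy to check in isolation. This sketch re-creates the same logic outside cpplint (the lowercase name is local to the example), and its output agrees with the doctest:

import os

def drop_common_suffixes(filename):
    # Same rule as _DropCommonSuffixes: strip a recognized suffix when it is
    # joined to the stem with '-' or '_'; otherwise drop the extension.
    for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', 'inl.h', 'impl.h',
                   'internal.h'):
        if (filename.endswith(suffix) and len(filename) > len(suffix) and
                filename[-len(suffix) - 1] in ('-', '_')):
            return filename[:-len(suffix) - 1]
    return os.path.splitext(filename)[0]

print(drop_common_suffixes('foo/foo-inl.h'))        # foo/foo
print(drop_common_suffixes('foo/bar_unittest.cc'))  # foo/bar
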
+ target_dir, target_base = ( + os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) + include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) + if target_base == include_base and ( + include_dir == target_dir or + include_dir == os.path.normpath(target_dir + '/../public')): + return _LIKELY_MY_HEADER + + # If the target and include share some initial basename + # component, it's possible the target is implementing the + # include, so it's allowed to be first, but we'll never + # complain if it's not there. + target_first_component = _RE_FIRST_COMPONENT.match(target_base) + include_first_component = _RE_FIRST_COMPONENT.match(include_base) + if (target_first_component and include_first_component and + target_first_component.group(0) == + include_first_component.group(0)): + return _POSSIBLE_MY_HEADER + + return _OTHER_HEADER def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): - """Check rules that are applicable to #include lines. + """Check rules that are applicable to #include lines. Strings on #include lines are NOT removed from elided line, to make certain tasks easier. However, to prevent false positives, checks @@ -4625,68 +4654,69 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): include_state: An _IncludeState instance in which the headers are inserted. error: The function to call with any errors found. """ - fileinfo = FileInfo(filename) - line = clean_lines.lines[linenum] - - # "include" should use the new style "foo/bar.h" instead of just "bar.h" - # Only do this check if the included header follows google naming - # conventions. If not, assume that it's a 3rd party API that - # requires special include conventions. - # - # We also make an exception for Lua headers, which follow google - # naming convention but not the include convention. - match = Match(r'#include\s*"([^/]+\.h)"', line) - if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)): - error(filename, linenum, 'build/include', 4, - 'Include the directory when naming .h files') - - # we shouldn't include a file more than once. actually, there are a - # handful of instances where doing so is okay, but in general it's - # not. - match = _RE_PATTERN_INCLUDE.search(line) - if match: - include = match.group(2) - is_system = (match.group(1) == '<') - duplicate_line = include_state.FindHeader(include) - if duplicate_line >= 0: - error(filename, linenum, 'build/include', 4, - '"%s" already included at %s:%s' % - (include, filename, duplicate_line)) - elif (include.endswith('.cc') and - os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)): - error(filename, linenum, 'build/include', 4, - 'Do not include .cc files from other packages') - elif not _THIRD_PARTY_HEADERS_PATTERN.match(include): - include_state.include_list[-1].append((include, linenum)) - - # We want to ensure that headers appear in the right order: - # 1) for foo.cc, foo.h (preferred location) - # 2) c system files - # 3) cpp system files - # 4) for foo.cc, foo.h (deprecated location) - # 5) other google headers - # - # We classify each include statement as one of those 5 types - # using a number of techniques. The include_state object keeps - # track of the highest type seen, and complains if we see a - # lower type after that. - error_message = include_state.CheckNextIncludeOrder( - _ClassifyInclude(fileinfo, include, is_system)) - if error_message: - error(filename, linenum, 'build/include_order', 4, - '%s. Should be: %s.h, c system, c++ system, other.' 
% - (error_message, fileinfo.BaseName())) - canonical_include = include_state.CanonicalizeAlphabeticalOrder(include) - if not include_state.IsInAlphabeticalOrder( - clean_lines, linenum, canonical_include): - error(filename, linenum, 'build/include_alpha', 4, - 'Include "%s" not in alphabetical order' % include) - include_state.SetLastHeader(canonical_include) + fileinfo = FileInfo(filename) + line = clean_lines.lines[linenum] + # "include" should use the new style "foo/bar.h" instead of just "bar.h" + # Only do this check if the included header follows google naming + # conventions. If not, assume that it's a 3rd party API that + # requires special include conventions. + # + # We also make an exception for Lua headers, which follow google + # naming convention but not the include convention. + match = Match(r'#include\s*"([^/]+\.h)"', line) + if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)): + error(filename, linenum, 'build/include', 4, + 'Include the directory when naming .h files') + + # we shouldn't include a file more than once. actually, there are a + # handful of instances where doing so is okay, but in general it's + # not. + match = _RE_PATTERN_INCLUDE.search(line) + if match: + include = match.group(2) + is_system = (match.group(1) == '<') + duplicate_line = include_state.FindHeader(include) + if duplicate_line >= 0: + error(filename, linenum, 'build/include', 4, + '"%s" already included at %s:%s' % + (include, filename, duplicate_line)) + elif (include.endswith('.cc') and + os.path.dirname(fileinfo.RepositoryName()) != + os.path.dirname(include)): + error(filename, linenum, 'build/include', 4, + 'Do not include .cc files from other packages') + elif not _THIRD_PARTY_HEADERS_PATTERN.match(include): + include_state.include_list[-1].append((include, linenum)) + + # We want to ensure that headers appear in the right order: + # 1) for foo.cc, foo.h (preferred location) + # 2) c system files + # 3) cpp system files + # 4) for foo.cc, foo.h (deprecated location) + # 5) other google headers + # + # We classify each include statement as one of those 5 types + # using a number of techniques. The include_state object keeps + # track of the highest type seen, and complains if we see a + # lower type after that. + error_message = include_state.CheckNextIncludeOrder( + _ClassifyInclude(fileinfo, include, is_system)) + if error_message: + error(filename, linenum, 'build/include_order', 4, + '%s. Should be: %s.h, c system, c++ system, other.' % + (error_message, fileinfo.BaseName())) + canonical_include = include_state.CanonicalizeAlphabeticalOrder( + include) + if not include_state.IsInAlphabeticalOrder(clean_lines, linenum, + canonical_include): + error(filename, linenum, 'build/include_alpha', 4, + 'Include "%s" not in alphabetical order' % include) + include_state.SetLastHeader(canonical_include) def _GetTextInside(text, start_pattern): - r"""Retrieves all the text between matching open and close parentheses. + r"""Retrieves all the text between matching open and close parentheses. Given a string of lines and a regular expression string, retrieve all the text following the expression and between opening punctuation symbols like @@ -4705,40 +4735,40 @@ def _GetTextInside(text, start_pattern): The extracted text. None if either the opening string or ending punctuation could not be found. """ - # TODO(unknown): Audit cpplint.py to see what places could be profitably - # rewritten to use _GetTextInside (and use inferior regexp matching today). 
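
For reference, the include checks above all start from _RE_PATTERN_INCLUDE (unchanged by this patch), which splits an #include line into delimiter and path; the delimiter decides system versus local. A standalone probe on two made-up lines:

import re

RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')

for text in ('#include <vector>', '#include "paddle/utils/Util.h"'):
    m = RE_PATTERN_INCLUDE.match(text)
    # group(1) is the opening delimiter; '<' marks a system header.
    print('%s system=%s' % (m.group(2), m.group(1) == '<'))
# -> vector system=True
# -> paddle/utils/Util.h system=False
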
- - # Give opening punctuations to get the matching close-punctuations. - matching_punctuation = {'(': ')', '{': '}', '[': ']'} - closing_punctuation = set(matching_punctuation.itervalues()) - - # Find the position to start extracting text. - match = re.search(start_pattern, text, re.M) - if not match: # start_pattern not found in text. - return None - start_position = match.end(0) - - assert start_position > 0, ( - 'start_pattern must ends with an opening punctuation.') - assert text[start_position - 1] in matching_punctuation, ( - 'start_pattern must ends with an opening punctuation.') - # Stack of closing punctuations we expect to have in text after position. - punctuation_stack = [matching_punctuation[text[start_position - 1]]] - position = start_position - while punctuation_stack and position < len(text): - if text[position] == punctuation_stack[-1]: - punctuation_stack.pop() - elif text[position] in closing_punctuation: - # A closing punctuation without matching opening punctuations. - return None - elif text[position] in matching_punctuation: - punctuation_stack.append(matching_punctuation[text[position]]) - position += 1 - if punctuation_stack: - # Opening punctuations left without matching close-punctuations. - return None - # punctuations match. - return text[start_position:position - 1] + # TODO(unknown): Audit cpplint.py to see what places could be profitably + # rewritten to use _GetTextInside (and use inferior regexp matching today). + + # Give opening punctuations to get the matching close-punctuations. + matching_punctuation = {'(': ')', '{': '}', '[': ']'} + closing_punctuation = set(matching_punctuation.itervalues()) + + # Find the position to start extracting text. + match = re.search(start_pattern, text, re.M) + if not match: # start_pattern not found in text. + return None + start_position = match.end(0) + + assert start_position > 0, ( + 'start_pattern must ends with an opening punctuation.') + assert text[start_position - 1] in matching_punctuation, ( + 'start_pattern must ends with an opening punctuation.') + # Stack of closing punctuations we expect to have in text after position. + punctuation_stack = [matching_punctuation[text[start_position - 1]]] + position = start_position + while punctuation_stack and position < len(text): + if text[position] == punctuation_stack[-1]: + punctuation_stack.pop() + elif text[position] in closing_punctuation: + # A closing punctuation without matching opening punctuations. + return None + elif text[position] in matching_punctuation: + punctuation_stack.append(matching_punctuation[text[position]]) + position += 1 + if punctuation_stack: + # Opening punctuations left without matching close-punctuations. + return None + # punctuations match. + return text[start_position:position - 1] # Patterns for matching call-by-reference parameters. @@ -4763,13 +4793,13 @@ _RE_PATTERN_REF_PARAM = re.compile( # A call-by-const-reference parameter either ends with 'const& identifier' # or looks like 'const type& identifier' when 'type' is atomic. _RE_PATTERN_CONST_REF_PARAM = ( - r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + - r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') + r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + r'|const\s+' + + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') -def CheckLanguage(filename, clean_lines, linenum, file_extension, - include_state, nesting_state, error): - """Checks rules from the 'C++ language rules' section of cppguide.html. 
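
The bracket matcher reformatted in _GetTextInside above can be exercised standalone. This sketch re-creates its stack discipline on a hypothetical line (it uses .values() so it also runs on Python 3, where the original uses Python 2's itervalues):

import re

def get_text_inside(text, start_pattern):
    matching = {'(': ')', '{': '}', '[': ']'}
    closing = set(matching.values())
    match = re.search(start_pattern, text, re.M)
    if not match:
        return None
    start = match.end(0)
    stack = [matching[text[start - 1]]]   # start_pattern must end with an opener
    pos = start
    while stack and pos < len(text):
        if text[pos] == stack[-1]:
            stack.pop()                   # matched the expected closer
        elif text[pos] in closing:
            return None                   # stray closer: unbalanced
        elif text[pos] in matching:
            stack.append(matching[text[pos]])
        pos += 1
    return None if stack else text[start:pos - 1]

print(get_text_inside('printf("%d", f(x));', r'printf\('))  # "%d", f(x)
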
+def CheckLanguage(filename, clean_lines, linenum, file_extension, include_state, + nesting_state, error): + """Checks rules from the 'C++ language rules' section of cppguide.html. Some of these rules are hard to test (function overloading, using uint32 inappropriately), but we do the best we can. @@ -4784,149 +4814,152 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, the current stack of nested blocks being parsed. error: The function to call with any errors found. """ - # If the line is empty or consists of entirely a comment, no need to - # check it. - line = clean_lines.elided[linenum] - if not line: - return - - match = _RE_PATTERN_INCLUDE.search(line) - if match: - CheckIncludeLine(filename, clean_lines, linenum, include_state, error) - return - - # Reset include state across preprocessor directives. This is meant - # to silence warnings for conditional includes. - match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line) - if match: - include_state.ResetSection(match.group(1)) - - # Make Windows paths like Unix. - fullname = os.path.abspath(filename).replace('\\', '/') - - # Perform other checks now that we are sure that this is not an include line - CheckCasts(filename, clean_lines, linenum, error) - CheckGlobalStatic(filename, clean_lines, linenum, error) - CheckPrintf(filename, clean_lines, linenum, error) - - if file_extension == 'h': - # TODO(unknown): check that 1-arg constructors are explicit. - # How to tell it's a constructor? - # (handled in CheckForNonStandardConstructs for now) - # TODO(unknown): check that classes declare or disable copy/assign - # (level 1 error) - pass + # If the line is empty or consists of entirely a comment, no need to + # check it. + line = clean_lines.elided[linenum] + if not line: + return - # Check if people are using the verboten C basic types. The only exception - # we regularly allow is "unsigned short port" for port. - if Search(r'\bshort port\b', line): - if not Search(r'\bunsigned short port\b', line): - error(filename, linenum, 'runtime/int', 4, - 'Use "unsigned short" for ports, not "short"') - else: - match = Search(r'\b(short|long(?! +double)|long long)\b', line) + match = _RE_PATTERN_INCLUDE.search(line) if match: - error(filename, linenum, 'runtime/int', 4, - 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) - - # Check if some verboten operator overloading is going on - # TODO(unknown): catch out-of-line unary operator&: - # class X {}; - # int operator&(const X& x) { return 42; } // unary operator& - # The trick is it's hard to tell apart from binary operator&: - # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& - if Search(r'\boperator\s*&\s*\(\s*\)', line): - error(filename, linenum, 'runtime/operator', 4, - 'Unary operator& is dangerous. Do not use it.') - - # Check for suspicious usage of "if" like - # } if (a == b) { - if Search(r'\}\s*if\s*\(', line): - error(filename, linenum, 'readability/braces', 4, - 'Did you mean "else if"? If not, start a new line for "if".') - - # Check for potential format string bugs like printf(foo). - # We constrain the pattern not to pick things like DocidForPrintf(foo). - # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) - # TODO(unknown): Catch the following case. Need to change the calling - # convention of the whole function to process multiple line to handle it. 
- # printf( - # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); - printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') - if printf_args: - match = Match(r'([\w.\->()]+)$', printf_args) - if match and match.group(1) != '__VA_ARGS__': - function_name = re.search(r'\b((?:string)?printf)\s*\(', - line, re.I).group(1) - error(filename, linenum, 'runtime/printf', 4, - 'Potential format string bug. Do %s("%%s", %s) instead.' - % (function_name, match.group(1))) - - # Check for potential memset bugs like memset(buf, sizeof(buf), 0). - match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) - if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): - error(filename, linenum, 'runtime/memset', 4, - 'Did you mean "memset(%s, 0, %s)"?' - % (match.group(1), match.group(2))) - - if Search(r'\busing namespace\b', line): - error(filename, linenum, 'build/namespaces', 5, - 'Do not use namespace using-directives. ' - 'Use using-declarations instead.') - - # Detect variable-length arrays. - match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) - if (match and match.group(2) != 'return' and match.group(2) != 'delete' and - match.group(3).find(']') == -1): - # Split the size using space and arithmetic operators as delimiters. - # If any of the resulting tokens are not compile time constants then - # report the error. - tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) - is_const = True - skip_next = False - for tok in tokens: - if skip_next: + CheckIncludeLine(filename, clean_lines, linenum, include_state, error) + return + + # Reset include state across preprocessor directives. This is meant + # to silence warnings for conditional includes. + match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line) + if match: + include_state.ResetSection(match.group(1)) + + # Make Windows paths like Unix. + fullname = os.path.abspath(filename).replace('\\', '/') + + # Perform other checks now that we are sure that this is not an include line + CheckCasts(filename, clean_lines, linenum, error) + CheckGlobalStatic(filename, clean_lines, linenum, error) + CheckPrintf(filename, clean_lines, linenum, error) + + if file_extension == 'h': + # TODO(unknown): check that 1-arg constructors are explicit. + # How to tell it's a constructor? + # (handled in CheckForNonStandardConstructs for now) + # TODO(unknown): check that classes declare or disable copy/assign + # (level 1 error) + pass + + # Check if people are using the verboten C basic types. The only exception + # we regularly allow is "unsigned short port" for port. + if Search(r'\bshort port\b', line): + if not Search(r'\bunsigned short port\b', line): + error(filename, linenum, 'runtime/int', 4, + 'Use "unsigned short" for ports, not "short"') + else: + match = Search(r'\b(short|long(?! +double)|long long)\b', line) + if match: + error(filename, linenum, 'runtime/int', 4, + 'Use int16/int64/etc, rather than the C type %s' % + match.group(1)) + + # Check if some verboten operator overloading is going on + # TODO(unknown): catch out-of-line unary operator&: + # class X {}; + # int operator&(const X& x) { return 42; } // unary operator& + # The trick is it's hard to tell apart from binary operator&: + # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& + if Search(r'\boperator\s*&\s*\(\s*\)', line): + error(filename, linenum, 'runtime/operator', 4, + 'Unary operator& is dangerous. 
Do not use it.') + + # Check for suspicious usage of "if" like + # } if (a == b) { + if Search(r'\}\s*if\s*\(', line): + error(filename, linenum, 'readability/braces', 4, + 'Did you mean "else if"? If not, start a new line for "if".') + + # Check for potential format string bugs like printf(foo). + # We constrain the pattern not to pick things like DocidForPrintf(foo). + # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) + # TODO(unknown): Catch the following case. Need to change the calling + # convention of the whole function to process multiple line to handle it. + # printf( + # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); + printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') + if printf_args: + match = Match(r'([\w.\->()]+)$', printf_args) + if match and match.group(1) != '__VA_ARGS__': + function_name = re.search(r'\b((?:string)?printf)\s*\(', line, + re.I).group(1) + error(filename, linenum, 'runtime/printf', 4, + 'Potential format string bug. Do %s("%%s", %s) instead.' % + (function_name, match.group(1))) + + # Check for potential memset bugs like memset(buf, sizeof(buf), 0). + match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) + if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): + error(filename, linenum, 'runtime/memset', 4, + 'Did you mean "memset(%s, 0, %s)"?' % + (match.group(1), match.group(2))) + + if Search(r'\busing namespace\b', line): + error(filename, linenum, 'build/namespaces', 5, + 'Do not use namespace using-directives. ' + 'Use using-declarations instead.') + + # Detect variable-length arrays. + match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) + if (match and match.group(2) != 'return' and match.group(2) != 'delete' and + match.group(3).find(']') == -1): + # Split the size using space and arithmetic operators as delimiters. + # If any of the resulting tokens are not compile time constants then + # report the error. + tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) + is_const = True skip_next = False - continue - - if Search(r'sizeof\(.+\)', tok): continue - if Search(r'arraysize\(\w+\)', tok): continue - - tok = tok.lstrip('(') - tok = tok.rstrip(')') - if not tok: continue - if Match(r'\d+', tok): continue - if Match(r'0[xX][0-9a-fA-F]+', tok): continue - if Match(r'k[A-Z0-9]\w*', tok): continue - if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue - if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue - # A catch all for tricky sizeof cases, including 'sizeof expression', - # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' - # requires skipping the next token because we split on ' ' and '*'. - if tok.startswith('sizeof'): - skip_next = True - continue - is_const = False - break - if not is_const: - error(filename, linenum, 'runtime/arrays', 1, - 'Do not use variable-length arrays. Use an appropriately named ' - "('k' followed by CamelCase) compile-time constant for the size.") - - # Check for use of unnamed namespaces in header files. Registration - # macros are typically OK, so we allow use of "namespace {" on lines - # that end with backslashes. - if (file_extension == 'h' - and Search(r'\bnamespace\s*{', line) - and line[-1] != '\\'): - error(filename, linenum, 'build/namespaces', 4, - 'Do not use unnamed namespaces in header files. 
See ' - 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' - ' for more information.') + for tok in tokens: + if skip_next: + skip_next = False + continue + + if Search(r'sizeof\(.+\)', tok): continue + if Search(r'arraysize\(\w+\)', tok): continue + + tok = tok.lstrip('(') + tok = tok.rstrip(')') + if not tok: continue + if Match(r'\d+', tok): continue + if Match(r'0[xX][0-9a-fA-F]+', tok): continue + if Match(r'k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue + # A catch all for tricky sizeof cases, including 'sizeof expression', + # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' + # requires skipping the next token because we split on ' ' and '*'. + if tok.startswith('sizeof'): + skip_next = True + continue + is_const = False + break + if not is_const: + error( + filename, linenum, 'runtime/arrays', 1, + 'Do not use variable-length arrays. Use an appropriately named ' + "('k' followed by CamelCase) compile-time constant for the size." + ) + + # Check for use of unnamed namespaces in header files. Registration + # macros are typically OK, so we allow use of "namespace {" on lines + # that end with backslashes. + if (file_extension == 'h' and Search(r'\bnamespace\s*{', line) and + line[-1] != '\\'): + error( + filename, linenum, 'build/namespaces', 4, + 'Do not use unnamed namespaces in header files. See ' + 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' + ' for more information.') def CheckGlobalStatic(filename, clean_lines, linenum, error): - """Check for unsafe global or static objects. + """Check for unsafe global or static objects. Args: filename: The name of the current file. @@ -4934,51 +4967,50 @@ def CheckGlobalStatic(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Match two lines at a time to support multiline declarations - if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line): - line += clean_lines.elided[linenum + 1].strip() - - # Check for people declaring static/global STL strings at the top level. - # This is dangerous because the C++ language does not guarantee that - # globals with constructors are initialized before the first access. - match = Match( - r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', - line) - - # Remove false positives: - # - String pointers (as opposed to values). - # string *pointer - # const string *pointer - # string const *pointer - # string *const pointer - # - # - Functions and template specializations. - # string Function(... - # string Class::Method(... - # - # - Operators. These are matched separately because operator names - # cross non-word boundaries, and trying to match both operators - # and functions at the same time would decrease accuracy of - # matching identifiers. - # string Class::operator*() - if (match and - not Search(r'\bstring\b(\s+const)?\s*\*\s*(const\s+)?\w', line) and - not Search(r'\boperator\W', line) and - not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(3))): - error(filename, linenum, 'runtime/string', 4, - 'For a static/global string constant, use a C style string instead: ' - '"%schar %s[]".' 
% - (match.group(1), match.group(2))) - - if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): - error(filename, linenum, 'runtime/init', 4, - 'You seem to be initializing a member variable with itself.') + line = clean_lines.elided[linenum] + + # Match two lines at a time to support multiline declarations + if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line): + line += clean_lines.elided[linenum + 1].strip() + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access. + match = Match(r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', + line) + + # Remove false positives: + # - String pointers (as opposed to values). + # string *pointer + # const string *pointer + # string const *pointer + # string *const pointer + # + # - Functions and template specializations. + # string Function(... + # string Class::Method(... + # + # - Operators. These are matched separately because operator names + # cross non-word boundaries, and trying to match both operators + # and functions at the same time would decrease accuracy of + # matching identifiers. + # string Class::operator*() + if (match and + not Search(r'\bstring\b(\s+const)?\s*\*\s*(const\s+)?\w', line) and + not Search(r'\boperator\W', line) and not Match( + r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(3))): + error( + filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string instead: ' + '"%schar %s[]".' % (match.group(1), match.group(2))) + + if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') def CheckPrintf(filename, clean_lines, linenum, error): - """Check for printf related issues. + """Check for printf related issues. Args: filename: The name of the current file. @@ -4986,28 +5018,28 @@ def CheckPrintf(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # When snprintf is used, the second argument shouldn't be a literal. - match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) - if match and match.group(2) != '0': - # If 2nd arg is zero, snprintf is used to calculate size. - error(filename, linenum, 'runtime/printf', 3, - 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' - 'to snprintf.' % (match.group(1), match.group(2))) - - # Check if some verboten C functions are being used. - if Search(r'\bsprintf\s*\(', line): - error(filename, linenum, 'runtime/printf', 5, - 'Never use sprintf. Use snprintf instead.') - match = Search(r'\b(strcpy|strcat)\s*\(', line) - if match: - error(filename, linenum, 'runtime/printf', 4, - 'Almost always, snprintf is better than %s' % match.group(1)) + line = clean_lines.elided[linenum] + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. 
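
The snprintf rule above is purely regex-driven; applied to a hypothetical call with a literal size argument it fires like this:

import re

line = 'snprintf(buf, 64, "%s", msg);'   # hypothetical call with a literal size
m = re.search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
if m and m.group(2) != '0':              # a second arg of 0 means "compute size"
    print('use sizeof(%s) instead of %s as the 2nd arg to snprintf'
          % (m.group(1), m.group(2)))
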
+ if Search(r'\bsprintf\s*\(', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\s*\(', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) def IsDerivedFunction(clean_lines, linenum): - """Check if current line contains an inherited function. + """Check if current line contains an inherited function. Args: clean_lines: A CleansedLines instance containing the file. @@ -5016,20 +5048,20 @@ def IsDerivedFunction(clean_lines, linenum): True if current line contains a function with "override" virt-specifier. """ - # Scan back a few lines for start of current function - for i in xrange(linenum, max(-1, linenum - 10), -1): - match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i]) - if match: - # Look for "override" after the matching closing parenthesis - line, _, closing_paren = CloseExpression( - clean_lines, i, len(match.group(1))) - return (closing_paren >= 0 and - Search(r'\boverride\b', line[closing_paren:])) - return False + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i]) + if match: + # Look for "override" after the matching closing parenthesis + line, _, closing_paren = CloseExpression(clean_lines, i, + len(match.group(1))) + return (closing_paren >= 0 and + Search(r'\boverride\b', line[closing_paren:])) + return False def IsOutOfLineMethodDefinition(clean_lines, linenum): - """Check if current line contains an out-of-line method definition. + """Check if current line contains an out-of-line method definition. Args: clean_lines: A CleansedLines instance containing the file. @@ -5037,15 +5069,16 @@ def IsOutOfLineMethodDefinition(clean_lines, linenum): Returns: True if current line contains an out-of-line method definition. """ - # Scan back a few lines for start of current function - for i in xrange(linenum, max(-1, linenum - 10), -1): - if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]): - return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None - return False + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]): + return Match(r'^[^()]*\w+::\w+\(', + clean_lines.elided[i]) is not None + return False def IsInitializerList(clean_lines, linenum): - """Check if current line is inside constructor initializer list. + """Check if current line is inside constructor initializer list. Args: clean_lines: A CleansedLines instance containing the file. @@ -5054,41 +5087,41 @@ def IsInitializerList(clean_lines, linenum): True if current line appears to be inside constructor initializer list, False otherwise. """ - for i in xrange(linenum, 1, -1): - line = clean_lines.elided[i] - if i == linenum: - remove_function_body = Match(r'^(.*)\{\s*$', line) - if remove_function_body: - line = remove_function_body.group(1) - - if Search(r'\s:\s*\w+[({]', line): - # A lone colon tend to indicate the start of a constructor - # initializer list. It could also be a ternary operator, which - # also tend to appear in constructor initializer lists as - # opposed to parameter lists. - return True - if Search(r'\}\s*,\s*$', line): - # A closing brace followed by a comma is probably the end of a - # brace-initialized member in constructor initializer list. 
- return True - if Search(r'[{};]\s*$', line): - # Found one of the following: - # - A closing brace or semicolon, probably the end of the previous - # function. - # - An opening brace, probably the start of current class or namespace. - # - # Current line is probably not inside an initializer list since - # we saw one of those things without seeing the starting colon. - return False - - # Got to the beginning of the file without seeing the start of - # constructor initializer list. - return False - - -def CheckForNonConstReference(filename, clean_lines, linenum, - nesting_state, error): - """Check for non-const references. + for i in xrange(linenum, 1, -1): + line = clean_lines.elided[i] + if i == linenum: + remove_function_body = Match(r'^(.*)\{\s*$', line) + if remove_function_body: + line = remove_function_body.group(1) + + if Search(r'\s:\s*\w+[({]', line): + # A lone colon tend to indicate the start of a constructor + # initializer list. It could also be a ternary operator, which + # also tend to appear in constructor initializer lists as + # opposed to parameter lists. + return True + if Search(r'\}\s*,\s*$', line): + # A closing brace followed by a comma is probably the end of a + # brace-initialized member in constructor initializer list. + return True + if Search(r'[{};]\s*$', line): + # Found one of the following: + # - A closing brace or semicolon, probably the end of the previous + # function. + # - An opening brace, probably the start of current class or namespace. + # + # Current line is probably not inside an initializer list since + # we saw one of those things without seeing the starting colon. + return False + + # Got to the beginning of the file without seeing the start of + # constructor initializer list. + return False + + +def CheckForNonConstReference(filename, clean_lines, linenum, nesting_state, + error): + """Check for non-const references. Separate from CheckLanguage since it scans backwards from current line, instead of scanning forward. @@ -5101,131 +5134,131 @@ def CheckForNonConstReference(filename, clean_lines, linenum, the current stack of nested blocks being parsed. error: The function to call with any errors found. """ - # Do nothing if there is no '&' on current line. - line = clean_lines.elided[linenum] - if '&' not in line: - return - - # If a function is inherited, current function doesn't have much of - # a choice, so any non-const references should not be blamed on - # derived function. - if IsDerivedFunction(clean_lines, linenum): - return - - # Don't warn on out-of-line method definitions, as we would warn on the - # in-line declaration, if it isn't marked with 'override'. - if IsOutOfLineMethodDefinition(clean_lines, linenum): - return - - # Long type names may be broken across multiple lines, usually in one - # of these forms: - # LongType - # ::LongTypeContinued &identifier - # LongType:: - # LongTypeContinued &identifier - # LongType< - # ...>::LongTypeContinued &identifier - # - # If we detected a type split across two lines, join the previous - # line to current line so that we can match const references - # accordingly. - # - # Note that this only scans back one line, since scanning back - # arbitrary number of lines would be expensive. If you have a type - # that spans more than 2 lines, please use a typedef. 
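
The lone-colon heuristic that IsInitializerList relies on above can be probed directly (both input lines are hypothetical):

import re

# A colon followed by 'name(' or 'name{' suggests an initializer list.
print(bool(re.search(r'\s:\s*\w+[({]', '    : member_(value) {')))  # True
# A ternary without a call-like token after ':' is not matched.
print(bool(re.search(r'\s:\s*\w+[({]', 'int x = cond ? a : b;')))   # False
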
- if linenum > 1: - previous = None - if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line): - # previous_line\n + ::current_line - previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$', - clean_lines.elided[linenum - 1]) - elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line): - # previous_line::\n + current_line - previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$', - clean_lines.elided[linenum - 1]) - if previous: - line = previous.group(1) + line.lstrip() - else: - # Check for templated parameter that is split across multiple lines - endpos = line.rfind('>') - if endpos > -1: - (_, startline, startpos) = ReverseCloseExpression( - clean_lines, linenum, endpos) - if startpos > -1 and startline < linenum: - # Found the matching < on an earlier line, collect all - # pieces up to current line. - line = '' - for i in xrange(startline, linenum + 1): - line += clean_lines.elided[i].strip() - - # Check for non-const references in function parameters. A single '&' may - # found in the following places: - # inside expression: binary & for bitwise AND - # inside expression: unary & for taking the address of something - # inside declarators: reference parameter - # We will exclude the first two cases by checking that we are not inside a - # function body, including one that was just introduced by a trailing '{'. - # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. - if (nesting_state.previous_stack_top and - not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or - isinstance(nesting_state.previous_stack_top, _NamespaceInfo))): - # Not at toplevel, not within a class, and not within a namespace - return - - # Avoid initializer lists. We only need to scan back from the - # current line for something that starts with ':'. - # - # We don't need to check the current line, since the '&' would - # appear inside the second set of parentheses on the current line as - # opposed to the first set. - if linenum > 0: - for i in xrange(linenum - 1, max(0, linenum - 10), -1): - previous_line = clean_lines.elided[i] - if not Search(r'[),]\s*$', previous_line): - break - if Match(r'^\s*:\s+\S', previous_line): + # Do nothing if there is no '&' on current line. + line = clean_lines.elided[linenum] + if '&' not in line: + return + + # If a function is inherited, current function doesn't have much of + # a choice, so any non-const references should not be blamed on + # derived function. + if IsDerivedFunction(clean_lines, linenum): + return + + # Don't warn on out-of-line method definitions, as we would warn on the + # in-line declaration, if it isn't marked with 'override'. + if IsOutOfLineMethodDefinition(clean_lines, linenum): + return + + # Long type names may be broken across multiple lines, usually in one + # of these forms: + # LongType + # ::LongTypeContinued &identifier + # LongType:: + # LongTypeContinued &identifier + # LongType< + # ...>::LongTypeContinued &identifier + # + # If we detected a type split across two lines, join the previous + # line to current line so that we can match const references + # accordingly. + # + # Note that this only scans back one line, since scanning back + # arbitrary number of lines would be expensive. If you have a type + # that spans more than 2 lines, please use a typedef. 
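
The two-line type join described in the comment above comes down to the pair of regexes in the code that follows; on a hypothetical declaration split after '::' they recombine the type like this:

import re

prev = 'const LongTypeName::'          # hypothetical previous line
cur = '    MemberType &param) {'       # hypothetical current line
m = re.search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$', prev)
if m and re.match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', cur):
    # Joined view that the const-reference check then operates on.
    print(m.group(1) + cur.lstrip())   # const LongTypeName::MemberType &param) {
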
+ if linenum > 1: + previous = None + if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line): + # previous_line\n + ::current_line + previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$', + clean_lines.elided[linenum - 1]) + elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line): + # previous_line::\n + current_line + previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$', + clean_lines.elided[linenum - 1]) + if previous: + line = previous.group(1) + line.lstrip() + else: + # Check for templated parameter that is split across multiple lines + endpos = line.rfind('>') + if endpos > -1: + (_, startline, startpos) = ReverseCloseExpression( + clean_lines, linenum, endpos) + if startpos > -1 and startline < linenum: + # Found the matching < on an earlier line, collect all + # pieces up to current line. + line = '' + for i in xrange(startline, linenum + 1): + line += clean_lines.elided[i].strip() + + # Check for non-const references in function parameters. A single '&' may + # found in the following places: + # inside expression: binary & for bitwise AND + # inside expression: unary & for taking the address of something + # inside declarators: reference parameter + # We will exclude the first two cases by checking that we are not inside a + # function body, including one that was just introduced by a trailing '{'. + # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. + if (nesting_state.previous_stack_top and + not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or + isinstance(nesting_state.previous_stack_top, _NamespaceInfo))): + # Not at toplevel, not within a class, and not within a namespace + return + + # Avoid initializer lists. We only need to scan back from the + # current line for something that starts with ':'. + # + # We don't need to check the current line, since the '&' would + # appear inside the second set of parentheses on the current line as + # opposed to the first set. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 10), -1): + previous_line = clean_lines.elided[i] + if not Search(r'[),]\s*$', previous_line): + break + if Match(r'^\s*:\s+\S', previous_line): + return + + # Avoid preprocessors + if Search(r'\\\s*$', line): return - # Avoid preprocessors - if Search(r'\\\s*$', line): - return - - # Avoid constructor initializer lists - if IsInitializerList(clean_lines, linenum): - return - - # We allow non-const references in a few standard places, like functions - # called "swap()" or iostream operators like "<<" or ">>". Do not check - # those function parameters. - # - # We also accept & in static_assert, which looks like a function but - # it's actually a declaration expression. - whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' - r'operator\s*[<>][<>]|' - r'static_assert|COMPILE_ASSERT' - r')\s*\(') - if Search(whitelisted_functions, line): - return - elif not Search(r'\S+\([^)]*$', line): - # Don't see a whitelisted function on this line. Actually we - # didn't see any function name on this line, so this is likely a - # multi-line parameter list. Try a bit harder to catch this case. 
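
To see what the swap/operator/static_assert whitelist above admits, the pattern can be probed with made-up declarations:

import re

whitelisted = (r'(?:[sS]wap(?:<\w:+>)?|'
               r'operator\s*[<>][<>]|'
               r'static_assert|COMPILE_ASSERT'
               r')\s*\(')
print(bool(re.search(whitelisted, 'void swap(Foo& a, Foo& b);')))  # True
print(bool(re.search(whitelisted, 'void Mutate(Foo& a);')))        # False
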
- for i in xrange(2): - if (linenum > i and - Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])): + # Avoid constructor initializer lists + if IsInitializerList(clean_lines, linenum): return - decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body - for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): - if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): - error(filename, linenum, 'runtime/references', 2, - 'Is this a non-const reference? ' - 'If so, make const or use a pointer: ' + - ReplaceAll(' *<', '<', parameter)) + # We allow non-const references in a few standard places, like functions + # called "swap()" or iostream operators like "<<" or ">>". Do not check + # those function parameters. + # + # We also accept & in static_assert, which looks like a function but + # it's actually a declaration expression. + whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' + r'operator\s*[<>][<>]|' + r'static_assert|COMPILE_ASSERT' + r')\s*\(') + if Search(whitelisted_functions, line): + return + elif not Search(r'\S+\([^)]*$', line): + # Don't see a whitelisted function on this line. Actually we + # didn't see any function name on this line, so this is likely a + # multi-line parameter list. Try a bit harder to catch this case. + for i in xrange(2): + if (linenum > i and Search(whitelisted_functions, + clean_lines.elided[linenum - i - 1])): + return + + decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body + for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): + if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): + error(filename, linenum, 'runtime/references', 2, + 'Is this a non-const reference? ' + 'If so, make const or use a pointer: ' + ReplaceAll( + ' *<', '<', parameter)) def CheckCasts(filename, clean_lines, linenum, error): - """Various cast related checks. + """Various cast related checks. Args: filename: The name of the current file. @@ -5233,118 +5266,116 @@ def CheckCasts(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Check to see if they're using an conversion function cast. - # I just try to capture the most common basic types, though there are more. - # Parameterless conversion functions, such as bool(), are allowed as they are - # probably a member operator declaration or default constructor. - match = Search( - r'(\bnew\s+|\S<\s*(?:const\s+)?)?\b' - r'(int|float|double|bool|char|int32|uint32|int64|uint64)' - r'(\([^)].*)', line) - expecting_function = ExpectingFunctionArgs(clean_lines, linenum) - if match and not expecting_function: - matched_type = match.group(2) - - # matched_new_or_template is used to silence two false positives: - # - New operators - # - Template arguments with function types + line = clean_lines.elided[linenum] + + # Check to see if they're using an conversion function cast. + # I just try to capture the most common basic types, though there are more. + # Parameterless conversion functions, such as bool(), are allowed as they are + # probably a member operator declaration or default constructor. 
+ match = Search(r'(\bnew\s+|\S<\s*(?:const\s+)?)?\b' + r'(int|float|double|bool|char|int32|uint32|int64|uint64)' + r'(\([^)].*)', line) + expecting_function = ExpectingFunctionArgs(clean_lines, linenum) + if match and not expecting_function: + matched_type = match.group(2) + + # matched_new_or_template is used to silence two false positives: + # - New operators + # - Template arguments with function types + # + # For template arguments, we match on types immediately following + # an opening bracket without any spaces. This is a fast way to + # silence the common case where the function type is the first + # template argument. False negative with less-than comparison is + # avoided because those operators are usually followed by a space. + # + # function // bracket + no space = false positive + # value < double(42) // bracket + space = true positive + matched_new_or_template = match.group(1) + + # Avoid arrays by looking for brackets that come after the closing + # parenthesis. + if Match(r'\([^()]+\)\s*\[', match.group(3)): + return + + # Other things to ignore: + # - Function pointers + # - Casts to pointer types + # - Placement new + # - Alias declarations + matched_funcptr = match.group(3) + if (matched_new_or_template is None and not (matched_funcptr and (Match( + r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', + matched_funcptr) or matched_funcptr.startswith('(*)'))) and + not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and + not Search(r'new\(\S+\)\s*' + matched_type, line)): + error(filename, linenum, 'readability/casting', 4, + 'Using deprecated casting style. ' + 'Use static_cast<%s>(...) instead' % matched_type) + + if not expecting_function: + CheckCStyleCast(filename, clean_lines, linenum, 'static_cast', + r'\((int|float|double|bool|char|u?int(16|32|64))\)', + error) + + # This doesn't catch all cases. Consider (const char * const)"hello". # - # For template arguments, we match on types immediately following - # an opening bracket without any spaces. This is a fast way to - # silence the common case where the function type is the first - # template argument. False negative with less-than comparison is - # avoided because those operators are usually followed by a space. + # (char *) "foo" should always be a const_cast (reinterpret_cast won't + # compile). + if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast', + r'\((char\s?\*+\s?)\)\s*"', error): + pass + else: + # Check pointer casts for other than string constants + CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast', + r'\((\w+\s?\*+\s?)\)', error) + + # In addition, we look for people taking the address of a cast. This + # is dangerous -- casts can assign to temporaries, so the pointer doesn't + # point where you think. # - # function // bracket + no space = false positive - # value < double(42) // bracket + space = true positive - matched_new_or_template = match.group(1) - - # Avoid arrays by looking for brackets that come after the closing - # parenthesis. 
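
The conversion-cast pattern above can be exercised on a hypothetical line; note the real check additionally excludes function pointers, placement new, and alias declarations before reporting:

import re

cast_re = (r'(\bnew\s+|\S<\s*(?:const\s+)?)?\b'
           r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
           r'(\([^)].*)')
m = re.search(cast_re, 'int x = int(3.5);')   # hypothetical line
if m and m.group(1) is None:                  # not 'new' or a template argument
    print('Use static_cast<%s>(...) instead' % m.group(2))
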
- if Match(r'\([^()]+\)\s*\[', match.group(3)): - return - - # Other things to ignore: - # - Function pointers - # - Casts to pointer types - # - Placement new - # - Alias declarations - matched_funcptr = match.group(3) - if (matched_new_or_template is None and - not (matched_funcptr and - (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', - matched_funcptr) or - matched_funcptr.startswith('(*)'))) and - not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and - not Search(r'new\(\S+\)\s*' + matched_type, line)): - error(filename, linenum, 'readability/casting', 4, - 'Using deprecated casting style. ' - 'Use static_cast<%s>(...) instead' % - matched_type) - - if not expecting_function: - CheckCStyleCast(filename, clean_lines, linenum, 'static_cast', - r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) - - # This doesn't catch all cases. Consider (const char * const)"hello". - # - # (char *) "foo" should always be a const_cast (reinterpret_cast won't - # compile). - if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast', - r'\((char\s?\*+\s?)\)\s*"', error): - pass - else: - # Check pointer casts for other than string constants - CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast', - r'\((\w+\s?\*+\s?)\)', error) - - # In addition, we look for people taking the address of a cast. This - # is dangerous -- casts can assign to temporaries, so the pointer doesn't - # point where you think. - # - # Some non-identifier character is required before the '&' for the - # expression to be recognized as a cast. These are casts: - # expression = &static_cast(temporary()); - # function(&(int*)(temporary())); - # - # This is not a cast: - # reference_type&(int* function_param); - match = Search( - r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|' - r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line) - if match: - # Try a better error message when the & is bound to something - # dereferenced by the casted pointer, as opposed to the casted - # pointer itself. - parenthesis_error = False - match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line) + # Some non-identifier character is required before the '&' for the + # expression to be recognized as a cast. These are casts: + # expression = &static_cast(temporary()); + # function(&(int*)(temporary())); + # + # This is not a cast: + # reference_type&(int* function_param); + match = Search(r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|' + r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line) if match: - _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1))) - if x1 >= 0 and clean_lines.elided[y1][x1] == '(': - _, y2, x2 = CloseExpression(clean_lines, y1, x1) - if x2 >= 0: - extended_line = clean_lines.elided[y2][x2:] - if y2 < clean_lines.NumLines() - 1: - extended_line += clean_lines.elided[y2 + 1] - if Match(r'\s*(?:->|\[)', extended_line): - parenthesis_error = True - - if parenthesis_error: - error(filename, linenum, 'readability/casting', 4, - ('Are you taking an address of something dereferenced ' - 'from a cast? Wrapping the dereferenced expression in ' - 'parentheses will make the binding more obvious')) - else: - error(filename, linenum, 'runtime/casting', 4, - ('Are you taking an address of a cast? ' - 'This is dangerous: could be a temp var. ' - 'Take the address before doing the cast, rather than after')) + # Try a better error message when the & is bound to something + # dereferenced by the casted pointer, as opposed to the casted + # pointer itself. 
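# --------------------------------------------------------------------------
# [Editor's aside -- sketch, not part of the patch] The address-of-cast
# pattern above, tried standalone with plain `re`; the expressions are
# invented.
import re
_ADDR = (r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|'
         r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)')
assert re.search(_ADDR, 'x = &static_cast<int&>(GetTemp());')  # runtime/casting
assert re.search(_ADDR, 'function(&(int*)(temporary()));')     # also flagged
assert not re.search(_ADDR, 'int* p = &value;')                # plain address-of
# --------------------------------------------------------------------------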
+ parenthesis_error = False + match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', + line) + if match: + _, y1, x1 = CloseExpression(clean_lines, linenum, + len(match.group(1))) + if x1 >= 0 and clean_lines.elided[y1][x1] == '(': + _, y2, x2 = CloseExpression(clean_lines, y1, x1) + if x2 >= 0: + extended_line = clean_lines.elided[y2][x2:] + if y2 < clean_lines.NumLines() - 1: + extended_line += clean_lines.elided[y2 + 1] + if Match(r'\s*(?:->|\[)', extended_line): + parenthesis_error = True + + if parenthesis_error: + error(filename, linenum, 'readability/casting', 4, + ('Are you taking an address of something dereferenced ' + 'from a cast? Wrapping the dereferenced expression in ' + 'parentheses will make the binding more obvious')) + else: + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error): - """Checks for a C-style cast by looking for the pattern. + """Checks for a C-style cast by looking for the pattern. Args: filename: The name of the current file. @@ -5359,96 +5390,96 @@ def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error): True if an error was emitted. False otherwise. """ - line = clean_lines.elided[linenum] - match = Search(pattern, line) - if not match: - return False + line = clean_lines.elided[linenum] + match = Search(pattern, line) + if not match: + return False - # Exclude lines with keywords that tend to look like casts - context = line[0:match.start(1) - 1] - if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context): - return False + # Exclude lines with keywords that tend to look like casts + context = line[0:match.start(1) - 1] + if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context): + return False - # Try expanding current context to see if we one level of - # parentheses inside a macro. - if linenum > 0: - for i in xrange(linenum - 1, max(0, linenum - 5), -1): - context = clean_lines.elided[i] + context - if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context): - return False + # Try expanding current context to see if we one level of + # parentheses inside a macro. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 5), -1): + context = clean_lines.elided[i] + context + if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context): + return False - # operator++(int) and operator--(int) - if context.endswith(' operator++') or context.endswith(' operator--'): - return False + # operator++(int) and operator--(int) + if context.endswith(' operator++') or context.endswith(' operator--'): + return False - # A single unnamed argument for a function tends to look like old - # style cast. If we see those, don't issue warnings for deprecated - # casts, instead issue warnings for unnamed arguments where - # appropriate. - # - # These are things that we want warnings for, since the style guide - # explicitly require all parameters to be named: - # Function(int); - # Function(int) { - # ConstMember(int) const; - # ConstMember(int) const { - # ExceptionMember(int) throw (...); - # ExceptionMember(int) throw (...) 
{ - # PureVirtual(int) = 0; - # [](int) -> bool { - # - # These are functions of some sort, where the compiler would be fine - # if they had named parameters, but people often omit those - # identifiers to reduce clutter: - # (FunctionPointer)(int); - # (FunctionPointer)(int) = value; - # Function((function_pointer_arg)(int)) - # Function((function_pointer_arg)(int), int param) - # ; - # <(FunctionPointerTemplateArgument)(int)>; - remainder = line[match.end(0):] - if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)', - remainder): - # Looks like an unnamed parameter. - - # Don't warn on any kind of template arguments. - if Match(r'^\s*>', remainder): - return False - - # Don't warn on assignments to function pointers, but keep warnings for - # unnamed parameters to pure virtual functions. Note that this pattern - # will also pass on assignments of "0" to function pointers, but the - # preferred values for those would be "nullptr" or "NULL". - matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) - if matched_zero and matched_zero.group(1) != '0': - return False - - # Don't warn on function pointer declarations. For this we need - # to check what came before the "(type)" string. - if Match(r'.*\)\s*$', line[0:match.start(0)]): - return False - - # Don't warn if the parameter is named with block comments, e.g.: - # Function(int /*unused_param*/); - raw_line = clean_lines.raw_lines[linenum] - if '/*' in raw_line: - return False - - # Passed all filters, issue warning here. - error(filename, linenum, 'readability/function', 3, - 'All parameters should be named in a function') - return True + # A single unnamed argument for a function tends to look like old + # style cast. If we see those, don't issue warnings for deprecated + # casts, instead issue warnings for unnamed arguments where + # appropriate. + # + # These are things that we want warnings for, since the style guide + # explicitly require all parameters to be named: + # Function(int); + # Function(int) { + # ConstMember(int) const; + # ConstMember(int) const { + # ExceptionMember(int) throw (...); + # ExceptionMember(int) throw (...) { + # PureVirtual(int) = 0; + # [](int) -> bool { + # + # These are functions of some sort, where the compiler would be fine + # if they had named parameters, but people often omit those + # identifiers to reduce clutter: + # (FunctionPointer)(int); + # (FunctionPointer)(int) = value; + # Function((function_pointer_arg)(int)) + # Function((function_pointer_arg)(int), int param) + # ; + # <(FunctionPointerTemplateArgument)(int)>; + remainder = line[match.end(0):] + if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)', + remainder): + # Looks like an unnamed parameter. + + # Don't warn on any kind of template arguments. + if Match(r'^\s*>', remainder): + return False - # At this point, all that should be left is actual casts. - error(filename, linenum, 'readability/casting', 4, - 'Using C-style cast. Use %s<%s>(...) instead' % - (cast_type, match.group(1))) + # Don't warn on assignments to function pointers, but keep warnings for + # unnamed parameters to pure virtual functions. Note that this pattern + # will also pass on assignments of "0" to function pointers, but the + # preferred values for those would be "nullptr" or "NULL". + matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) + if matched_zero and matched_zero.group(1) != '0': + return False - return True + # Don't warn on function pointer declarations. For this we need + # to check what came before the "(type)" string. 
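# --------------------------------------------------------------------------
# [Editor's aside -- sketch, not part of the patch] What `remainder` looks
# like for the unnamed-parameter cases listed above; `remainder` is the text
# following the matched '(type)'.  Plain `re`, invented snippets.
import re
_REMAINDER = r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)'
assert re.match(_REMAINDER, ';')        # Function(int);
assert re.match(_REMAINDER, ' const;')  # ConstMember(int) const;
assert re.match(_REMAINDER, ' = 0;')    # PureVirtual(int) = 0;
assert not re.match(_REMAINDER, ' x')   # '(int) x' is treated as a cast instead
# --------------------------------------------------------------------------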
+        if Match(r'.*\)\s*$', line[0:match.start(0)]):
+            return
+
+        # Don't warn if the parameter is named with block comments, e.g.:
+        #  Function(int /*unused_param*/);
+        raw_line = clean_lines.raw_lines[linenum]
+        if '/*' in raw_line:
+            return
+
+        # Passed all filters, issue warning here.
+        error(filename, linenum, 'readability/function', 3,
+              'All parameters should be named in a function')
+        return True
+
+    # At this point, all that should be left is actual casts.
+    error(filename, linenum, 'readability/casting', 4,
+          'Using C-style cast.  Use %s<%s>(...) instead' %
+          (cast_type, match.group(1)))
+
+    return True


 def ExpectingFunctionArgs(clean_lines, linenum):
-  """Checks whether where function type arguments are expected.
+    """Checks whether function type arguments are expected.

   Args:
     clean_lines: A CleansedLines instance containing the file.
@@ -5458,78 +5489,107 @@
     True if the line at 'linenum' is inside something that expects arguments
     of function types.
   """
-  line = clean_lines.elided[linenum]
-  return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
-          (linenum >= 2 and
-           (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
-                  clean_lines.elided[linenum - 1]) or
-            Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
-                  clean_lines.elided[linenum - 2]) or
-            Search(r'\bstd::m?function\s*\<\s*$',
-                   clean_lines.elided[linenum - 1]))))
+    line = clean_lines.elided[linenum]
+    return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+            (linenum >= 2 and
+             (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                    clean_lines.elided[linenum - 1]) or
+              Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                    clean_lines.elided[linenum - 2]) or
+              Search(r'\bstd::m?function\s*\<\s*$',
                     clean_lines.elided[linenum - 1]))))


 _HEADERS_CONTAINING_TEMPLATES = (
-    ('<deque>', ('deque',)),
-    ('<functional>', ('unary_function', 'binary_function',
-                      'plus', 'minus', 'multiplies', 'divides', 'modulus',
-                      'negate',
-                      'equal_to', 'not_equal_to', 'greater', 'less',
-                      'greater_equal', 'less_equal',
-                      'logical_and', 'logical_or', 'logical_not',
-                      'unary_negate', 'not1', 'binary_negate', 'not2',
-                      'bind1st', 'bind2nd',
-                      'pointer_to_unary_function',
-                      'pointer_to_binary_function',
-                      'ptr_fun',
-                      'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
-                      'mem_fun_ref_t',
-                      'const_mem_fun_t', 'const_mem_fun1_t',
-                      'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
-                      'mem_fun_ref',
-                     )),
-    ('<limits>', ('numeric_limits',)),
-    ('<list>', ('list',)),
-    ('<map>', ('map', 'multimap',)),
-    ('<memory>', ('allocator',)),
-    ('<queue>', ('queue', 'priority_queue',)),
-    ('<set>', ('set', 'multiset',)),
-    ('<stack>', ('stack',)),
-    ('<string>', ('char_traits', 'basic_string',)),
-    ('<tuple>', ('tuple',)),
-    ('<utility>', ('pair',)),
-    ('<vector>', ('vector',)),
+    ('<deque>', ('deque', )),
+    ('<functional>', (
+        'unary_function',
+        'binary_function',
+        'plus',
+        'minus',
+        'multiplies',
+        'divides',
+        'modulus',
+        'negate',
+        'equal_to',
+        'not_equal_to',
+        'greater',
+        'less',
+        'greater_equal',
+        'less_equal',
+        'logical_and',
+        'logical_or',
+        'logical_not',
+        'unary_negate',
+        'not1',
+        'binary_negate',
+        'not2',
+        'bind1st',
+        'bind2nd',
+        'pointer_to_unary_function',
+        'pointer_to_binary_function',
+        'ptr_fun',
+        'mem_fun_t',
+        'mem_fun',
+        'mem_fun1_t',
+        'mem_fun1_ref_t',
+        'mem_fun_ref_t',
+        'const_mem_fun_t',
+        'const_mem_fun1_t',
+        'const_mem_fun_ref_t',
+        'const_mem_fun1_ref_t',
+        'mem_fun_ref', )),
+    ('<limits>', ('numeric_limits', )),
+    ('<list>', ('list', )),
+    ('<map>', (
+        'map',
+        'multimap', )),
+    ('<memory>', ('allocator', )),
+    ('<queue>', (
+        'queue',
+        'priority_queue', )),
+    ('<set>', (
+        'set',
+        'multiset', )),
+    ('<stack>', ('stack', )),
+    ('<string>', (
+        'char_traits',
+        'basic_string', )),
+    ('<tuple>', ('tuple', )),
+    ('<utility>', ('pair', )),
+    ('<vector>', ('vector', )),

     # gcc extensions.
     # Note: std::hash is their hash, ::hash is our hash
-    ('<hash_map>', ('hash_map', 'hash_multimap',)),
-    ('<hash_set>', ('hash_set', 'hash_multiset',)),
-    ('<slist>', ('slist',)),
-    )
+    ('<hash_map>', (
+        'hash_map',
+        'hash_multimap', )),
+    ('<hash_set>', (
+        'hash_set',
+        'hash_multiset', )),
+    ('<slist>', ('slist', )), )

 _RE_PATTERN_STRING = re.compile(r'\bstring\b')

 _re_pattern_algorithm_header = []
 for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
                   'transform'):
-  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
-  # type::max().
-  _re_pattern_algorithm_header.append(
-      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
-       _template,
-       '<algorithm>'))
+    # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+    # type::max().
+    _re_pattern_algorithm_header.append(
+        (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), _template,
+         '<algorithm>'))

 _re_pattern_templates = []
 for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
-  for _template in _templates:
-    _re_pattern_templates.append(
-        (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
-         _template + '<>',
-         _header))
+    for _template in _templates:
+        _re_pattern_templates.append(
+            (re.compile(r'(\<|\b)' + _template + r'\s*\<'), _template + '<>',
+             _header))


 def FilesBelongToSameModule(filename_cc, filename_h):
-  """Check if these two filenames belong to the same module.
+    """Check if these two filenames belong to the same module.

   The concept of a 'module' here is as follows:
   foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the
@@ -5558,33 +5618,33 @@
     string: the additional prefix needed to open the header file.
""" - if not filename_cc.endswith('.cc'): - return (False, '') - filename_cc = filename_cc[:-len('.cc')] - if filename_cc.endswith('_unittest'): - filename_cc = filename_cc[:-len('_unittest')] - elif filename_cc.endswith('_test'): - filename_cc = filename_cc[:-len('_test')] - filename_cc = filename_cc.replace('/public/', '/') - filename_cc = filename_cc.replace('/internal/', '/') - - if not filename_h.endswith('.h'): - return (False, '') - filename_h = filename_h[:-len('.h')] - if filename_h.endswith('-inl'): - filename_h = filename_h[:-len('-inl')] - filename_h = filename_h.replace('/public/', '/') - filename_h = filename_h.replace('/internal/', '/') - - files_belong_to_same_module = filename_cc.endswith(filename_h) - common_path = '' - if files_belong_to_same_module: - common_path = filename_cc[:-len(filename_h)] - return files_belong_to_same_module, common_path + if not filename_cc.endswith('.cc'): + return (False, '') + filename_cc = filename_cc[:-len('.cc')] + if filename_cc.endswith('_unittest'): + filename_cc = filename_cc[:-len('_unittest')] + elif filename_cc.endswith('_test'): + filename_cc = filename_cc[:-len('_test')] + filename_cc = filename_cc.replace('/public/', '/') + filename_cc = filename_cc.replace('/internal/', '/') + + if not filename_h.endswith('.h'): + return (False, '') + filename_h = filename_h[:-len('.h')] + if filename_h.endswith('-inl'): + filename_h = filename_h[:-len('-inl')] + filename_h = filename_h.replace('/public/', '/') + filename_h = filename_h.replace('/internal/', '/') + + files_belong_to_same_module = filename_cc.endswith(filename_h) + common_path = '' + if files_belong_to_same_module: + common_path = filename_cc[:-len(filename_h)] + return files_belong_to_same_module, common_path def UpdateIncludeState(filename, include_dict, io=codecs): - """Fill up the include_dict with new includes found from the file. + """Fill up the include_dict with new includes found from the file. Args: filename: the name of the header to read. @@ -5594,25 +5654,28 @@ def UpdateIncludeState(filename, include_dict, io=codecs): Returns: True if a header was successfully added. False otherwise. """ - headerfile = None - try: - headerfile = io.open(filename, 'r', 'utf8', 'replace') - except IOError: - return False - linenum = 0 - for line in headerfile: - linenum += 1 - clean_line = CleanseComments(line) - match = _RE_PATTERN_INCLUDE.search(clean_line) - if match: - include = match.group(2) - include_dict.setdefault(include, linenum) - return True + headerfile = None + try: + headerfile = io.open(filename, 'r', 'utf8', 'replace') + except IOError: + return False + linenum = 0 + for line in headerfile: + linenum += 1 + clean_line = CleanseComments(line) + match = _RE_PATTERN_INCLUDE.search(clean_line) + if match: + include = match.group(2) + include_dict.setdefault(include, linenum) + return True -def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, +def CheckForIncludeWhatYouUse(filename, + clean_lines, + include_state, + error, io=codecs): - """Reports for missing stl includes. + """Reports for missing stl includes. This function will output warnings to make sure you are including the headers necessary for the stl containers and functions that you use. We only give one @@ -5628,87 +5691,88 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, io: The IO factory to use to read the header file. Provided for unittest injection. """ - required = {} # A map of header name to linenumber and the template entity. 
-  #   Example of required: { '<functional>': (1219, 'less<>') }
+    required = {}  # A map of header name to linenumber and the template entity.
+    # Example of required: { '<functional>': (1219, 'less<>') }

-  for linenum in xrange(clean_lines.NumLines()):
-    line = clean_lines.elided[linenum]
-    if not line or line[0] == '#':
-      continue
+    for linenum in xrange(clean_lines.NumLines()):
+        line = clean_lines.elided[linenum]
+        if not line or line[0] == '#':
+            continue

-    # String is special -- it is a non-templatized type in STL.
-    matched = _RE_PATTERN_STRING.search(line)
-    if matched:
-      # Don't warn about strings in non-STL namespaces:
-      # (We check only the first match per line; good enough.)
-      prefix = line[:matched.start()]
-      if prefix.endswith('std::') or not prefix.endswith('::'):
-        required['<string>'] = (linenum, 'string')
-
-    for pattern, template, header in _re_pattern_algorithm_header:
-      if pattern.search(line):
-        required[header] = (linenum, template)
-
-    # The following function is just a speed up, no semantics are changed.
-    if not '<' in line:  # Reduces the cpu time usage by skipping lines.
-      continue
-
-    for pattern, template, header in _re_pattern_templates:
-      if pattern.search(line):
-        required[header] = (linenum, template)
-
-  # The policy is that if you #include something in foo.h you don't need to
-  # include it again in foo.cc. Here, we will look at possible includes.
-  # Let's flatten the include_state include_list and copy it into a dictionary.
-  include_dict = dict([item for sublist in include_state.include_list
-                       for item in sublist])
-
-  # Did we find the header for this file (if any) and successfully load it?
-  header_found = False
-
-  # Use the absolute path so that matching works properly.
-  abs_filename = FileInfo(filename).FullName()
-
-  # For Emacs's flymake.
-  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
-  # by flymake and that file name might end with '_flymake.cc'. In that case,
-  # restore original file name here so that the corresponding header file can be
-  # found.
-  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
-  # instead of 'foo_flymake.h'
-  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
-
-  # include_dict is modified during iteration, so we iterate over a copy of
-  # the keys.
-  header_keys = include_dict.keys()
-  for header in header_keys:
-    (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
-    fullpath = common_path + header
-    if same_module and UpdateIncludeState(fullpath, include_dict, io):
-      header_found = True
-
-  # If we can't find the header file for a .cc, assume it's because we don't
-  # know where to look. In that case we'll give up as we're not sure they
-  # didn't include it in the .h file.
-  # TODO(unknown): Do a better job of finding .h files so we are confident that
-  # not having the .h file means there isn't one.
-  if filename.endswith('.cc') and not header_found:
-    return
-
-  # All the lines have been processed, report the errors found.
-  for required_header_unstripped in required:
-    template = required[required_header_unstripped][1]
-    if required_header_unstripped.strip('<>"') not in include_dict:
-      error(filename, required[required_header_unstripped][0],
-            'build/include_what_you_use', 4,
-            'Add #include ' + required_header_unstripped + ' for ' + template)
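# --------------------------------------------------------------------------
# [Editor's aside -- sketch, not part of the patch] The 'string is special'
# branch of this scan in isolation: only std:: (or namespace-less) uses of
# 'string' make <string> a required header.  Plain `re`; lines are invented.
import re
pattern = re.compile(r'\bstring\b')
for line, needs_header in (('std::string name;', True),
                           ('string name;', True),
                           ('mine::string name;', False)):
    matched = pattern.search(line)
    prefix = line[:matched.start()]
    assert (prefix.endswith('std::') or not prefix.endswith('::')) == needs_header
# --------------------------------------------------------------------------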
+        # String is special -- it is a non-templatized type in STL.
+        matched = _RE_PATTERN_STRING.search(line)
+        if matched:
+            # Don't warn about strings in non-STL namespaces:
+            # (We check only the first match per line; good enough.)
+            prefix = line[:matched.start()]
+            if prefix.endswith('std::') or not prefix.endswith('::'):
+                required['<string>'] = (linenum, 'string')
+
+        for pattern, template, header in _re_pattern_algorithm_header:
+            if pattern.search(line):
+                required[header] = (linenum, template)
+
+        # The following function is just a speed up, no semantics are changed.
+        if not '<' in line:  # Reduces the cpu time usage by skipping lines.
+            continue
+
+        for pattern, template, header in _re_pattern_templates:
+            if pattern.search(line):
+                required[header] = (linenum, template)
+
+    # The policy is that if you #include something in foo.h you don't need to
+    # include it again in foo.cc. Here, we will look at possible includes.
+    # Let's flatten the include_state include_list and copy it into a dictionary.
+    include_dict = dict(
+        [item for sublist in include_state.include_list for item in sublist])
+
+    # Did we find the header for this file (if any) and successfully load it?
+    header_found = False
+
+    # Use the absolute path so that matching works properly.
+    abs_filename = FileInfo(filename).FullName()
+
+    # For Emacs's flymake.
+    # If cpplint is invoked from Emacs's flymake, a temporary file is generated
+    # by flymake and that file name might end with '_flymake.cc'. In that case,
+    # restore original file name here so that the corresponding header file can be
+    # found.
+    # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
+    # instead of 'foo_flymake.h'
+    abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
+
+    # include_dict is modified during iteration, so we iterate over a copy of
+    # the keys.
+    header_keys = include_dict.keys()
+    for header in header_keys:
+        (same_module, common_path) = FilesBelongToSameModule(abs_filename,
+                                                             header)
+        fullpath = common_path + header
+        if same_module and UpdateIncludeState(fullpath, include_dict, io):
+            header_found = True
+
+    # If we can't find the header file for a .cc, assume it's because we don't
+    # know where to look. In that case we'll give up as we're not sure they
+    # didn't include it in the .h file.
+    # TODO(unknown): Do a better job of finding .h files so we are confident that
+    # not having the .h file means there isn't one.
+    if filename.endswith('.cc') and not header_found:
+        return
+
+    # All the lines have been processed, report the errors found.
+    for required_header_unstripped in required:
+        template = required[required_header_unstripped][1]
+        if required_header_unstripped.strip('<>"') not in include_dict:
+            error(filename, required[required_header_unstripped][0],
+                  'build/include_what_you_use', 4, 'Add #include ' +
+                  required_header_unstripped + ' for ' + template)


 _RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')


 def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
-  """Check that make_pair's template arguments are deduced.
+    """Check that make_pair's template arguments are deduced.

   G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are
   specified explicitly, and such use isn't intended in any case.
@@ -5719,17 +5783,20 @@
     linenum: The number of the line to check.
     error: The function to call with any errors found.
""" - line = clean_lines.elided[linenum] - match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) - if match: - error(filename, linenum, 'build/explicit_make_pair', - 4, # 4 = high confidence - 'For C++11-compatibility, omit template arguments from make_pair' - ' OR use pair directly OR if appropriate, construct a pair directly') + line = clean_lines.elided[linenum] + match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) + if match: + error( + filename, + linenum, + 'build/explicit_make_pair', + 4, # 4 = high confidence + 'For C++11-compatibility, omit template arguments from make_pair' + ' OR use pair directly OR if appropriate, construct a pair directly') def CheckDefaultLambdaCaptures(filename, clean_lines, linenum, error): - """Check that default lambda captures are not used. + """Check that default lambda captures are not used. Args: filename: The name of the current file. @@ -5737,24 +5804,28 @@ def CheckDefaultLambdaCaptures(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # A lambda introducer specifies a default capture if it starts with "[=" - # or if it starts with "[&" _not_ followed by an identifier. - match = Match(r'^(.*)\[\s*(?:=|&[^\w])', line) - if match: - # Found a potential error, check what comes after the lambda-introducer. - # If it's not open parenthesis (for lambda-declarator) or open brace - # (for compound-statement), it's not a lambda. - line, _, pos = CloseExpression(clean_lines, linenum, len(match.group(1))) - if pos >= 0 and Match(r'^\s*[{(]', line[pos:]): - error(filename, linenum, 'build/c++11', - 4, # 4 = high confidence - 'Default lambda captures are an unapproved C++ feature.') + line = clean_lines.elided[linenum] + + # A lambda introducer specifies a default capture if it starts with "[=" + # or if it starts with "[&" _not_ followed by an identifier. + match = Match(r'^(.*)\[\s*(?:=|&[^\w])', line) + if match: + # Found a potential error, check what comes after the lambda-introducer. + # If it's not open parenthesis (for lambda-declarator) or open brace + # (for compound-statement), it's not a lambda. + line, _, pos = CloseExpression(clean_lines, linenum, + len(match.group(1))) + if pos >= 0 and Match(r'^\s*[{(]', line[pos:]): + error( + filename, + linenum, + 'build/c++11', + 4, # 4 = high confidence + 'Default lambda captures are an unapproved C++ feature.') def CheckRedundantVirtual(filename, clean_lines, linenum, error): - """Check if line contains a redundant "virtual" function-specifier. + """Check if line contains a redundant "virtual" function-specifier. Args: filename: The name of the current file. @@ -5762,63 +5833,64 @@ def CheckRedundantVirtual(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - # Look for "virtual" on current line. - line = clean_lines.elided[linenum] - virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line) - if not virtual: return - - # Ignore "virtual" keywords that are near access-specifiers. These - # are only used in class base-specifier and do not apply to member - # functions. - if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or - Match(r'^\s+(public|protected|private)\b', virtual.group(3))): - return - - # Ignore the "virtual" keyword from virtual base classes. 
Usually
-  # there is a column on the same line in these cases (virtual base
-  # classes are rare in google3 because multiple inheritance is rare).
-  if Match(r'^.*[^:]:[^:].*$', line): return
-
-  # Look for the next opening parenthesis.  This is the start of the
-  # parameter list (possibly on the next line shortly after virtual).
-  # TODO(unknown): doesn't work if there are virtual functions with
-  # decltype() or other things that use parentheses, but csearch suggests
-  # that this is rare.
-  end_col = -1
-  end_line = -1
-  start_col = len(virtual.group(2))
-  for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())):
-    line = clean_lines.elided[start_line][start_col:]
-    parameter_list = Match(r'^([^(]*)\(', line)
-    if parameter_list:
-      # Match parentheses to find the end of the parameter list
-      (_, end_line, end_col) = CloseExpression(
-          clean_lines, start_line, start_col + len(parameter_list.group(1)))
-      break
-    start_col = 0
-
-  if end_col < 0:
-    return  # Couldn't find end of parameter list, give up
-
-  # Look for "override" or "final" after the parameter list
-  # (possibly on the next few lines).
-  for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())):
-    line = clean_lines.elided[i][end_col:]
-    match = Search(r'\b(override|final)\b', line)
-    if match:
-      error(filename, linenum, 'readability/inheritance', 4,
-            ('"virtual" is redundant since function is '
-             'already declared as "%s"' % match.group(1)))
+    # Look for "virtual" on current line.
+    line = clean_lines.elided[linenum]
+    virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line)
+    if not virtual: return
+
+    # Ignore "virtual" keywords that are near access-specifiers.  These
+    # are only used in class base-specifier and do not apply to member
+    # functions.
+    if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or
+            Match(r'^\s+(public|protected|private)\b', virtual.group(3))):
+        return
-  # Set end_col to check whole lines after we are done with the
-  # first line.
-  end_col = 0
-  if Search(r'[^\w]\s*$', line):
-    break
+    # Ignore the "virtual" keyword from virtual base classes.  Usually
+    # there is a colon on the same line in these cases (virtual base
+    # classes are rare in google3 because multiple inheritance is rare).
+    if Match(r'^.*[^:]:[^:].*$', line): return
+
+    # Look for the next opening parenthesis.  This is the start of the
+    # parameter list (possibly on the next line shortly after virtual).
+    # TODO(unknown): doesn't work if there are virtual functions with
+    # decltype() or other things that use parentheses, but csearch suggests
+    # that this is rare.
+    end_col = -1
+    end_line = -1
+    start_col = len(virtual.group(2))
+    for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())):
+        line = clean_lines.elided[start_line][start_col:]
+        parameter_list = Match(r'^([^(]*)\(', line)
+        if parameter_list:
+            # Match parentheses to find the end of the parameter list
+            (_, end_line, end_col) = CloseExpression(
+                clean_lines, start_line,
+                start_col + len(parameter_list.group(1)))
+            break
+        start_col = 0
+
+    if end_col < 0:
+        return  # Couldn't find end of parameter list, give up
+
+    # Look for "override" or "final" after the parameter list
+    # (possibly on the next few lines).
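# --------------------------------------------------------------------------
# [Editor's aside -- sketch, not part of the patch] A one-line reproduction
# of the redundant-'virtual' detection being reformatted here (plain `re`,
# invented declaration).
import re
line = 'virtual void Draw() override;'
virtual = re.match(r'^(.*)(\bvirtual\b)(.*)$', line)
assert virtual                                   # 'virtual' is present
after_params = line[line.find(')') + 1:]
assert re.search(r'\b(override|final)\b', after_params)
# -> readability/inheritance: "virtual" is redundant with "override"
# --------------------------------------------------------------------------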
+ for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())): + line = clean_lines.elided[i][end_col:] + match = Search(r'\b(override|final)\b', line) + if match: + error(filename, linenum, 'readability/inheritance', 4, + ('"virtual" is redundant since function is ' + 'already declared as "%s"' % match.group(1))) + + # Set end_col to check whole lines after we are done with the + # first line. + end_col = 0 + if Search(r'[^\w]\s*$', line): + break def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error): - """Check if line contains a redundant "override" or "final" virt-specifier. + """Check if line contains a redundant "override" or "final" virt-specifier. Args: filename: The name of the current file. @@ -5826,32 +5898,30 @@ def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - # Look for closing parenthesis nearby. We need one to confirm where - # the declarator ends and where the virt-specifier starts to avoid - # false positives. - line = clean_lines.elided[linenum] - declarator_end = line.rfind(')') - if declarator_end >= 0: - fragment = line[declarator_end:] - else: - if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0: - fragment = line + # Look for closing parenthesis nearby. We need one to confirm where + # the declarator ends and where the virt-specifier starts to avoid + # false positives. + line = clean_lines.elided[linenum] + declarator_end = line.rfind(')') + if declarator_end >= 0: + fragment = line[declarator_end:] else: - return - - # Check that at most one of "override" or "final" is present, not both - if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment): - error(filename, linenum, 'readability/inheritance', 4, - ('"override" is redundant since function is ' - 'already declared as "final"')) - + if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0: + fragment = line + else: + return + # Check that at most one of "override" or "final" is present, not both + if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment): + error(filename, linenum, 'readability/inheritance', 4, + ('"override" is redundant since function is ' + 'already declared as "final"')) # Returns true if we are at a new block, and it is directly # inside of a namespace. def IsBlockInNameSpace(nesting_state, is_forward_declaration): - """Checks that the new block is directly in a namespace. + """Checks that the new block is directly in a namespace. Args: nesting_state: The _NestingState object that contains info about our state. @@ -5859,21 +5929,21 @@ def IsBlockInNameSpace(nesting_state, is_forward_declaration): Returns: Whether or not the new block is directly in a namespace. 
""" - if is_forward_declaration: - if len(nesting_state.stack) >= 1 and ( - isinstance(nesting_state.stack[-1], _NamespaceInfo)): - return True - else: - return False + if is_forward_declaration: + if len(nesting_state.stack) >= 1 and ( + isinstance(nesting_state.stack[-1], _NamespaceInfo)): + return True + else: + return False - return (len(nesting_state.stack) > 1 and - nesting_state.stack[-1].check_namespace_indentation and - isinstance(nesting_state.stack[-2], _NamespaceInfo)) + return (len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.stack[-2], _NamespaceInfo)) def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, raw_lines_no_comments, linenum): - """This method determines if we should apply our namespace indentation check. + """This method determines if we should apply our namespace indentation check. Args: nesting_state: The current nesting state. @@ -5888,17 +5958,17 @@ def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, only works for classes and namespaces inside of a namespace. """ - is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments, - linenum) + is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments, + linenum) - if not (is_namespace_indent_item or is_forward_declaration): - return False + if not (is_namespace_indent_item or is_forward_declaration): + return False - # If we are in a macro, we do not want to check the namespace indentation. - if IsMacroDefinition(raw_lines_no_comments, linenum): - return False + # If we are in a macro, we do not want to check the namespace indentation. + if IsMacroDefinition(raw_lines_no_comments, linenum): + return False - return IsBlockInNameSpace(nesting_state, is_forward_declaration) + return IsBlockInNameSpace(nesting_state, is_forward_declaration) # Call this method if the line is directly inside of a namespace. @@ -5906,16 +5976,22 @@ def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, # an inner namespace, it cannot be indented. def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum, error): - line = raw_lines_no_comments[linenum] - if Match(r'^\s+', line): - error(filename, linenum, 'runtime/indentation_namespace', 4, - 'Do not indent within a namespace') - - -def ProcessLine(filename, file_extension, clean_lines, line, - include_state, function_state, nesting_state, error, + line = raw_lines_no_comments[linenum] + if Match(r'^\s+', line): + error(filename, linenum, 'runtime/indentation_namespace', 4, + 'Do not indent within a namespace') + + +def ProcessLine(filename, + file_extension, + clean_lines, + line, + include_state, + function_state, + nesting_state, + error, extra_check_functions=[]): - """Processes a single line in the file. + """Processes a single line in the file. Args: filename: Filename of the file that is being processed. @@ -5933,32 +6009,34 @@ def ProcessLine(filename, file_extension, clean_lines, line, run on each source line. 
Each function takes 4 arguments: filename, clean_lines, line, error """ - raw_lines = clean_lines.raw_lines - ParseNolintSuppressions(filename, raw_lines[line], line, error) - nesting_state.Update(filename, clean_lines, line, error) - CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, - error) - if nesting_state.InAsmBlock(): return - CheckForFunctionLengths(filename, clean_lines, line, function_state, error) - CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) - CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) - CheckLanguage(filename, clean_lines, line, file_extension, include_state, - nesting_state, error) - CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) - CheckForNonStandardConstructs(filename, clean_lines, line, - nesting_state, error) - CheckVlogArguments(filename, clean_lines, line, error) - CheckPosixThreading(filename, clean_lines, line, error) - CheckInvalidIncrement(filename, clean_lines, line, error) - CheckMakePairUsesDeduction(filename, clean_lines, line, error) - CheckDefaultLambdaCaptures(filename, clean_lines, line, error) - CheckRedundantVirtual(filename, clean_lines, line, error) - CheckRedundantOverrideOrFinal(filename, clean_lines, line, error) - for check_fn in extra_check_functions: - check_fn(filename, clean_lines, line, error) + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[line], line, error) + nesting_state.Update(filename, clean_lines, line, error) + CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error) + if nesting_state.InAsmBlock(): return + CheckForFunctionLengths(filename, clean_lines, line, function_state, error) + CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) + CheckStyle(filename, clean_lines, line, file_extension, nesting_state, + error) + CheckLanguage(filename, clean_lines, line, file_extension, include_state, + nesting_state, error) + CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) + CheckForNonStandardConstructs(filename, clean_lines, line, nesting_state, + error) + CheckVlogArguments(filename, clean_lines, line, error) + CheckPosixThreading(filename, clean_lines, line, error) + CheckInvalidIncrement(filename, clean_lines, line, error) + CheckMakePairUsesDeduction(filename, clean_lines, line, error) + CheckDefaultLambdaCaptures(filename, clean_lines, line, error) + CheckRedundantVirtual(filename, clean_lines, line, error) + CheckRedundantOverrideOrFinal(filename, clean_lines, line, error) + for check_fn in extra_check_functions: + check_fn(filename, clean_lines, line, error) + def FlagCxx11Features(filename, clean_lines, linenum, error): - """Flag those c++11 features that we only allow in certain places. + """Flag those c++11 features that we only allow in certain places. Args: filename: The name of the current file. @@ -5966,46 +6044,48 @@ def FlagCxx11Features(filename, clean_lines, linenum, error): linenum: The number of the line to check. error: The function to call with any errors found. """ - line = clean_lines.elided[linenum] - - # Flag unapproved C++11 headers. 
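# --------------------------------------------------------------------------
# [Editor's aside -- sketch, not part of the patch] The include pattern used
# just below in FlagCxx11Features, exercised standalone; the header names are
# examples only.
import re
_INC = r'\s*#\s*include\s+[<"]([^<"]+)[">]'
m = re.match(_INC, '#include <mutex>')
assert m and m.group(1) == 'mutex'      # <mutex> is on the unapproved list
m = re.match(_INC, '#include "paddle/utils/Util.h"')
assert m and m.group(1) == 'paddle/utils/Util.h'  # not a banned header
# --------------------------------------------------------------------------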
- include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) - if include and include.group(1) in ('cfenv', - 'condition_variable', - 'fenv.h', - 'future', - 'mutex', - 'thread', - 'chrono', - 'ratio', - 'regex', - 'system_error', - ): - error(filename, linenum, 'build/c++11', 5, - ('<%s> is an unapproved C++11 header.') % include.group(1)) - - # The only place where we need to worry about C++11 keywords and library - # features in preprocessor directives is in macro definitions. - if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return - - # These are classes and free functions. The classes are always - # mentioned as std::*, but we only catch the free functions if - # they're not found by ADL. They're alphabetical by header. - for top_name in ( - # type_traits - 'alignment_of', - 'aligned_union', - ): - if Search(r'\bstd::%s\b' % top_name, line): - error(filename, linenum, 'build/c++11', 5, - ('std::%s is an unapproved C++11 class or function. Send c-style ' - 'an example of where it would make your code more readable, and ' - 'they may let you use it.') % top_name) - - -def ProcessFileData(filename, file_extension, lines, error, + line = clean_lines.elided[linenum] + + # Flag unapproved C++11 headers. + include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) + if include and include.group(1) in ( + 'cfenv', + 'condition_variable', + 'fenv.h', + 'future', + 'mutex', + 'thread', + 'chrono', + 'ratio', + 'regex', + 'system_error', ): + error(filename, linenum, 'build/c++11', 5, + ('<%s> is an unapproved C++11 header.') % include.group(1)) + + # The only place where we need to worry about C++11 keywords and library + # features in preprocessor directives is in macro definitions. + if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return + + # These are classes and free functions. The classes are always + # mentioned as std::*, but we only catch the free functions if + # they're not found by ADL. They're alphabetical by header. + for top_name in ( + # type_traits + 'alignment_of', + 'aligned_union', ): + if Search(r'\bstd::%s\b' % top_name, line): + error(filename, linenum, 'build/c++11', 5, ( + 'std::%s is an unapproved C++11 class or function. Send c-style ' + 'an example of where it would make your code more readable, and ' + 'they may let you use it.') % top_name) + + +def ProcessFileData(filename, + file_extension, + lines, + error, extra_check_functions=[]): - """Performs lint checks and reports any errors to the given error function. + """Performs lint checks and reports any errors to the given error function. Args: filename: Filename of the file that is being processed. @@ -6018,44 +6098,44 @@ def ProcessFileData(filename, file_extension, lines, error, run on each source line. 
Each function takes 4 arguments: filename, clean_lines, line, error """ - lines = (['// marker so line numbers and indices both start at 1'] + lines + - ['// marker so line numbers end in a known way']) + lines = (['// marker so line numbers and indices both start at 1'] + lines + + ['// marker so line numbers end in a known way']) - include_state = _IncludeState() - function_state = _FunctionState() - nesting_state = NestingState() + include_state = _IncludeState() + function_state = _FunctionState() + nesting_state = NestingState() - ResetNolintSuppressions() + ResetNolintSuppressions() - CheckForCopyright(filename, lines, error) + CheckForCopyright(filename, lines, error) - RemoveMultiLineComments(filename, lines, error) - clean_lines = CleansedLines(lines) + RemoveMultiLineComments(filename, lines, error) + clean_lines = CleansedLines(lines) - if file_extension == 'h': - CheckForHeaderGuard(filename, clean_lines, error) + if file_extension == 'h': + CheckForHeaderGuard(filename, clean_lines, error) - for line in xrange(clean_lines.NumLines()): - ProcessLine(filename, file_extension, clean_lines, line, - include_state, function_state, nesting_state, error, - extra_check_functions) - FlagCxx11Features(filename, clean_lines, line, error) - nesting_state.CheckCompletedBlocks(filename, error) + for line in xrange(clean_lines.NumLines()): + ProcessLine(filename, file_extension, clean_lines, line, include_state, + function_state, nesting_state, error, extra_check_functions) + FlagCxx11Features(filename, clean_lines, line, error) + nesting_state.CheckCompletedBlocks(filename, error) - CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) - - # Check that the .cc file has included its header if it exists. - if file_extension == 'cc': - CheckHeaderFileIncluded(filename, include_state, error) + CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) - # We check here rather than inside ProcessLine so that we see raw - # lines rather than "cleaned" lines. - CheckForBadCharacters(filename, lines, error) + # Check that the .cc file has included its header if it exists. + if file_extension == 'cc': + CheckHeaderFileIncluded(filename, include_state, error) + + # We check here rather than inside ProcessLine so that we see raw + # lines rather than "cleaned" lines. + CheckForBadCharacters(filename, lines, error) + + CheckForNewlineAtEOF(filename, lines, error) - CheckForNewlineAtEOF(filename, lines, error) def ProcessConfigOverrides(filename): - """ Loads the configuration files and processes the config overrides. + """ Loads the configuration files and processes the config overrides. Args: filename: The name of the file being processed by the linter. @@ -6064,74 +6144,76 @@ def ProcessConfigOverrides(filename): False if the current |filename| should not be processed further. """ - abs_filename = os.path.abspath(filename) - cfg_filters = [] - keep_looking = True - while keep_looking: - abs_path, base_name = os.path.split(abs_filename) - if not base_name: - break # Reached the root directory. - - cfg_file = os.path.join(abs_path, "CPPLINT.cfg") - abs_filename = abs_path - if not os.path.isfile(cfg_file): - continue - - try: - with open(cfg_file) as file_handle: - for line in file_handle: - line, _, _ = line.partition('#') # Remove comments. 
- if not line.strip(): + abs_filename = os.path.abspath(filename) + cfg_filters = [] + keep_looking = True + while keep_looking: + abs_path, base_name = os.path.split(abs_filename) + if not base_name: + break # Reached the root directory. + + cfg_file = os.path.join(abs_path, "CPPLINT.cfg") + abs_filename = abs_path + if not os.path.isfile(cfg_file): continue - name, _, val = line.partition('=') - name = name.strip() - val = val.strip() - if name == 'set noparent': - keep_looking = False - elif name == 'filter': - cfg_filters.append(val) - elif name == 'exclude_files': - # When matching exclude_files pattern, use the base_name of - # the current file name or the directory name we are processing. - # For example, if we are checking for lint errors in /foo/bar/baz.cc - # and we found the .cfg file at /foo/CPPLINT.cfg, then the config - # file's "exclude_files" filter is meant to be checked against "bar" - # and not "baz" nor "bar/baz.cc". - if base_name: - pattern = re.compile(val) - if pattern.match(base_name): - sys.stderr.write('Ignoring "%s": file excluded by "%s". ' - 'File path component "%s" matches ' - 'pattern "%s"\n' % - (filename, cfg_file, base_name, val)) - return False - elif name == 'linelength': - global _line_length - try: - _line_length = int(val) - except ValueError: - sys.stderr.write('Line length must be numeric.') - else: + try: + with open(cfg_file) as file_handle: + for line in file_handle: + line, _, _ = line.partition('#') # Remove comments. + if not line.strip(): + continue + + name, _, val = line.partition('=') + name = name.strip() + val = val.strip() + if name == 'set noparent': + keep_looking = False + elif name == 'filter': + cfg_filters.append(val) + elif name == 'exclude_files': + # When matching exclude_files pattern, use the base_name of + # the current file name or the directory name we are processing. + # For example, if we are checking for lint errors in /foo/bar/baz.cc + # and we found the .cfg file at /foo/CPPLINT.cfg, then the config + # file's "exclude_files" filter is meant to be checked against "bar" + # and not "baz" nor "bar/baz.cc". + if base_name: + pattern = re.compile(val) + if pattern.match(base_name): + sys.stderr.write( + 'Ignoring "%s": file excluded by "%s". ' + 'File path component "%s" matches ' + 'pattern "%s"\n' % + (filename, cfg_file, base_name, val)) + return False + elif name == 'linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + sys.stderr.write('Line length must be numeric.') + else: + sys.stderr.write( + 'Invalid configuration option (%s) in file %s\n' % + (name, cfg_file)) + + except IOError: sys.stderr.write( - 'Invalid configuration option (%s) in file %s\n' % - (name, cfg_file)) - - except IOError: - sys.stderr.write( - "Skipping config file '%s': Can't open for reading\n" % cfg_file) - keep_looking = False + "Skipping config file '%s': Can't open for reading\n" % + cfg_file) + keep_looking = False - # Apply all the accumulated filters in reverse order (top-level directory - # config options having the least priority). - for filter in reversed(cfg_filters): - _AddFilters(filter) + # Apply all the accumulated filters in reverse order (top-level directory + # config options having the least priority). + for filter in reversed(cfg_filters): + _AddFilters(filter) - return True + return True def ProcessFile(filename, vlevel, extra_check_functions=[]): - """Does google-lint on a single file. + """Does google-lint on a single file. Args: filename: The name of the file to parse. 
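# --------------------------------------------------------------------------
# [Editor's aside -- sketch, not part of the patch] exclude_files semantics
# from the hunk above: the pattern is matched against a single path component
# (base_name), not the full path.  'third_party' is a hypothetical value.
import re
pattern = re.compile(r'third_party')
assert pattern.match('third_party')     # directory component -> excluded
assert not pattern.match('main.cc')     # ordinary file -> still linted
# --------------------------------------------------------------------------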
@@ -6144,104 +6226,105 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): arguments: filename, clean_lines, line, error """ - _SetVerboseLevel(vlevel) - _BackupFilters() + _SetVerboseLevel(vlevel) + _BackupFilters() - if not ProcessConfigOverrides(filename): - _RestoreFilters() - return - - lf_lines = [] - crlf_lines = [] - try: - # Support the UNIX convention of using "-" for stdin. Note that - # we are not opening the file with universal newline support - # (which codecs doesn't support anyway), so the resulting lines do - # contain trailing '\r' characters if we are reading a file that - # has CRLF endings. - # If after the split a trailing '\r' is present, it is removed - # below. - if filename == '-': - lines = codecs.StreamReaderWriter(sys.stdin, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace').read().split('\n') + if not ProcessConfigOverrides(filename): + _RestoreFilters() + return + + lf_lines = [] + crlf_lines = [] + try: + # Support the UNIX convention of using "-" for stdin. Note that + # we are not opening the file with universal newline support + # (which codecs doesn't support anyway), so the resulting lines do + # contain trailing '\r' characters if we are reading a file that + # has CRLF endings. + # If after the split a trailing '\r' is present, it is removed + # below. + if filename == '-': + lines = codecs.StreamReaderWriter(sys.stdin, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace').read().split('\n') + else: + lines = codecs.open(filename, 'r', 'utf8', + 'replace').read().split('\n') + + # Remove trailing '\r'. + # The -1 accounts for the extra trailing blank line we get from split() + for linenum in range(len(lines) - 1): + if lines[linenum].endswith('\r'): + lines[linenum] = lines[linenum].rstrip('\r') + crlf_lines.append(linenum + 1) + else: + lf_lines.append(linenum + 1) + + except IOError: + sys.stderr.write("Skipping input '%s': Can't open for reading\n" % + filename) + _RestoreFilters() + return + + # Note, if no dot is found, this will give the entire filename as the ext. + file_extension = filename[filename.rfind('.') + 1:] + + # When reading from stdin, the extension is unknown, so no cpplint tests + # should rely on the extension. + if filename != '-' and file_extension not in _valid_extensions: + sys.stderr.write('Ignoring %s; not a valid file name ' + '(%s)\n' % (filename, ', '.join(_valid_extensions))) else: - lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') - - # Remove trailing '\r'. - # The -1 accounts for the extra trailing blank line we get from split() - for linenum in range(len(lines) - 1): - if lines[linenum].endswith('\r'): - lines[linenum] = lines[linenum].rstrip('\r') - crlf_lines.append(linenum + 1) - else: - lf_lines.append(linenum + 1) - - except IOError: - sys.stderr.write( - "Skipping input '%s': Can't open for reading\n" % filename) - _RestoreFilters() - return - - # Note, if no dot is found, this will give the entire filename as the ext. - file_extension = filename[filename.rfind('.') + 1:] - - # When reading from stdin, the extension is unknown, so no cpplint tests - # should rely on the extension. - if filename != '-' and file_extension not in _valid_extensions: - sys.stderr.write('Ignoring %s; not a valid file name ' - '(%s)\n' % (filename, ', '.join(_valid_extensions))) - else: - ProcessFileData(filename, file_extension, lines, Error, - extra_check_functions) - - # If end-of-line sequences are a mix of LF and CR-LF, issue - # warnings on the lines with CR. 
- # - # Don't issue any warnings if all lines are uniformly LF or CR-LF, - # since critique can handle these just fine, and the style guide - # doesn't dictate a particular end of line sequence. - # - # We can't depend on os.linesep to determine what the desired - # end-of-line sequence should be, since that will return the - # server-side end-of-line sequence. - if lf_lines and crlf_lines: - # Warn on every line with CR. An alternative approach might be to - # check whether the file is mostly CRLF or just LF, and warn on the - # minority, we bias toward LF here since most tools prefer LF. - for linenum in crlf_lines: - Error(filename, linenum, 'whitespace/newline', 1, - 'Unexpected \\r (^M) found; better to use only \\n') + ProcessFileData(filename, file_extension, lines, Error, + extra_check_functions) - sys.stdout.write('Done processing %s\n' % filename) - _RestoreFilters() + # If end-of-line sequences are a mix of LF and CR-LF, issue + # warnings on the lines with CR. + # + # Don't issue any warnings if all lines are uniformly LF or CR-LF, + # since critique can handle these just fine, and the style guide + # doesn't dictate a particular end of line sequence. + # + # We can't depend on os.linesep to determine what the desired + # end-of-line sequence should be, since that will return the + # server-side end-of-line sequence. + if lf_lines and crlf_lines: + # Warn on every line with CR. An alternative approach might be to + # check whether the file is mostly CRLF or just LF, and warn on the + # minority, we bias toward LF here since most tools prefer LF. + for linenum in crlf_lines: + Error(filename, linenum, 'whitespace/newline', 1, + 'Unexpected \\r (^M) found; better to use only \\n') + + sys.stdout.write('Done processing %s\n' % filename) + _RestoreFilters() def PrintUsage(message): - """Prints a brief usage string and exits, optionally with an error message. + """Prints a brief usage string and exits, optionally with an error message. Args: message: The optional error message. """ - sys.stderr.write(_USAGE) - if message: - sys.exit('\nFATAL ERROR: ' + message) - else: - sys.exit(1) + sys.stderr.write(_USAGE) + if message: + sys.exit('\nFATAL ERROR: ' + message) + else: + sys.exit(1) def PrintCategories(): - """Prints a list of all the error-categories used by error messages. + """Prints a list of all the error-categories used by error messages. These are the categories used to filter messages via --filter. """ - sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES)) - sys.exit(0) + sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES)) + sys.exit(0) def ParseArguments(args): - """Parses the command line arguments. + """Parses the command line arguments. This may set the output format and verbosity level as side-effects. @@ -6251,82 +6334,82 @@ def ParseArguments(args): Returns: The list of filenames to lint. 
""" - try: - (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=', - 'counting=', - 'filter=', - 'root=', - 'linelength=', - 'extensions=']) - except getopt.GetoptError: - PrintUsage('Invalid arguments.') - - verbosity = _VerboseLevel() - output_format = _OutputFormat() - filters = '' - counting_style = '' - - for (opt, val) in opts: - if opt == '--help': - PrintUsage(None) - elif opt == '--output': - if val not in ('emacs', 'vs7', 'eclipse'): - PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.') - output_format = val - elif opt == '--verbose': - verbosity = int(val) - elif opt == '--filter': - filters = val - if not filters: - PrintCategories() - elif opt == '--counting': - if val not in ('total', 'toplevel', 'detailed'): - PrintUsage('Valid counting options are total, toplevel, and detailed') - counting_style = val - elif opt == '--root': - global _root - _root = val - elif opt == '--linelength': - global _line_length - try: - _line_length = int(val) - except ValueError: - PrintUsage('Line length must be digits.') - elif opt == '--extensions': - global _valid_extensions - try: - _valid_extensions = set(val.split(',')) - except ValueError: - PrintUsage('Extensions must be comma seperated list.') - - if not filenames: - PrintUsage('No files were specified.') - - _SetOutputFormat(output_format) - _SetVerboseLevel(verbosity) - _SetFilters(filters) - _SetCountingStyle(counting_style) - - return filenames + try: + (opts, filenames) = getopt.getopt(args, '', [ + 'help', 'output=', 'verbose=', 'counting=', 'filter=', 'root=', + 'linelength=', 'extensions=' + ]) + except getopt.GetoptError: + PrintUsage('Invalid arguments.') + + verbosity = _VerboseLevel() + output_format = _OutputFormat() + filters = '' + counting_style = '' + + for (opt, val) in opts: + if opt == '--help': + PrintUsage(None) + elif opt == '--output': + if val not in ('emacs', 'vs7', 'eclipse'): + PrintUsage( + 'The only allowed output formats are emacs, vs7 and eclipse.' + ) + output_format = val + elif opt == '--verbose': + verbosity = int(val) + elif opt == '--filter': + filters = val + if not filters: + PrintCategories() + elif opt == '--counting': + if val not in ('total', 'toplevel', 'detailed'): + PrintUsage( + 'Valid counting options are total, toplevel, and detailed') + counting_style = val + elif opt == '--root': + global _root + _root = val + elif opt == '--linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + PrintUsage('Line length must be digits.') + elif opt == '--extensions': + global _valid_extensions + try: + _valid_extensions = set(val.split(',')) + except ValueError: + PrintUsage('Extensions must be comma seperated list.') + + if not filenames: + PrintUsage('No files were specified.') + + _SetOutputFormat(output_format) + _SetVerboseLevel(verbosity) + _SetFilters(filters) + _SetCountingStyle(counting_style) + + return filenames def main(): - filenames = ParseArguments(sys.argv[1:]) + filenames = ParseArguments(sys.argv[1:]) - # Change stderr to write with replacement characters so we don't die - # if we try to print something containing non-ASCII characters. - sys.stderr = codecs.StreamReaderWriter(sys.stderr, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace') + # Change stderr to write with replacement characters so we don't die + # if we try to print something containing non-ASCII characters. 
+ sys.stderr = codecs.StreamReaderWriter(sys.stderr, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), 'replace') - _cpplint_state.ResetErrorCounts() - for filename in filenames: - ProcessFile(filename, _cpplint_state.verbose_level) - _cpplint_state.PrintErrorCounts() + _cpplint_state.ResetErrorCounts() + for filename in filenames: + ProcessFile(filename, _cpplint_state.verbose_level) + _cpplint_state.PrintErrorCounts() - sys.exit(_cpplint_state.error_count > 0) + sys.exit(_cpplint_state.error_count > 0) if __name__ == '__main__': - main() + main() diff --git a/paddle/scripts/deb/build_scripts/build.sh b/paddle/scripts/deb/build_scripts/build.sh index 662d2a9103f7da62d96650f490688d02b2c4669e..d13dea514841b110c304b8aa0e65ad16e42c75f3 100755 --- a/paddle/scripts/deb/build_scripts/build.sh +++ b/paddle/scripts/deb/build_scripts/build.sh @@ -1,12 +1,12 @@ #!/bin/bash set -e +apt-get update apt-get install -y dh-make cd ~ mkdir -p ~/dist/gpu mkdir -p ~/dist/cpu mkdir -p ~/dist/cpu-noavx mkdir -p ~/dist/gpu-noavx -git clone https://github.com/baidu/Paddle.git paddle cd paddle mkdir build cd build @@ -33,5 +33,3 @@ cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=OFF -DCUDNN_ROOT=/usr/ make -j `nproc` cpack -D CPACK_GENERATOR='DEB' .. mv *.deb ~/dist/gpu-noavx - - diff --git a/paddle/scripts/deb/build_scripts/build_deb.sh b/paddle/scripts/deb/build_scripts/build_deb.sh index 1331c1249d5a7eae8bf8f4648aacd8579363a402..c38c6299f840345b7f6f6e0aad7482241d36198a 100755 --- a/paddle/scripts/deb/build_scripts/build_deb.sh +++ b/paddle/scripts/deb/build_scripts/build_deb.sh @@ -3,6 +3,6 @@ set -e docker build -t build_paddle_deb . rm -rf dist mkdir -p dist -docker run -v$PWD/dist:/root/dist --name tmp_build_deb_container build_paddle_deb +docker run -v$PWD/dist:/root/dist -v $PWD/../../../..:/root/paddle --name tmp_build_deb_container build_paddle_deb docker rm tmp_build_deb_container docker rmi build_paddle_deb diff --git a/paddle/scripts/docker/Dockerfile.cpu b/paddle/scripts/docker/Dockerfile.cpu index 3aa8cb1a3a8695d513ca0682c0ac2a269e6589da..69b8363b7ac9eed033ec4958e189e233b3dc2689 100644 --- a/paddle/scripts/docker/Dockerfile.cpu +++ b/paddle/scripts/docker/Dockerfile.cpu @@ -1,6 +1,7 @@ FROM ubuntu:14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=OFF ENV IS_DEVEL=OFF ENV WITH_DEMO=OFF diff --git a/paddle/scripts/docker/Dockerfile.cpu-demo b/paddle/scripts/docker/Dockerfile.cpu-demo index 22c0b9e701bfc06230d7fcb1f7b3c49e8e3d0d0f..ccbd183ee3c1ac27fc624f22847f53eb7d60b83d 100644 --- a/paddle/scripts/docker/Dockerfile.cpu-demo +++ b/paddle/scripts/docker/Dockerfile.cpu-demo @@ -1,6 +1,7 @@ FROM ubuntu:14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=OFF ENV IS_DEVEL=ON ENV WITH_DEMO=ON diff --git a/paddle/scripts/docker/Dockerfile.cpu-devel b/paddle/scripts/docker/Dockerfile.cpu-devel index b40f3c0a30ba36571883a9881f6146e630cca187..36460384f383ba10c4bff1d9875cd053d6391b97 100644 --- a/paddle/scripts/docker/Dockerfile.cpu-devel +++ b/paddle/scripts/docker/Dockerfile.cpu-devel @@ -1,6 +1,7 @@ FROM ubuntu:14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=OFF ENV IS_DEVEL=ON ENV WITH_DEMO=OFF diff --git a/paddle/scripts/docker/Dockerfile.cpu-noavx b/paddle/scripts/docker/Dockerfile.cpu-noavx index 5cb5ac7dc4e6811ffdcacae7188dae078adc4030..fa3b7427b0ad3973423894fa7af54ae5a2514e06 100644 --- a/paddle/scripts/docker/Dockerfile.cpu-noavx +++ 
b/paddle/scripts/docker/Dockerfile.cpu-noavx @@ -1,6 +1,7 @@ FROM ubuntu:14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=OFF ENV IS_DEVEL=OFF ENV WITH_DEMO=OFF diff --git a/paddle/scripts/docker/Dockerfile.cpu-noavx-demo b/paddle/scripts/docker/Dockerfile.cpu-noavx-demo index bec401960efb2329a54e0e80043eba7d9ab36d9c..61315f762dee4d64251ef3d8db5b11b30a3ddb3a 100644 --- a/paddle/scripts/docker/Dockerfile.cpu-noavx-demo +++ b/paddle/scripts/docker/Dockerfile.cpu-noavx-demo @@ -1,6 +1,7 @@ FROM ubuntu:14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=OFF ENV IS_DEVEL=ON ENV WITH_DEMO=ON diff --git a/paddle/scripts/docker/Dockerfile.cpu-noavx-devel b/paddle/scripts/docker/Dockerfile.cpu-noavx-devel index b7c3eaed97aa5f11c80a9082fc7733ffd475e965..76365311990b527ea473be840770bfeb6025d74f 100644 --- a/paddle/scripts/docker/Dockerfile.cpu-noavx-devel +++ b/paddle/scripts/docker/Dockerfile.cpu-noavx-devel @@ -1,6 +1,7 @@ FROM ubuntu:14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=OFF ENV IS_DEVEL=ON ENV WITH_DEMO=OFF diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu index b7f5b6d93df50638fbb5e04c61aeed112371ef5b..1e023ae2818dbb27c457ff17b01fc4ab02815eba 100644 --- a/paddle/scripts/docker/Dockerfile.gpu +++ b/paddle/scripts/docker/Dockerfile.gpu @@ -1,6 +1,7 @@ FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=ON ENV IS_DEVEL=OFF ENV WITH_DEMO=OFF diff --git a/paddle/scripts/docker/Dockerfile.gpu-demo b/paddle/scripts/docker/Dockerfile.gpu-demo index 2d1411de09f2ac37aa129f51d5d4e98baa22023b..92b0dca4026c89c6749e14f189370183462333b8 100644 --- a/paddle/scripts/docker/Dockerfile.gpu-demo +++ b/paddle/scripts/docker/Dockerfile.gpu-demo @@ -1,6 +1,7 @@ FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=ON ENV IS_DEVEL=ON ENV WITH_DEMO=ON diff --git a/paddle/scripts/docker/Dockerfile.gpu-devel b/paddle/scripts/docker/Dockerfile.gpu-devel index eb13f4304fa06203fcf319b6b1cf4adf087cb044..fb6f351fd2f7e0f950e00ac96681de88ca238f70 100644 --- a/paddle/scripts/docker/Dockerfile.gpu-devel +++ b/paddle/scripts/docker/Dockerfile.gpu-devel @@ -1,6 +1,7 @@ FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=ON ENV IS_DEVEL=ON ENV WITH_DEMO=OFF diff --git a/paddle/scripts/docker/Dockerfile.gpu-noavx b/paddle/scripts/docker/Dockerfile.gpu-noavx index 0944b0e152af3ba3e4794091c6a76e05b888573b..7567e62025506ca2ae8c1d35d595d92ed6de87f3 100644 --- a/paddle/scripts/docker/Dockerfile.gpu-noavx +++ b/paddle/scripts/docker/Dockerfile.gpu-noavx @@ -1,6 +1,7 @@ FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=ON ENV IS_DEVEL=OFF ENV WITH_DEMO=OFF diff --git a/paddle/scripts/docker/Dockerfile.gpu-noavx-demo b/paddle/scripts/docker/Dockerfile.gpu-noavx-demo index 2da2a55d696a38016b0eb13cd263977026cb1f2b..ac52484c5cb513537283e1a0ffbe9df067fefc9a 100644 --- a/paddle/scripts/docker/Dockerfile.gpu-noavx-demo +++ b/paddle/scripts/docker/Dockerfile.gpu-noavx-demo @@ -1,6 +1,7 @@ FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV 
GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=ON ENV IS_DEVEL=ON ENV WITH_DEMO=ON diff --git a/paddle/scripts/docker/Dockerfile.gpu-noavx-devel b/paddle/scripts/docker/Dockerfile.gpu-noavx-devel index 9f551462f206aaf59fca7ee5bfe258f83cfdd0ca..19202f306b8f71e93af085d5285098a1fbe1dba7 100644 --- a/paddle/scripts/docker/Dockerfile.gpu-noavx-devel +++ b/paddle/scripts/docker/Dockerfile.gpu-noavx-devel @@ -1,6 +1,7 @@ FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=ON ENV IS_DEVEL=ON ENV WITH_DEMO=OFF diff --git a/paddle/scripts/docker/Dockerfile.m4 b/paddle/scripts/docker/Dockerfile.m4 index 129d21b36abd9443e62276c75da3eeb19fa246de..e14493ed9e842351125ab458db53fcc3f38233f6 100644 --- a/paddle/scripts/docker/Dockerfile.m4 +++ b/paddle/scripts/docker/Dockerfile.m4 @@ -1,6 +1,7 @@ FROM PADDLE_BASE_IMAGE MAINTAINER PaddlePaddle Dev Team COPY build.sh /root/ +ENV GIT_CHECKOUT=v0.9.0a0 ENV WITH_GPU=PADDLE_WITH_GPU ENV IS_DEVEL=PADDLE_IS_DEVEL ENV WITH_DEMO=PADDLE_WITH_DEMO diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 33689e736cda76ea5731d847260b399fc8c6e484..ec5f3bd967d3569ee058a2e12d85fc50ba25c69d 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -23,6 +23,7 @@ fi cd ~ git clone https://github.com/baidu/Paddle.git paddle cd paddle +git checkout ${GIT_CHECKOUT} mkdir build cd build cmake .. -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU} -DWITH_SWIG_PY=ON\ diff --git a/paddle/scripts/docker/generate.sh b/paddle/scripts/docker/generate.sh index 8a50aefd34955fcc54c25c833154997472449f93..2ad7527db127f3bd2018a7a1f5b40dacfecca6da 100644 --- a/paddle/scripts/docker/generate.sh +++ b/paddle/scripts/docker/generate.sh @@ -58,4 +58,3 @@ m4 -DPADDLE_WITH_GPU=ON -DPADDLE_IS_DEVEL=ON -DPADDLE_WITH_DEMO=ON \ -DPADDLE_BASE_IMAGE=nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 \ -DPADDLE_WITH_AVX=OFF \ Dockerfile.m4 > Dockerfile.gpu-noavx-demo - diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 4cf5f41f195df7655c9e77eba23baf90e21cee13..20ea2fedc4d464cdd5403af28bc917770c993b98 100644 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -28,6 +28,34 @@ function version(){ echo " with_predict_sdk: @WITH_PREDICT_SDK@" } +function ver2num() { + # convert version to number. + if [ -z "$1" ]; then # empty argument + printf "%03d%03d%03d%03d%03d" 0 + else + local VERN=$(echo $1 | sed 's#v##g' | sed 's#\.# #g' \ + | sed 's#a# 0 #g' | sed 's#b# 1 #g' | sed 's#rc# 2 #g') + if [ `echo $VERN | wc -w` -eq 3 ] ; then + printf "%03d%03d%03d%03d%03d" $VERN 999 999 + else + printf "%03d%03d%03d%03d%03d" $VERN + fi + fi +} + +PADDLE_CONF_HOME="$HOME/.config/paddle" +mkdir -p ${PADDLE_CONF_HOME} + +if [ -z "${PADDLE_NO_STAT+x}" ]; then + SERVER_VER=`curl -m 5 -X POST --data content="{ \"version\": \"@PADDLE_VERSION@\" }"\ + -b ${PADDLE_CONF_HOME}/paddle.cookie \ + -c ${PADDLE_CONF_HOME}/paddle.cookie \ + http://api.paddlepaddle.org/version 2>/dev/null` + if [ $? -eq 0 ] && [ "$(ver2num @PADDLE_VERSION@)" -lt $(ver2num $SERVER_VER) ]; then + echo "PaddlePaddle has released a new version ${SERVER_VER}; you can get the install package at http://www.paddlepaddle.org" + fi +fi + MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" @@ -68,7 +96,7 @@ EOF if [ $? -eq 1 ]; then # Older version installed, or not installed at all echo "First time running paddle; need to install some python dependencies."
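The ver2num helper added to paddle/scripts/submit_local.sh.in above carries the whole update check: a dotted version tag is flattened into five zero-padded fields so that a plain numeric comparison orders releases, and a bare X.Y.Z tag is padded with 999s so a final release outranks its own a/b/rc prereleases. A rough Python rendition of the same mapping (ours, for illustration; the shipped helper stays in shell):

```python
def ver2num(version):
    # 'v0.9.0a0' -> '000009000000000'; 'v0.9.0' -> '000009000999999'
    if not version:
        return '000' * 5
    fields = (version.lstrip('v').replace('.', ' ').replace('rc', ' 2 ')
              .replace('a', ' 0 ').replace('b', ' 1 '))
    parts = [int(p) for p in fields.split()]
    if len(parts) == 3:  # plain X.Y.Z: pad so it sorts after its prereleases
        parts += [999, 999]
    return ''.join('%03d' % p for p in parts)

# The wrapper only prints the upgrade hint when the server's version is
# strictly newer than the local @PADDLE_VERSION@:
assert ver2num('v0.9.0a0') < ver2num('v0.9.0')
```

Note the `[ -z "${PADDLE_NO_STAT+x}" ]` guard in the same hunk: exporting PADDLE_NO_STAT (even empty) skips the curl call entirely, so the version ping is opt-out.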
BASEDIR=$(dirname "$0") - pip install ${BASEDIR}/../opt/paddle/share/wheels/*.whl + pip install ${BASEDIR}/../opt/paddle/share/wheels/*-@PADDLE_VERSION@-*.whl if [ $? -ne 0 ]; then echo "pip install wheels failed. " echo "Please use 'sudo paddle' at the first time you use PaddlePaddle" diff --git a/paddle/scripts/tools/build_docs/.gitignore b/paddle/scripts/tools/build_docs/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6ec14c8f5bc3774a81dbe87c44f458594b38f12c --- /dev/null +++ b/paddle/scripts/tools/build_docs/.gitignore @@ -0,0 +1,2 @@ +doc +doc_cn diff --git a/paddle/scripts/tools/build_docs/Dockerfile b/paddle/scripts/tools/build_docs/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..5db0b29c4739943f9e677dc7973b392a345b7da1 --- /dev/null +++ b/paddle/scripts/tools/build_docs/Dockerfile @@ -0,0 +1,6 @@ +FROM paddledev/paddle:cpu-devel-latest +COPY build.sh / +RUN pip install sphinx &&\ + apt install -y doxygen graphviz &&\ + pip install breathe recommonmark numpy protobuf==2.6.1 +CMD /build.sh diff --git a/paddle/scripts/tools/build_docs/build.sh b/paddle/scripts/tools/build_docs/build.sh new file mode 100755 index 0000000000000000000000000000000000000000..a23b6e61d45926e77015365627bfb7dca303ac65 --- /dev/null +++ b/paddle/scripts/tools/build_docs/build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -ex + +mkdir -p /build +cd /build +cmake /paddle -DWITH_DOC=ON +make paddle_docs paddle_docs_cn -j `nproc` +mkdir -p /output/doc +mkdir -p /output/doc_cn +cp -r doc/html/* /output/doc/ +cp -r doc_cn/html/* /output/doc_cn/ +cd / +rm -rf /paddle/build diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh new file mode 100755 index 0000000000000000000000000000000000000000..9f8b80435c8fb17907d7da52c864a448f0d8d136 --- /dev/null +++ b/paddle/scripts/tools/build_docs/build_docs.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +docker build . -t paddle_build_doc +docker run --rm -v $PWD/../../../../:/paddle -v $PWD:/output paddle_build_doc diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index a73c32344c8abe4d314fbac2c2ec02aafeeac9d1..242fd982aa0015bfe9cb910c52afc3b42ab1028b 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -3,20 +3,24 @@ source ./common.sh CMAKE_EXTRA="" if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib" +else + CMAKE_EXTRA="-DWITH_SWIG_PY=ON" fi -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON ${CMAKE_EXTRA} +cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -DON_COVERALLS=ON ${CMAKE_EXTRA} NPROC=1 if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then NPROC=`nproc` + make -j $NPROC + make coveralls elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then NPROC=`sysctl -n hw.ncpu` + make -j $NPROC + env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC" fi -make -j $NPROC -env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC" sudo make install sudo paddle version diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh index 37e27d665b12fb5b3b8ec7ad245d4587cb0361d6..9b6e420ca7931f0d17da461c7579bf4dc69e18e0 100755 --- a/paddle/scripts/travis/common.sh +++ b/paddle/scripts/travis/common.sh @@ -2,4 +2,3 @@ set -e mkdir -p ../../../build cd ../../../build - diff --git a/paddle/setup.py.in b/paddle/setup.py.in index 3341dd6f95969fcd8df5b6049b0b8d2d5905a43f..1a15eafd5528a68aa9a68ed020de6decb61bd2a7 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -31,8 +31,8 @@ is_lin = (system == 'linux') # because generating paddle LDFLAGS is too complicated to do in setup.py # it just reads COMAKE generated LDFLAGS. extra_links = [] -ldflags = api.paddle_ld_flags.PaddleLDFlag() -ldflags = ldflags.ldflag_str() +obj = api.paddle_ld_flags.PaddleLDFlag() +ldflags = obj.ldflag_str() if ldflags is not None: extra_links.extend(ldflags.split(" ")) @@ -51,13 +51,20 @@ elif is_osx == True: include_dirs = [np.get_include(), "../"] # include numpy and paddle. +extra_c = obj.c_flag() + +attr=dict() +if extra_c is not None: + attr["extra_compile_args"] = extra_c + setup(name="py_paddle", version="@PADDLE_VERSION@", ext_modules=[ Extension('py_paddle._swig_paddle', # Build SWIG Extension. ['Paddle_wrap.cxx'], include_dirs = include_dirs, - extra_link_args = extra_links + extra_link_args = extra_links, + **attr ) ], packages=['py_paddle'], diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt index 08b411d2ccbae7745b5bd72f92c1190cb75ced87..06c019f0a97757b658d1bc3405246d8f47632aad 100644 --- a/paddle/trainer/CMakeLists.txt +++ b/paddle/trainer/CMakeLists.txt @@ -7,6 +7,7 @@ set(TRAINER_SOURCES Tester.cpp Trainer.cpp TrainerInternal.cpp + TrainerBenchmark.cpp ThreadParameterUpdater.cpp TrainerInternalConfig.cpp TrainerConfigHelper.cpp) diff --git a/paddle/trainer/ParamUtil.cpp b/paddle/trainer/ParamUtil.cpp index dae8b44b6db8eec2d8d3284bdc6883355b5128ea..2be9cd62235a262812231579c536a5f0596b69d9 100644 --- a/paddle/trainer/ParamUtil.cpp +++ b/paddle/trainer/ParamUtil.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "ParamUtil.h" #include @@ -48,8 +47,6 @@ ParameterUtil::ParameterUtil( pUpdater_ = parameterUpdater; } - - bool ParameterUtil::loadParameters(int passId, bool local, bool remote) { constexpr int kBufLen = 100; char buf[kBufLen]; @@ -60,8 +57,9 @@ bool ParameterUtil::loadParameters(int passId, bool local, bool remote) { return true; } -void ParameterUtil::loadParametersWithPath(const std::string& dir, - bool local, bool remote) { +void ParameterUtil::loadParametersWithPath(const std::string &dir, + bool local, + bool remote) { if (local) { gserver_->loadParameters(dir); } @@ -89,13 +87,16 @@ void ParameterUtil::saveParameters(int passId, int passInnerId) { } std::string basePath = config_->getSaveDir(); + if (basePath.find('/') == std::string::npos) { + basePath = "./" + basePath; + } mkDirRecursively(basePath.c_str()); std::string saveDir = path::join(basePath, buf); mkDir(saveDir.c_str()); if (!intConfig_->load_save_param_pserver_) { pUpdater_->getParametersRemote(true /*full parameter*/, - true /*after apply*/); + true /*after apply*/); } gserver_->saveParameters(saveDir); @@ -114,9 +115,13 @@ void ParameterUtil::saveParameters(int passId, int passInnerId) { void ParameterUtil::deleteParameters(int passId, int passInnerId) { constexpr int kBufLen = 100; char buf[kBufLen]; - const std::string& saveDir = config_->getSaveDir(); + const std::string &saveDir = config_->getSaveDir(); if (passInnerId > 0) { - snprintf(buf, kBufLen, "%s/pass-%05d-%03d", saveDir.c_str(), passId, + snprintf(buf, + kBufLen, + "%s/pass-%05d-%03d", + saveDir.c_str(), + passId, passInnerId); } else { snprintf(buf, kBufLen, "%s/pass-%05d", saveDir.c_str(), passId); @@ -126,8 +131,7 @@ void ParameterUtil::deleteParameters(int passId, int passInnerId) { rmDir(buf); } - -void ParameterUtil::saveConfigWithPath(const std::string& path) { +void ParameterUtil::saveConfigWithPath(const std::string &path) { std::string src; // save config in some path if (!intConfig_->config_.empty()) { diff --git a/paddle/trainer/ParamUtil.h b/paddle/trainer/ParamUtil.h index cfb637a3edfdcae866964bb232c64bd731e46179..3923941c3d1533621d89313aa09801e98cd5b8a9 100644 --- a/paddle/trainer/ParamUtil.h +++ b/paddle/trainer/ParamUtil.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "paddle/utils/Util.h" @@ -37,14 +36,14 @@ namespace paddle { struct ParameterUtilConfig { DISABLE_COPY(ParameterUtilConfig); - ParameterUtilConfig(bool save_only_one, int saving_period, + ParameterUtilConfig(bool save_only_one, + int saving_period, bool load_save_parameters_in_pserver, - std::string config): - save_only_one_(save_only_one), - saving_period_(saving_period), - load_save_param_pserver_(load_save_parameters_in_pserver), - config_(config) { - } + std::string config) + : save_only_one_(save_only_one), + saving_period_(saving_period), + load_save_param_pserver_(load_save_parameters_in_pserver), + config_(config) {} bool save_only_one_; int saving_period_; @@ -52,7 +51,6 @@ struct ParameterUtilConfig { std::string config_; }; - /** * ParameterUtil * Utility class for loading and saving parameters @@ -80,8 +78,9 @@ public: bool loadParameters(int passId, bool local = true, bool remote = false); /// load parameters given path info - void loadParametersWithPath(const std::string& dir, bool local = true, - bool remote = false); + void loadParametersWithPath(const std::string &dir, + bool local = true, + bool remote = false); /// Save parameter to dist for pass passId /// passInnerId means saving times in one pass, some users want to @@ -97,14 +96,14 @@ public: void deleteParameters(int passId, int passInnerId = 0); /// save config given path info - void saveConfigWithPath(const std::string& path); + void saveConfigWithPath(const std::string &path); /** * Try to load parameter from config. * @return true if can load from trainer config. */ inline bool tryLoadParametersFromConfig() { - auto& c = config_->getConfig(); + auto &c = config_->getConfig(); if (!c.init_model_path().empty()) { loadParametersWithPath(c.init_model_path()); return true; diff --git a/paddle/trainer/ParameterUpdater.cpp b/paddle/trainer/ParameterUpdater.cpp index ef2b1443d9c35e8d3296730b044c2d4cd3217d89..6001a0b391fb3425315de3194945a4d04aff7150 100644 --- a/paddle/trainer/ParameterUpdater.cpp +++ b/paddle/trainer/ParameterUpdater.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ParameterUpdater.h" #include "paddle/utils/Logging.h" @@ -30,7 +29,8 @@ SgdUpdaterWithCpuAverager::SgdUpdaterWithCpuAverager( CHECK(FLAGS_use_gpu && optConfig.do_average_in_cpu()); averager_.reset(AverageOptimizer::create(optConfig, new DummyOptimizer(optConfig), - false /*sparse*/, true /*apply*/)); + false /*sparse*/, + true /*apply*/)); updateWorker_.addJob([]() { hl_set_device(FLAGS_gpu_id); }); } diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h index 854e6a45d890f6fa2265ac72088c8c2574dfde5a..b83b4cf55e27b25864499531bbfe483fb75f78a1 100644 --- a/paddle/trainer/ParameterUpdater.h +++ b/paddle/trainer/ParameterUpdater.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "paddle/utils/Thread.h" @@ -69,7 +68,8 @@ public: ParameterUpdater::init(parameters); optimizer_->init(parameters_.size(), nullptr); // check no L1 decay in parameter configs - CHECK(std::find_if(parameters.begin(), parameters.end(), + CHECK(std::find_if(parameters.begin(), + parameters.end(), [](const ParameterPtr& para) { return para->getConfig().decay_rate_l1() > 0.0f; }) == parameters.end()) @@ -146,7 +146,6 @@ protected: para->getBuf(PARAMETER_GRADIENT)->zeroMem(); } - std::unique_ptr optimizer_; /** @@ -163,10 +162,10 @@ class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated { public: explicit SgdCpuUpdater(const OptimizationConfig& optConfig) : SgdLocalUpdater(optConfig), - Deprecated("SgdCpuUpdater is used only in recursive neural network, " - "and recursive neural network is deprecated in paddle. " - "Use it all by your own.") - {} + Deprecated( + "SgdCpuUpdater is used only in recursive neural network, " + "and recursive neural network is deprecated in paddle. " + "Use it all by your own.") {} /** * @brief update all parameter on finish batch. diff --git a/paddle/trainer/RemoteParameterUpdater.cpp b/paddle/trainer/RemoteParameterUpdater.cpp index 3a5c2a351737ec9eb98b20c679d21dbfea42eea5..d83bb5b10adeff2dc43ad4705e5c55d10856de0d 100644 --- a/paddle/trainer/RemoteParameterUpdater.cpp +++ b/paddle/trainer/RemoteParameterUpdater.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "RemoteParameterUpdater.h" #include "Trainer.h" #include "paddle/utils/Stat.h" @@ -31,7 +30,8 @@ const std::string RemoteParameterUpdater::kAverage = "average"; const std::string RemoteParameterUpdater::kElasticAverage = "elastic_average"; RemoteParameterUpdater::RemoteParameterUpdater( - const OptimizationConfig& config, int expectedPassCount, + const OptimizationConfig& config, + int expectedPassCount, std::unique_ptr&& localUpdater) : config_(config), localUpdater_(std::move(localUpdater)), @@ -94,8 +94,8 @@ void RemoteParameterUpdater::init(std::vector& parameters) { parameterClient_->getParameter(); copyParametersToDevice(PARAMETER_VALUE); } - if (FLAGS_trainer_id == 0 && (config_.algorithm() - != TrainAlgorithm::AsyncSGD)) { + if (FLAGS_trainer_id == 0 && + (config_.algorithm() != TrainAlgorithm::AsyncSGD)) { startController(); useApplyInPserver_ = useApplyInPserver(config_); } @@ -241,7 +241,9 @@ void RemoteParameterUpdater::finishBatch(real cost) { { REGISTER_TIMER("sendAndRecv_dense"); - parameterClient_->sendAndReceiveParameter(mode, sendType, batchSize_, + parameterClient_->sendAndReceiveParameter(mode, + sendType, + batchSize_, 0, // cost = 0 sendBackParameter); } @@ -356,7 +358,8 @@ void RemoteParameterUpdater::restore() { } ConcurrentRemoteParameterUpdater::ConcurrentRemoteParameterUpdater( - OptimizationConfig config, int passCount, + OptimizationConfig config, + int passCount, std::unique_ptr&& localUpdater) : RemoteParameterUpdater(config, passCount, std::move(localUpdater)) { sendThread_.reset(new std::thread([this]() { this->send(); })); @@ -423,7 +426,10 @@ void ConcurrentRemoteParameterUpdater::send(Parameter* para) { std::vector paraSegment; if (para == NULL) { parameterClient_->sendParameter( - mode, sendType, paraSegment, batchSize_, + mode, + sendType, + paraSegment, + batchSize_, 0, // cost=0 true, // sendBackParameter = true batchStatus_); // batchStatus_ = BATCH_FINISH @@ -440,7 +446,10 @@ 
void ConcurrentRemoteParameterUpdater::send(Parameter* para) { copySingleParaFromDevice(para, sendType); hl_stream_synchronize(kDeviceToHostStream); } - parameterClient_->sendParameter(mode, sendType, paraSegment, batchSize_, + parameterClient_->sendParameter(mode, + sendType, + paraSegment, + batchSize_, 0, // cost=0 true, // sendBackParameter = true batchStatus_); @@ -589,14 +598,14 @@ SparseRemoteParameterUpdater::SparseRemoteParameterUpdater( void SparseRemoteParameterUpdater::init(std::vector& parameters) { ParameterUpdater::init(parameters); - parameterClient_.reset(new ParameterClient2(false, - FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse)); + parameterClient_.reset(new ParameterClient2( + false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse)); parameterClient_->init(parameters_); parameterClient_->setTrainerId(FLAGS_trainer_id); if (FLAGS_trainer_id == 0) { - parameterClient_->setConfig(config_, FLAGS_save_dir, - true /*is_sparse_server*/); + parameterClient_->setConfig( + config_, FLAGS_save_dir, true /*is_sparse_server*/); if (parameters[0]->isFullSize()) { parameterClient_->setParameter(); } else { // init in pserver @@ -615,9 +624,8 @@ void SparseRemoteParameterUpdater::startController() { } void SparseRemoteParameterUpdater::controller() { - ParameterClient2 client(false, - FLAGS_port + FLAGS_ports_num, - FLAGS_ports_num_for_sparse); + ParameterClient2 client( + false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse); client.init(parameters_); while (true) { @@ -679,7 +687,9 @@ void SparseRemoteParameterUpdater::finishBatch(real cost) { ParameterType sendType = PARAMETER_GRADIENT; REGISTER_TIMER("sendSparseParam"); - parameterClient_->sendAndReceiveParameter(mode, sendType, batchSize_, + parameterClient_->sendAndReceiveParameter(mode, + sendType, + batchSize_, 0, // cost = 0 false); // sendBackParameter @@ -823,6 +833,6 @@ void SparseRemoteParameterUpdaterComposite::init( std::vector> -ParameterUpdaterCreators::constructors_; + ParameterUpdaterCreators::constructors_; } // namespace paddle diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h index be273e9ef73c744ddbcad760ac50a5720c7502a9..a40884724cc7f963dc6ce5eede750327b2bbfed9 100644 --- a/paddle/trainer/RemoteParameterUpdater.h +++ b/paddle/trainer/RemoteParameterUpdater.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -56,7 +55,8 @@ namespace paddle { class RemoteParameterUpdater : public ParameterUpdater { public: RemoteParameterUpdater( - const OptimizationConfig& config, int expectedPpassCount, + const OptimizationConfig& config, + int expectedPassCount, std::unique_ptr&& localUpdater = nullptr); ~RemoteParameterUpdater() { if (controllerThread_) { @@ -180,7 +180,8 @@ protected: class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater { public: ConcurrentRemoteParameterUpdater( - OptimizationConfig config, int expectedPassCount, + OptimizationConfig config, + int expectedPassCount, std::unique_ptr&& localUpdater); ~ConcurrentRemoteParameterUpdater(); @@ -264,7 +265,8 @@ private: class SparseRemoteParameterUpdater : public ParameterUpdater { public: SparseRemoteParameterUpdater(const OptimizationConfig& config, - int expectedPassCount, bool testing); + int expectedPassCount, + bool testing); ~SparseRemoteParameterUpdater() { if (controllerThread_) { controllerThread_->join(); @@ -345,7 +347,9 @@ public: * @note use syncThreadPool to synchronize these two updaters */ SparseRemoteParameterUpdaterComposite( - const OptimizationConfig& config, int expectedPassCount, bool testing, + const OptimizationConfig& config, + int expectedPassCount, + bool testing, std::unique_ptr&& normalUpdater) { updaters_.resize(NUMBER_UPDATERS); updaters_[UPDATER_SPARSE_REMOTE].reset( @@ -373,11 +377,11 @@ public: */ static void addCreator( const std::function& creator) { // NOLINT explicit move closing ) in this line + bool, // isLocal + size_t // numPasses + )>& creator) { // NOLINT explicit move closing ) in this line // for readability constructors_.push_back(creator); } @@ -395,7 +399,7 @@ public: const OptimizationConfig& optConfig, bool isLocal, size_t numPasses) { - for (auto & c : constructors_) { + for (auto& c : constructors_) { if (auto updater = c(algo, optConfig, isLocal, numPasses)) { return updater; } @@ -406,7 +410,7 @@ public: private: static std::vector> - constructors_; + constructors_; }; } // namespace paddle diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp index ccf06e1d84edc4f57e982102479f99295c1955e3..30e92682baec2fc6035ecfa9dbd90415acd5abe1 100644 --- a/paddle/trainer/Tester.cpp +++ b/paddle/trainer/Tester.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Tester.h" #include @@ -37,58 +36,65 @@ limitations under the License.
*/ namespace paddle { -Tester::Tester(const std::shared_ptr &config, - std::unique_ptr &&intconfig, - const GradientMachinePtr &gradientMachine, - const std::shared_ptr &parameterUpdater, - std::shared_ptr testDataProvider): - config_(config), - intconfig_(std::move(intconfig)), - gradientMachine_(gradientMachine), - parameterUpdater_(parameterUpdater), - testDataProvider_(testDataProvider) { - testEvaluator_.reset(gradientMachine_ ->makeEvaluator()); +Tester::Tester(const std::shared_ptr& config, + std::unique_ptr&& intconfig, + const GradientMachinePtr& gradientMachine, + const std::shared_ptr& parameterUpdater, + std::shared_ptr testDataProvider) + : config_(config), + intconfig_(std::move(intconfig)), + gradientMachine_(gradientMachine), + parameterUpdater_(parameterUpdater), + testDataProvider_(testDataProvider) { + testEvaluator_.reset(gradientMachine_->makeEvaluator()); if (intconfig_->distributeTest) { testParameterClient_.reset(new ParameterClient2(true)); } if (testParameterClient_) { - testParameterClient_->init( - gradientMachine_->getParameters()); + testParameterClient_->init(gradientMachine_->getParameters()); } std::unique_ptr paramConfig( - new ParameterUtilConfig( - intconfig_->saveOnlyOne, - intconfig_->savingPeriod, - intconfig_->loadsaveParametersInPserver, - intconfig_->config)); + new ParameterUtilConfig(intconfig_->saveOnlyOne, + intconfig_->savingPeriod, + intconfig_->loadsaveParametersInPserver, + intconfig_->config)); paramUtil_.reset(new ParameterUtil( - config_, - std::move(paramConfig), - gradientMachine_, - parameterUpdater_)); + config_, std::move(paramConfig), gradientMachine_, parameterUpdater_)); +} + +void Tester::startTestPeriod() { + testEvaluator_->start(); + testContext_.cost = 0; + testContext_.numSamples = 0; + + parameterUpdater_->apply(); + if (intconfig_->prevBatchState) { + gradientMachine_->getState(*intconfig_->trainState); + gradientMachine_->setState(*intconfig_->testState); + } +} + +void Tester::testOneDataBatch(const DataBatch& dataBatch, + std::vector* outArgs) { + testContext_.cost += + forwardOneBatch(dataBatch, testEvaluator_.get(), outArgs); + testContext_.numSamples += dataBatch.getSize(); } void Tester::testOnePeriod() { DataBatch dataBatch; int64_t batchSize = config_->getOptConfig().batch_size(); - testEvaluator_->start(); - real cost = 0; - int64_t numSamples = 0; bool testAllData = intconfig_->testPeriod == 0 || intconfig_->testAllDataInOnePeriod; - int batches = testAllData ? std::numeric_limits::max() : intconfig_->testPeriod; - parameterUpdater_->apply(); - if (intconfig_->prevBatchState) { - gradientMachine_->getState(*intconfig_->trainState); - gradientMachine_->setState(*intconfig_->testState); - } + std::vector outArgs; + startTestPeriod(); for (int i = 0; i < batches; ++i) { int num = testDataProvider_->getNextBatch(batchSize, &dataBatch); if (num == 0) { @@ -102,13 +108,18 @@ void Tester::testOnePeriod() { num = testDataProvider_->getNextBatch(batchSize, &dataBatch); } } - cost += testOneBatch(dataBatch, testEvaluator_.get()); - numSamples += num; + testOneDataBatch(dataBatch, &outArgs); } + finishTestPeriod(); +} + +void Tester::finishTestPeriod() { testEvaluator_->finish(); - CHECK_GT(numSamples, 0) << "There is no samples in your test batch. Possibly " - "wrong implementation of DataProvidor.reset()"; - LOG(INFO) << " Test samples=" << numSamples << " cost=" << cost / numSamples + CHECK_GT(testContext_.numSamples, 0) + << "There are no samples in your test batch. 
Possibly " + "wrong implementation of DataProvidor.reset()"; + LOG(INFO) << " Test samples=" << testContext_.numSamples + << " cost=" << testContext_.cost / testContext_.numSamples << " Eval: " << *testEvaluator_; parameterUpdater_->restore(); if (intconfig_->prevBatchState) { @@ -128,9 +139,11 @@ int64_t Tester::testOneBatchById(int64_t batchId) { return 0; } + std::vector outArgs; + stats_ += std::pair{ actualBatchSize, - testOneBatch(dataBatch, testEvaluator_.get())}; + forwardOneBatch(dataBatch, testEvaluator_.get(), &outArgs)}; if (((batchId + 1) % intconfig_->logPeriod) == 0) { LOG(INFO) << " Batch=" << batchId + 1 << " " << stats_.getStats(false); @@ -139,7 +152,10 @@ int64_t Tester::testOneBatchById(int64_t batchId) { return actualBatchSize; } -real Tester::testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator) { +real Tester::forwardOneBatch(const DataBatch& dataBatch, + Evaluator* evaluator, + std::vector* pOutArgs) { + auto& outArgs = *pOutArgs; const std::vector& inArgs = dataBatch.getStreams(); if (intconfig_->loadsaveParametersInPserver) { REGISTER_TIMER("prefetch"); @@ -148,18 +164,18 @@ real Tester::testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator) { true /*after apply*/); } - std::vector outArgs; gradientMachine_->forward(inArgs, &outArgs, PASS_TEST); // write features if set this flag and outArgs is not empty std::string featFile = intconfig_->featFile; - if (!featFile.empty() && !outArgs.empty()) { + if (!featFile.empty() && outArgs.empty()) { size_t numOutputs = outArgs.size(); std::vector featMatrices; featMatrices.resize(numOutputs); for (size_t i = 0; i < numOutputs; ++i) { featMatrices[i] = Matrix::create(outArgs[i].value->getHeight(), - outArgs[i].value->getWidth(), false, + outArgs[i].value->getWidth(), + false, false); // CPU data buffer featMatrices[i]->copyFrom(*(outArgs[i].value), HPPL_STREAM_DEFAULT); } @@ -201,20 +217,19 @@ real Tester::testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator) { return Argument::sumCosts(outArgs); } - void Tester::testOnePassBatch(int passId) { stats_.reset(); const std::vector inArgs; gradientMachine_->forward(inArgs, nullptr, PASS_TEST); - int64_t num; real cost; + int64_t num; + real cost; gradientMachine_->getStats(cost, num); - stats_ += std::pair {num, cost}; + stats_ += std::pair{num, cost}; gradientMachine_->onPassEnd(); LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false); } - void Tester::testOnePass(int passId) { stats_.reset(); int64_t batchId = 0; @@ -244,7 +259,6 @@ void Tester::testOnePass(int passId) { } } - void Tester::test() { CHECK(testDataProvider_) << "TestData is not specified"; testDataProvider_->setSkipShuffle(); @@ -260,33 +274,32 @@ void Tester::test() { intconfig_->testPass = 0; intconfig_->numPasses = modelList.size(); intconfig_->savingPeriod = 1; - CHECK_EQ(intconfig_->testWait, 0) << - "--test_wait must be 0 for evaluation"; + CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation"; } else if (!initModelPath.empty()) { modelList.push_back(initModelPath); intconfig_->testPass = 0; intconfig_->numPasses = 1; intconfig_->savingPeriod = 1; - CHECK_EQ(intconfig_->testWait, 0) << - "--test_wait must be 0 for evaluation"; + CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation"; } for (int i = intconfig_->testPass; i < intconfig_->numPasses; ++i) { int passId = i; if (passId % intconfig_->savingPeriod == 0) { if (intconfig_->testWait) { - while (paramUtil_->loadParameters(passId, - true /*local*/, true /*remote*/) == 
false) { + while (paramUtil_->loadParameters( + passId, true /*local*/, true /*remote*/) == false) { LOG(INFO) << "Waiting for parameters of pass " << passId; sleep(60); // sleep 60s } } else { if (modelList.size() == 0) { - CHECK_EQ(paramUtil_->loadParameters(passId, - true /*local*/, true /*remote*/), true); + CHECK_EQ(paramUtil_->loadParameters( + passId, true /*local*/, true /*remote*/), + true); } else { - paramUtil_->loadParametersWithPath(modelList[i], - true /*local*/, true /*remote*/); + paramUtil_->loadParametersWithPath( + modelList[i], true /*local*/, true /*remote*/); } } if (IGradientMachineMode::trainWholeDataInOneBatch(intconfig_->mode)) { @@ -305,9 +318,8 @@ void Tester::test() { gradientMachine_->finish(); } - void Tester::printOutput(const std::vector& outArgs, - std::ostream& os) { + std::ostream& os) { size_t numOutputs = outArgs.size(); size_t numIns = outArgs[0].getBatchSize(); if (cpuMat_.size() != numOutputs || cpuVec_.size() != numOutputs) { @@ -325,11 +337,13 @@ void Tester::printOutput(const std::vector& outArgs, } else if (dynamic_cast(outArgs[i].value.get())) { auto sparseMat = dynamic_cast(outArgs[i].value.get()); - cpuMat_[i] = Matrix::createSparseMatrix( - sparseMat->getHeight(), sparseMat->getWidth(), - sparseMat->getElementCnt(), sparseMat->getValueType(), - sparseMat->format_, false, /* trans */ - false); /* useGpu */ + cpuMat_[i] = Matrix::createSparseMatrix(sparseMat->getHeight(), + sparseMat->getWidth(), + sparseMat->getElementCnt(), + sparseMat->getValueType(), + sparseMat->format_, + false, /* trans */ + false); /* useGpu */ hl_stream_t stream = HPPL_STREAM_DEFAULT; cpuMat_[i]->copyFrom(*sparseMat, stream); } else { diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h index 9663b8def9145bc740f28150d34b8ff88fdfd66a..a9de9fe208c61c00fbeebe644222e255308e762b 100644 --- a/paddle/trainer/Tester.h +++ b/paddle/trainer/Tester.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" @@ -49,10 +48,10 @@ public: * for getting parameter from parameter-server. * @param testDataProvider Test data provider. */ - Tester(const std::shared_ptr &config, - std::unique_ptr &&intconfig, - const GradientMachinePtr &gradientMachine, - const std::shared_ptr ¶meterUpdater, + Tester(const std::shared_ptr& config, + std::unique_ptr&& intconfig, + const GradientMachinePtr& gradientMachine, + const std::shared_ptr& parameterUpdater, std::shared_ptr testDataProvider); /** @@ -68,6 +67,10 @@ public: * is training at same time. */ void testOnePeriod(); + void startTestPeriod(); + void finishTestPeriod(); + void testOneDataBatch(const DataBatch& dataBatch, + std::vector* outArgs); /** * Test for given data batch. 
@@ -75,15 +78,15 @@ public: * @param evaluator Evaluator * @return cost */ - real testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator); - + real forwardOneBatch(const DataBatch& dataBatch, + Evaluator* evaluator, + std::vector* outArgs); /** * perform the full pass of test given the test data provider */ void test(); - protected: std::shared_ptr testParameterClient_; std::shared_ptr config_; @@ -99,6 +102,10 @@ protected: std::ofstream os_; std::vector cpuMat_; std::vector cpuVec_; + struct { + int64_t numSamples; + real cost; + } testContext_; private: /** diff --git a/paddle/trainer/TesterConfig.h b/paddle/trainer/TesterConfig.h index d5e644ce6124710c76a463d521c16451e22b5462..90267e68d768f2a144e0041d0f493072ef9eb9a1 100644 --- a/paddle/trainer/TesterConfig.h +++ b/paddle/trainer/TesterConfig.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp index a26e9239f987f63ecbcf0183582ca64d64b50af6..cc22851d8ecbf594df1e3f2c8aeaa98c07b3765b 100644 --- a/paddle/trainer/ThreadParameterUpdater.cpp +++ b/paddle/trainer/ThreadParameterUpdater.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ThreadParameterUpdater.h" #include "paddle/utils/Logging.h" @@ -45,7 +44,8 @@ void SgdThreadUpdater::init(std::vector& parameters) { optimizers_.resize(maxId + 1); for (auto& para : parameters_) { int pid = para->getID(); - optimizers_[pid].reset(sgdOptimizerCreate(config_, para->getConfig(), + optimizers_[pid].reset(sgdOptimizerCreate(config_, + para->getConfig(), para->isGradSparseUpdate(), false /*inPserver*/)); size_t numRows = para->isGradSparseUpdate() ?
para->getConfig().dims(0) : 0; @@ -91,8 +91,10 @@ void SgdThreadUpdater::updateImpl(Parameter* para) { } void SgdThreadUpdater::threadTraverse( - const ParameterOptimizer::TraverseCallback& callback, int tid, - size_t numThreads, Parameter* para) { + const ParameterOptimizer::TraverseCallback& callback, + int tid, + size_t numThreads, + Parameter* para) { VectorPtr* vecs = Parameter::getTlsTempBufs(); if (para->isGradSparseUpdate()) { size_t height = para->getConfig().dims(0); @@ -106,8 +108,8 @@ void SgdThreadUpdater::threadTraverse( } } else { // dense // setup sub bufs - auto interval = calcSplitArrayInterval(para->getSize(), (size_t)tid, - numThreads, 8LU /*for avx*/); + auto interval = calcSplitArrayInterval( + para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); for (auto type : parameterTypes_) { vecs[type]->subVecFrom(*para->getBuf(type), interval); } @@ -150,7 +152,7 @@ void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) { } else if (hasCpuPara) { getGlobalSyncThreadPool()->exec(cpuTraverse); } else if (hasGpuPara) { - gpuTraverse(0, 0); + gpuTraverse(0, 0); } } @@ -168,9 +170,8 @@ void SgdThreadUpdater::catchUpWith() { void SgdThreadUpdater::apply() { catchUpWith(); - traverse([this](Parameter* para) { - return optimizers_[para->getID()]->apply(); - }); + traverse( + [this](Parameter* para) { return optimizers_[para->getID()]->apply(); }); } void SgdThreadUpdater::restore() { @@ -205,9 +206,9 @@ void SgdThreadUpdater::finishBatch(real cost) { } } -void SgdThreadUpdater::threadUpdateSparse( - int tid, size_t numThreads, Parameter* para) { - +void SgdThreadUpdater::threadUpdateSparse(int tid, + size_t numThreads, + Parameter* para) { int pid = para->getID(); ParameterOptimizer* optimizer = optimizers_[pid].get(); VectorPtr* vecs = Parameter::getTlsTempBufs(); @@ -216,10 +217,10 @@ void SgdThreadUpdater::threadUpdateSparse( size_t width = para->getConfig().dims(1); if (dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get())) { + para->getMat(PARAMETER_GRADIENT).get())) { // From MultiGradientMachine SparseRowIdsCpuMatrix* mainMat = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); + para->getMat(PARAMETER_GRADIENT).get()); std::vector& sparseIds = mainMat->getIds(tid); for (auto id : sparseIds) { @@ -232,16 +233,16 @@ void SgdThreadUpdater::threadUpdateSparse( } sparseIds.clear(); } else if (dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get())) { + para->getMat(PARAMETER_GRADIENT).get())) { // From NeuralNetwork SparseRowCpuMatrix* mainMat = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); + para->getMat(PARAMETER_GRADIENT).get()); std::vector& localIndices = mainMat->getIndexDictHandle()->localIndices; - auto interval = calcSplitArrayInterval( - localIndices.size(), tid, numThreads); + auto interval = + calcSplitArrayInterval(localIndices.size(), tid, numThreads); for (size_t i = interval.first; i < interval.second; ++i) { auto id = localIndices[i]; real* row = mainMat->getLocalRow(i); @@ -258,15 +259,14 @@ void SgdThreadUpdater::threadUpdateSparse( } // For numThreads > 1, MultiGradientMachine is used, which goes // to the above branch. 
- CHECK_EQ(numThreads, 1); + CHECK_EQ(numThreads, 1UL); mainMat->clearIndices(); } else { - auto & m = *para->getMat(PARAMETER_GRADIENT).get(); + auto& m = *para->getMat(PARAMETER_GRADIENT).get(); LOG(FATAL) << "Internal error: " << para->getName() << " " << typeid(m).name(); } - if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) { for (size_t i = tid; i < height; i += numThreads) { // setup sub bufs @@ -278,14 +278,15 @@ void SgdThreadUpdater::threadUpdateSparse( } } -void SgdThreadUpdater::threadUpdateDense(int tid, size_t numThreads, +void SgdThreadUpdater::threadUpdateDense(int tid, + size_t numThreads, Parameter* para) { int pid = para->getID(); ParameterOptimizer* optimizer = optimizers_[pid].get(); VectorPtr* vecs = Parameter::getTlsTempBufs(); - auto interval = calcSplitArrayInterval(para->getSize(), (size_t)tid, - numThreads, 8LU /*for avx*/); + auto interval = calcSplitArrayInterval( + para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); // setup sub bufs for (auto type : parameterTypes_) { diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h index d8a7a5dd4f12afc7edfbf2c5f28cbe31d7516153..5a5e3f1d4b3c1e915aa6ac01ff503c552e42de1a 100644 --- a/paddle/trainer/ThreadParameterUpdater.h +++ b/paddle/trainer/ThreadParameterUpdater.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" @@ -26,7 +25,6 @@ limitations under the License. */ #include #include - namespace paddle { /** @@ -45,14 +43,12 @@ public: explicit SgdThreadUpdater(const OptimizationConfig& optConfig); virtual ~SgdThreadUpdater() {} - // Use the startPass() function of the base optimizer. virtual void startPass(); // Use the finishPass() function of the base optimizer. virtual bool finishPass(real cost); - virtual void init(std::vector& parameters); virtual PassType startBatch(int64_t batchSize); // Call finishBatch for each optimizer. @@ -78,9 +74,11 @@ protected: void threadUpdateDense(int tid, size_t numThreads, Parameter* para); // The update function for after update operations, such as averager. void threadTraverse(const ParameterOptimizer::TraverseCallback& callback, - int tid, size_t numThreads, Parameter* para); + int tid, + size_t numThreads, + Parameter* para); typedef std::function - GetTraverseCallback; + GetTraverseCallback; void traverse(GetTraverseCallback getTraverseCallback); }; diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index 275150e12d12b57550ce45355cb3c533b57b4b86..8a5162912e5feae9b80ab8fff56bb20e4dac1696 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Trainer.h" #include @@ -40,7 +39,8 @@ limitations under the License. */ #include "TrainerConfigHelper.h" P_DEFINE_string(config, "", "Trainer config file"); -P_DEFINE_int32(test_period, 1000, +P_DEFINE_int32(test_period, + 0, "Run test every so many train batches." " 0 for testing after each pass." " If not 0, test log_period batches." 
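Several ThreadParameterUpdater hunks above re-wrap calls of the form calcSplitArrayInterval(para->getSize(), tid, numThreads, 8LU /*for avx*/): each thread takes one contiguous slice of a parameter vector, with slice boundaries kept on 8-element multiples so AVX-width accesses stay aligned. The patch only reflows those call sites; as a reading aid, here is a small Python sketch of how such a splitter can behave (our reconstruction under that alignment assumption, not Paddle's actual implementation):

```python
def calc_split_array_interval(size, tid, num_threads, align=8):
    # Carve [0, size) into num_threads contiguous chunks whose inner
    # boundaries fall on multiples of `align`.
    blocks = (size + align - 1) // align                 # align-wide blocks
    per_thread = (blocks + num_threads - 1) // num_threads
    start = min(tid * per_thread * align, size)
    end = min((tid + 1) * per_thread * align, size)
    return start, end

# The slices are disjoint, contiguous, and cover every element exactly once:
size, n = 1000, 4
spans = [calc_split_array_interval(size, t, n) for t in range(n)]
assert spans[0][0] == 0 and spans[-1][1] == size
assert all(a[1] == b[0] for a, b in zip(spans, spans[1:]))
```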
@@ -49,23 +49,28 @@ P_DEFINE_int32(test_period, 1000, P_DEFINE_bool(local, true, "Train in local mode or not"); P_DEFINE_bool( - test_all_data_in_one_period, false, + test_all_data_in_one_period, + false, "true will test all data in one test peroid." "Otherwise test (batch_size * log_peroid) data in one test period."); -P_DEFINE_int32(average_test_period, 0, +P_DEFINE_int32(average_test_period, + 0, "Do test on average parameter every so" " many batches. MUST be devided by FLAGS_log_period." " Default 0 means do not test average parameter"); P_DEFINE_int32(saving_period, 1, "Save parameteres every so many passes"); -P_DEFINE_int64(saving_period_by_batches, 0, +P_DEFINE_int64(saving_period_by_batches, + 0, "Save parameters every so many batches in one pass"); P_DEFINE_string(save_dir, "", "Directory for saving model parameter"); -P_DEFINE_int32(start_pass, 0, +P_DEFINE_int32(start_pass, + 0, "Start training from this pass. " "Will load parameter from the previous pass"); -P_DEFINE_int32(test_pass, -1, +P_DEFINE_int32(test_pass, + -1, "Will load parameter start from this pass to test"); P_DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist"); P_DEFINE_bool(with_cost, true, "enable cost layer or not"); @@ -73,17 +78,21 @@ P_DEFINE_bool(distribute_test, false, "test in distribute mode"); P_DEFINE_int32(num_passes, 100, "train for so many passes"); -P_DEFINE_string(config_args, "", +P_DEFINE_string(config_args, + "", "arguments passed to config file." "Format: key1=value1,key2=value2"); -P_DEFINE_bool(save_only_one, false, +P_DEFINE_bool(save_only_one, + false, "Save only parameters in last pass, remove previous."); P_DEFINE_string(feat_file, "", "File name of extracted feature."); -P_DEFINE_string(predict_output_dir, "", +P_DEFINE_string(predict_output_dir, + "", "Directory that saves the predicted results of output layers"); -P_DEFINE_string(model_list, "", +P_DEFINE_string(model_list, + "", "File that saves the model list when evaluation"); namespace paddle { @@ -98,11 +107,11 @@ void Trainer::init(int argc, char** argv) { init(config); } -void Trainer::init(const std::shared_ptr &config, +void Trainer::init(const std::shared_ptr& config, bool testing, - const std::shared_ptr &gradientMachine, - const std::shared_ptr &dataProvider, - const std::shared_ptr &testDataProvider) { + const std::shared_ptr& gradientMachine, + const std::shared_ptr& dataProvider, + const std::shared_ptr& testDataProvider) { this->stats_ = std::make_shared(); config_ = config; @@ -156,13 +165,16 @@ void Trainer::init(const std::shared_ptr &config, LOG(INFO) << "trainer mode: Testing"; } } else if (IGradientMachineMode::tryGetMode( - (int*)&mode_, config_->getOptConfig().algorithm(), - FLAGS_trainer_count, - FLAGS_local, FLAGS_use_gpu)) { + (int*)&mode_, + config_->getOptConfig().algorithm(), + FLAGS_trainer_count, + FLAGS_local, + FLAGS_use_gpu)) { LOG(INFO) << "Custom trainer mode."; } else if ((config_->getOptConfig().algorithm() == TrainAlgorithm::SGD || - config_->getOptConfig().algorithm() == TrainAlgorithm::AsyncSGD) - && useSparseUpdater) { + config_->getOptConfig().algorithm() == + TrainAlgorithm::AsyncSGD) && + useSparseUpdater) { mode_ = GradientMachine::kSgdSparseCpuTraining; LOG(INFO) << "trainer mode: SgdSparseCpuTraining"; } else { @@ -171,32 +183,33 @@ void Trainer::init(const std::shared_ptr &config, } // initialize trainer internal - trainerInternal_.init(config_, gradientMachine, + trainerInternal_.init(config_, + gradientMachine, TrainerInternalConfig::createFromMode(mode_), - 
stats_, testing); + stats_, + testing); std::unique_ptr paramConfig( - new ParameterUtilConfig(FLAGS_save_only_one, - FLAGS_saving_period, - FLAGS_loadsave_parameters_in_pserver, - FLAGS_config)); + new ParameterUtilConfig(FLAGS_save_only_one, + FLAGS_saving_period, + FLAGS_loadsave_parameters_in_pserver, + FLAGS_config)); paramUtil_.reset( - new paddle::ParameterUtil( - config_, - std::move(paramConfig), - trainerInternal_.getGradientMachine(), - trainerInternal_.getParameterUpdater())); - + new paddle::ParameterUtil(config_, + std::move(paramConfig), + trainerInternal_.getGradientMachine(), + trainerInternal_.getParameterUpdater())); - bool gpuData = FLAGS_use_gpu && (!FLAGS_parallel_nn) && - (!IGradientMachineMode::dataMustInCpu(mode_, - FLAGS_trainer_count)); + bool gpuData = + FLAGS_use_gpu && (!FLAGS_parallel_nn) && + (!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count)); dataProvider_ = dataProvider; if (!dataProvider_ && config_->hasDataConfig()) { dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData)); } - if (dataProvider_) { + if (!testDataProvider_) { + // No evaluator_ if there is testDataProvider but no dataProvider. evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator()); currentEvaluator_.reset( trainerInternal_.getGradientMachine()->makeEvaluator()); @@ -215,10 +228,7 @@ void Trainer::init(const std::shared_ptr &config, DataProvider::create(config_->getTestDataConfig(), *config_, gpuData)); } if (testDataProvider_) { - tester_.reset(new Tester(config_, createTesterConfig(), - trainerInternal_.getGradientMachine(), - trainerInternal_.getParameterUpdater(), - testDataProvider_)); + createTester(); } if (!testing && @@ -246,51 +256,43 @@ void Trainer::init(const std::shared_ptr &config, } else if (!config_->getConfig().init_model_path().empty() && (FLAGS_local || FLAGS_trainer_id == 0)) { paramUtil_->loadParametersWithPath( - config_->getConfig().init_model_path(), - false /*local*/, true /*remote*/); + config_->getConfig().init_model_path(), + false /*local*/, + true /*remote*/); } else if (config_->getConfig().start_pass() > 0 && (FLAGS_local || FLAGS_trainer_id == 0)) { CHECK(paramUtil_->loadParameters(config_->getConfig().start_pass() - 1, - false /*local*/, true /*remote*/)); + false /*local*/, + true /*remote*/)); } else { trainerInternal_.getParameterUpdater()->randParametersRemote(); } } } - // set current evaluator and evalutor trainerInternal_.setCurrentEvaluator(currentEvaluator_.get()); trainerInternal_.setEvaluator(evaluator_.get()); } void Trainer::train(size_t numPasses) { - srand(config_->getConfig().start_pass() + 1); - dataProvider_->reset(); - - if (this->testDataProvider_) { - this->testDataProvider_->reset(); - } - - trainerInternal_.getGradientMachine()->start(*config_, dataProvider_); - + startTrain(); for (size_t i = 0; i < numPasses; ++i) { if (IGradientMachineMode::trainWholeDataInOneBatch(mode_)) { trainOnePassBatch(config_->getConfig().start_pass() + i); } else { - trainOnePass(config_->getConfig().start_pass() + i); + trainOnePass(); } if (i < numPasses - 1) { dataProvider_->reset(); } } - trainerInternal_.getGradientMachine()->finish(); + finishTrain(); } - static double genPerturbation(real* d, real* grad, size_t dim) { - auto & reng = ThreadLocalRandomEngine::get(); + auto& reng = ThreadLocalRandomEngine::get(); std::uniform_real_distribution dist(-1, 1); double gradNorm = 0, dNorm = 0; for (size_t i = 0; i < dim; ++i) { @@ -387,13 +389,28 @@ real Trainer::checkGradient() { return maxDiff; } -void 
Trainer::trainOnePass(int passId) { - this->stats_->reset(); - int64_t batchId = 0; - int32_t batchSize = config_->getOptConfig().batch_size(); - real avgTestCost = 0; - int64_t numAvgTests = 0; - int passInnerId = 1; +void Trainer::startTrain() { + trainPassContext_.passId = config_->getConfig().start_pass(); + srand(config_->getConfig().start_pass() + 1); + if (dataProvider_) { + dataProvider_->reset(); + } + + if (this->testDataProvider_) { + this->testDataProvider_->reset(); + } + + trainerInternal_.getGradientMachine()->start(*config_, dataProvider_); +} + +void Trainer::finishTrain() { trainerInternal_.getGradientMachine()->finish(); } + +void Trainer::startTrainPass() { + stats_->reset(); + trainPassContext_.batchId = 0; + trainPassContext_.avgTestCost = 0; + trainPassContext_.numAvgTests = 0; + trainPassContext_.passInnerId = 1; trainerInternal_.getParameterUpdater()->startPass(); evaluator_->start(); @@ -401,81 +418,82 @@ void Trainer::trainOnePass(int passId) { trainerInternal_.getGradientMachine()->resetState(); trainerInternal_.getGradientMachine()->getState(testState_); } - while (true) { - DataBatch dataBatch; - - int num = 0; - { - REGISTER_TIMER("getTrainBatch"); - num = dataProvider_->getNextBatch(batchSize, &dataBatch); - } - if (num == 0) break; +} - if (averageEvaluator_) { - int64_t mod = batchId % FLAGS_average_test_period; - if (mod >= FLAGS_average_test_period - FLAGS_log_period) { - if (mod == FLAGS_average_test_period - FLAGS_log_period) { - averageEvaluator_->start(); - } - trainerInternal_.getParameterUpdater()->apply(); - if (FLAGS_prev_batch_state) { - trainerInternal_.getGradientMachine()->getState(trainState_); - } - avgTestCost += - tester_->testOneBatch(dataBatch, averageEvaluator_.get()); - if (FLAGS_prev_batch_state) { - trainerInternal_.getGradientMachine()->setState(trainState_); - } - numAvgTests += num; - trainerInternal_.getParameterUpdater()->restore(); +void Trainer::trainOneDataBatch(DataBatch& dataBatch) { + int num = dataBatch.getSize(); + if (averageEvaluator_) { + int64_t mod = trainPassContext_.batchId % FLAGS_average_test_period; + if (mod >= FLAGS_average_test_period - FLAGS_log_period) { + if (mod == FLAGS_average_test_period - FLAGS_log_period) { + averageEvaluator_->start(); } + trainerInternal_.getParameterUpdater()->apply(); + if (FLAGS_prev_batch_state) { + trainerInternal_.getGradientMachine()->getState(trainState_); + } + trainPassContext_.avgTestCost += tester_->forwardOneBatch( + dataBatch, averageEvaluator_.get(), &forwardOutput_); + if (FLAGS_prev_batch_state) { + trainerInternal_.getGradientMachine()->setState(trainState_); + } + trainPassContext_.numAvgTests += num; + trainerInternal_.getParameterUpdater()->restore(); } - { - REGISTER_TIMER("TrainBatch"); - trainerInternal_.trainOneBatch(batchId, dataBatch); - } + } + { + REGISTER_TIMER("TrainBatch"); + trainerInternal_.trainOneBatch( + trainPassContext_.batchId, dataBatch, &forwardOutput_); + } - if (averageEvaluator_ && - batchId % FLAGS_average_test_period == FLAGS_average_test_period - 1) { - averageEvaluator_->finish(); - LOG(INFO) << " Averaged parameter:" - << " cost=" << avgTestCost / numAvgTests - << " Eval: " << *averageEvaluator_; - numAvgTests = 0; - avgTestCost = 0; - } + if (averageEvaluator_ && + trainPassContext_.batchId % FLAGS_average_test_period == + FLAGS_average_test_period - 1) { + averageEvaluator_->finish(); + LOG(INFO) << " Averaged parameter:" + << " cost=" + << trainPassContext_.avgTestCost / trainPassContext_.numAvgTests + << " Eval: " << 
*averageEvaluator_; + trainPassContext_.numAvgTests = 0; + trainPassContext_.avgTestCost = 0; + } - ++batchId; + ++trainPassContext_.batchId; - if (batchId % FLAGS_log_period == 0) { - FOR_TIMING(globalStat.setThreadInfo(true)); - FOR_TIMING(globalStat.printAllStatus()); - FOR_TIMING(globalStat.reset()); - } + if (trainPassContext_.batchId % FLAGS_log_period == 0) { + FOR_TIMING(globalStat.setThreadInfo(true)); + FOR_TIMING(globalStat.printAllStatus()); + FOR_TIMING(globalStat.reset()); + } - if (testDataProvider_ && FLAGS_test_period > 0 && - batchId % FLAGS_test_period == 0) { - tester_->testOnePeriod(); - } + if (testDataProvider_ && FLAGS_test_period > 0 && + trainPassContext_.batchId % FLAGS_test_period == 0) { + tester_->testOnePeriod(); + } - if (FLAGS_saving_period_by_batches > 0 && - batchId > FLAGS_saving_period_by_batches * passInnerId && - 0 == FLAGS_trainer_id) { - trainerInternal_.getParameterUpdater()->catchUpWith(); - if (testDataProvider_) { - tester_->testOnePeriod(); - } - paramUtil_->saveParametersOnePass(passId, passInnerId); - ++passInnerId; + if (FLAGS_saving_period_by_batches > 0 && + trainPassContext_.batchId > + FLAGS_saving_period_by_batches * trainPassContext_.passInnerId && + 0 == FLAGS_trainer_id) { + trainerInternal_.getParameterUpdater()->catchUpWith(); + if (testDataProvider_) { + tester_->testOnePeriod(); } + paramUtil_->saveParametersOnePass(trainPassContext_.passId, + trainPassContext_.passInnerId); + ++trainPassContext_.passInnerId; } +} - if (batchId == 0) { +void Trainer::finishTrainPass() { + if (trainPassContext_.batchId == 0) { // This means no more data from DataProvider return; } - trainerInternal_.finishTrainPass(passId, batchId); + trainerInternal_.finishTrainPass(trainPassContext_.passId, + trainPassContext_.batchId); FOR_TIMING(globalStat.setThreadInfo(true)); FOR_TIMING(globalStat.printAllStatus()); @@ -485,9 +503,30 @@ void Trainer::trainOnePass(int passId) { tester_->testOnePeriod(); } - if (passId % FLAGS_saving_period == 0 && FLAGS_trainer_id == 0) { - paramUtil_->saveParametersOnePass(passId); + if (trainPassContext_.passId % FLAGS_saving_period == 0 && + FLAGS_trainer_id == 0) { + paramUtil_->saveParametersOnePass(trainPassContext_.passId); } + ++trainPassContext_.passId; +} + +void Trainer::trainOnePass() { + startTrainPass(); + size_t batchSize = config_->getOptConfig().batch_size(); + while (true) { + DataBatch dataBatch; + + int num = 0; + { + REGISTER_TIMER("getTrainBatch"); + num = dataProvider_->getNextBatch(batchSize, &dataBatch); + } + if (num == 0) break; + CHECK_EQ(num, dataBatch.getSize()); + trainOneDataBatch(dataBatch); + } + + finishTrainPass(); } void Trainer::trainOnePassBatch(int passId) { @@ -497,8 +536,8 @@ void Trainer::trainOnePassBatch(int passId) { const std::vector inArgs; { REGISTER_TIMER("onePass"); - trainerInternal_.getGradientMachine()->forwardBackward(inArgs, nullptr, - PASS_TRAIN, nullptr); + trainerInternal_.getGradientMachine()->forwardBackward( + inArgs, nullptr, PASS_TRAIN, nullptr); } real cost = .0; @@ -508,8 +547,7 @@ void Trainer::trainOnePassBatch(int passId) { trainerInternal_.getGradientMachine()->onPassEnd(); - bool accepted = - trainerInternal_.getParameterUpdater()->finishPass(cost); + bool accepted = trainerInternal_.getParameterUpdater()->finishPass(cost); globalStat.setThreadInfo(true); globalStat.printAllStatus(); @@ -530,11 +568,12 @@ void Trainer::trainOnePassBatch(int passId) { } } -real Trainer::calcGradient(const DataBatch& dataBatch, const Vector& value, +real 
Trainer::calcGradient(const DataBatch& dataBatch, + const Vector& value, Vector& gradient) { CHECK_EQ(value.getSize(), gradient.getSize()); std::vector& parameters = - trainerInternal_.getGradientMachine()->getParameters(); + trainerInternal_.getGradientMachine()->getParameters(); clearGradient(); @@ -555,8 +594,8 @@ real Trainer::calcGradient(const DataBatch& dataBatch, const Vector& value, std::vector inArgs = dataBatch.getStreams(); std::vector outArgs; - trainerInternal_.getGradientMachine()->forwardBackward(inArgs, &outArgs, - PASS_TRAIN); + trainerInternal_.getGradientMachine()->forwardBackward( + inArgs, &outArgs, PASS_TRAIN); real cost = Argument::sumCosts(outArgs); offset = 0; @@ -582,10 +621,16 @@ void Trainer::clearGradient() { int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); } -void Trainer::test() { - tester_->test(); +void Trainer::createTester() { + tester_.reset(new paddle::Tester(config_, + createTesterConfig(), + trainerInternal_.getGradientMachine(), + trainerInternal_.getParameterUpdater(), + testDataProvider_)); } +void Trainer::test() { tester_->test(); } + std::unique_ptr Trainer::createTesterConfig() { TesterConfig* conf = new TesterConfig; conf->testPeriod = FLAGS_test_period; @@ -612,7 +657,5 @@ std::unique_ptr Trainer::createTesterConfig() { return std::unique_ptr(conf); } -ParameterUtil* Trainer::getParameterUtilPtr() { - return paramUtil_.get(); -} +ParameterUtil* Trainer::getParameterUtilPtr() { return paramUtil_.get(); } } // namespace paddle diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h index 9bfd6d107a20438d2b1fc8d3143a39c7961c8115..899607c7c0f17ef2e91969f5ba1dcfa573518727 100644 --- a/paddle/trainer/Trainer.h +++ b/paddle/trainer/Trainer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" @@ -66,18 +65,17 @@ public: * @param testDataProvider Test Data Provider. null if create from config. */ virtual void init( - const std::shared_ptr &config, + const std::shared_ptr& config, bool testing = false, - const std::shared_ptr &gradientMachine = nullptr, - const std::shared_ptr &dataProvider = nullptr, - const std::shared_ptr &testDataProvider = nullptr); + const std::shared_ptr& gradientMachine = nullptr, + const std::shared_ptr& dataProvider = nullptr, + const std::shared_ptr& testDataProvider = nullptr); /** * Initialize Trainer from command line flags. */ void init(int argc, char** argv); - /** * Train until num_passes reached. * One pass means neural network train through all training data. @@ -94,6 +92,12 @@ public: */ real checkGradient(); + void startTrain(); + void finishTrain(); + void startTrainPass(); + void finishTrainPass(); + void trainOneDataBatch(DataBatch& dataBatch); + void time(); /** * given a dataBatch and the current parameter value @@ -102,7 +106,8 @@ public: * TODO(yuyang18): I think this method is deprecated and buggy. Should it be * removed? */ - real calcGradient(const DataBatch& dataBatch, const Vector& value, + real calcGradient(const DataBatch& dataBatch, + const Vector& value, Vector& gradient); /** @@ -144,11 +149,11 @@ public: protected: /** - * Train one pass of data. passId starts from 0 + * Train one pass of data. * * SGD Method. */ - void trainOnePass(int passId); + void trainOnePass(); /** * Train one pass in one batch. 
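The Trainer.h hunks above expose the stages of the SGD pass loop (startTrain, startTrainPass, trainOneDataBatch, finishTrainPass, finishTrain) as public members, while trainOnePass() itself stays protected. As a rough caller-side sketch of how they compose (not code from this patch; runPasses, the include path, and the external batch loop are illustrative assumptions):

#include "Trainer.h"

// Replays what Trainer::train()/trainOnePass() now do internally: one
// startTrain/finishTrain pair per job, one startTrainPass/finishTrainPass
// pair per pass, and trainOneDataBatch for every batch the provider yields.
void runPasses(paddle::Trainer& trainer,
               paddle::DataProvider& provider,
               size_t numPasses,
               int batchSize) {
  trainer.startTrain();
  for (size_t pass = 0; pass < numPasses; ++pass) {
    trainer.startTrainPass();
    paddle::DataBatch batch;
    while (provider.getNextBatch(batchSize, &batch) > 0) {
      trainer.trainOneDataBatch(batch);  // forward/backward + parameter update
    }
    trainer.finishTrainPass();  // logging, periodic testing and saving
  }
  trainer.finishTrain();
}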
@@ -161,6 +166,8 @@ protected: */ void clearGradient(); + void createTester(); + private: std::unique_ptr createTesterConfig(); @@ -173,6 +180,17 @@ protected: MachineState trainState_; MachineState testState_; + struct TrainPassContext { + int64_t batchId; + real avgTestCost; + int64_t numAvgTests; + int passId; + int passInnerId; + }; + std::vector forwardOutput_; + + TrainPassContext trainPassContext_; + std::unique_ptr evaluator_; std::unique_ptr currentEvaluator_; std::unique_ptr averageEvaluator_; @@ -188,12 +206,12 @@ protected: // parameter util std::unique_ptr paramUtil_; - #ifdef PADDLE_METRIC_LEARNING +#ifdef PADDLE_METRIC_LEARNING MetricTrainer trainerInternal_; - #else +#else // trainer Internal TrainerInternal trainerInternal_; - #endif +#endif }; } // namespace paddle diff --git a/paddle/trainer/TrainerBenchmark.cpp b/paddle/trainer/TrainerBenchmark.cpp new file mode 100644 index 0000000000000000000000000000000000000000..54862e95b4a738b88dc256efbac9102fca383a4f --- /dev/null +++ b/paddle/trainer/TrainerBenchmark.cpp @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#undef PADDLE_DISABLE_TIMER + +#include "Trainer.h" +#include "paddle/utils/Stat.h" +#include "paddle/utils/Util.h" + +P_DECLARE_int32(test_period); + +P_DEFINE_bool(feed_data, false, "Whether to read data from DataProvider."); + +namespace paddle { + +void Trainer::time() { + startTrain(); + + trainerInternal_.getParameterUpdater()->startPass(); + evaluator_->start(); + + DataBatch dataBatch; + int32_t batchSize = config_->getOptConfig().batch_size(); + int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch); + CHECK_EQ(num, batchSize) << "The sample number is less than batch size " + << num << " != " << batchSize; + + CHECK(dataBatch.getSize()) << "No data from data provider"; + + std::vector outputs; + // burning time + LOG(INFO) << "Burning time..."; + for (int n = 0; n < 10; ++n) { + trainerInternal_.trainOneBatch(n, dataBatch, &outputs); + } + LOG(INFO) << "Burning time end."; + + for (int n = 0; n < FLAGS_test_period; n++) { + if (FLAGS_feed_data) { + REGISTER_TIMER("GetData"); + num = dataProvider_->getNextBatch(batchSize, &dataBatch); + } + + if (num != batchSize) { + break; + } + + { + REGISTER_TIMER("FwdBwd"); + trainerInternal_.trainOneBatch(n, dataBatch, &outputs); + } + } + globalStat.setThreadInfo(true); + globalStat.printSegTimerStatus(); + globalStat.reset(); + + finishTrain(); +} + +} // namespace paddle diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp index 98197e7988517ad9ae3cf244e98654368a6ec17a..ee5b1e0a9c5a8faa6614d76ab938f1f1b8f4e73a 100644 --- a/paddle/trainer/TrainerConfigHelper.cpp +++ b/paddle/trainer/TrainerConfigHelper.cpp @@ -29,9 +29,8 @@ P_DECLARE_bool(with_gpu); P_DECLARE_bool(parallel_nn); P_DECLARE_string(config_args); - -const char* kConfigParserModuleName = "paddle.trainer.config_parser"; -const char* kConfigParserFuncName = "parse_config_and_serialize"; +const char
*kConfigParserModuleName = "paddle.trainer.config_parser"; +const char *kConfigParserFuncName = "parse_config_and_serialize"; namespace paddle { @@ -40,12 +39,10 @@ struct TrainerConfigHelperPrivate { }; TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath) - :m(new TrainerConfigHelperPrivate()) { + : m(new TrainerConfigHelperPrivate()) { std::ostringstream configArgs; - configArgs << "trainer_id=" << FLAGS_trainer_id - << ",local=" << FLAGS_local - << ",with_cost=" << FLAGS_with_cost - << ",use_gpu=" << FLAGS_use_gpu + configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local + << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu << ",parallel_nn=" << FLAGS_parallel_nn << ",cudnn_version=" << hl_get_cudnn_lib_version(); if (!FLAGS_config_args.empty()) { @@ -54,31 +51,26 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath) VLOG(3) << "Parsing trainer config " << configFilePath; std::string configProtoStr = - callPythonFunc(kConfigParserModuleName, kConfigParserFuncName, + callPythonFunc(kConfigParserModuleName, + kConfigParserFuncName, {configFilePath, configArgs.str()}); CHECK(m->conf.ParseFromString(configProtoStr)); } -TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig& config) - :m(new TrainerConfigHelperPrivate()) { +TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config) + : m(new TrainerConfigHelperPrivate()) { m->conf = config; } - TrainerConfigHelper::~TrainerConfigHelper() { if (m) { delete m; } } -const TrainerConfig & -TrainerConfigHelper::getConfig() const { - return m->conf; -} +const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; } -TrainerConfig& TrainerConfigHelper::getMutableConfig() { - return m->conf; -} +TrainerConfig &TrainerConfigHelper::getMutableConfig() { return m->conf; } const OptimizationConfig &TrainerConfigHelper::getOptConfig() const { return m->conf.opt_config(); @@ -173,8 +165,7 @@ std::string TrainerConfigHelper::getConfigName(bool *ok) const { } else if (!m->conf.init_model_path().empty()) { retv = getConfigNameFromPath(m->conf.init_model_path()); } else if (m->conf.start_pass() >= 1) { - retv = getConfigNameFromPassId(m->conf.start_pass(), - m->conf.save_dir()); + retv = getConfigNameFromPassId(m->conf.start_pass(), m->conf.save_dir()); } if (ok) { @@ -191,8 +182,8 @@ std::shared_ptr TrainerConfigHelper::createFromFlags() { } else if (!FLAGS_init_model_path.empty()) { configPath = getConfigNameFromPath(FLAGS_init_model_path); } else if (FLAGS_start_pass >= 1) { - configPath = getConfigNameFromPassId(FLAGS_start_pass - 1, - FLAGS_init_model_path); + configPath = + getConfigNameFromPassId(FLAGS_start_pass - 1, FLAGS_init_model_path); } else { return nullptr; } diff --git a/paddle/trainer/TrainerConfigHelper.h b/paddle/trainer/TrainerConfigHelper.h index d3ad1eeeb43bc6be0b944e2059dddeab734efb75..d20684964136a553b2d4119e8db5a1de084278bb 100644 --- a/paddle/trainer/TrainerConfigHelper.h +++ b/paddle/trainer/TrainerConfigHelper.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -27,7 +26,6 @@ struct TrainerConfigHelperPrivate; class ModelConfig; class DataConfig; - /** * @brief TrainerConfig Helper. A class wrapping protobuf's TrainerConfig Object, * simplifying the usage of TrainerConfig.
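The TrainerConfigHelper.cpp constructor above round-trips the config through Python: paddle.trainer.config_parser parses the file, parse_config_and_serialize returns a serialized TrainerConfig protobuf string, and the C++ side re-parses it into m->conf. A minimal caller-side sketch (the config file name is an illustrative assumption, not from this patch):

#include "TrainerConfigHelper.h"

// The constructor itself folds command-line flags (trainer_id, local,
// with_cost, use_gpu, parallel_nn, cudnn_version, plus FLAGS_config_args)
// into the config_args string handed to the Python parser.
auto config =
    std::make_shared<paddle::TrainerConfigHelper>("trainer_config.conf");
const auto& opt = config->getOptConfig();  // parsed OptimizationConfig
int batchSize = opt.batch_size();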
@@ -46,7 +44,7 @@ public: * @brief Ctor, Create a TrainerConfig from config file * @param configFilePath Config file path. */ - explicit TrainerConfigHelper(const std::string &configFilePath); + explicit TrainerConfigHelper(const std::string& configFilePath); explicit TrainerConfigHelper(const TrainerConfig& config); /** @@ -106,7 +104,6 @@ public: */ bool hasTestDataConfig() const; - /** * @brief Update trainer config from command line flags. * Override config's (save_dir, init_model_path, start_pass) if command @@ -114,7 +111,6 @@ public: */ void updateConfigFromFlags(); - /** * @brief Disable optimization's sparse remote update. */ @@ -125,13 +121,10 @@ public: */ void disableRemoteSparseUpdaterForEachParams(); - /** * @brief implicit conversion. */ - inline operator const TrainerConfig&() const { - return this->getConfig(); - } + inline operator const TrainerConfig&() const { return this->getConfig(); } /** * @brief implicit conversion. @@ -143,16 +136,12 @@ public: /** * @brief implicit conversion. */ - inline operator const DataConfig&() const { - return this->getDataConfig(); - } + inline operator const DataConfig&() const { return this->getDataConfig(); } /** * @brief implicit conversion. */ - inline operator const ModelConfig&() const { - return this->getModelConfig(); - } + inline operator const ModelConfig&() const { return this->getModelConfig(); } /** * @brief Get mutable optimization config. diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp index 6029a4b2c1d0a0c04058bbd979523f26b72b5a5e..b1c3bf26d21d1760cd1710f372aa8a89fb7b101b 100644 --- a/paddle/trainer/TrainerInternal.cpp +++ b/paddle/trainer/TrainerInternal.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "TrainerInternal.h" #include @@ -37,32 +36,36 @@ limitations under the License. */ namespace paddle { -void TrainerInternal::init(const std::shared_ptr &config, - const GradientMachinePtr &gradientMachine, - std::unique_ptr &&intconfig, - const std::shared_ptr &stats, +void TrainerInternal::init(const std::shared_ptr& config, + const GradientMachinePtr& gradientMachine, + std::unique_ptr&& intconfig, + const std::shared_ptr& stats, bool testing) { - config_ = config; - intconfig_ = std::move(intconfig); - stats_ = stats; + config_ = config; + intconfig_ = std::move(intconfig); + stats_ = stats; - //! in training will use parameter updater definitly. - //! But only use parameter in testing mode when some parameter in pserver. + //! in training will use parameter updater definitely. + //! But only use parameter in testing mode when some parameter in pserver.
+ if (!testing || (config_->getOptConfig().use_sparse_remote_updater() && intconfig_->loadsave_parameters_in_pserver)) { - createParameterUpdater(testing); - } + createParameterUpdater(testing); + } - gradientMachine_ = gradientMachine; - if (!gradientMachine) { - gradientMachine_.reset(GradientMachine::create( - config_->getConfig().model_config(), intconfig_->mode, - parameterUpdater_->getParameterTypes())); - } + gradientMachine_ = gradientMachine; + if (!gradientMachine) { + CHECK(config_->getConfig().has_model_config()) + << "Missing model_config in trainer_config"; + gradientMachine_.reset( + GradientMachine::create(config_->getConfig().model_config(), + intconfig_->mode, + parameterUpdater_->getParameterTypes())); + } } void TrainerInternal::trainOneBatch(int64_t batchId, - const DataBatch& dataBatch) { + const DataBatch& dataBatch, + std::vector* outArgs) { // true means updating parameter whenever gradient is ready during backward() bool doPipelineUpdate = (intconfig_->mode != GradientMachine::kSgdSparseCpuTraining) && @@ -84,7 +87,6 @@ void TrainerInternal::trainOneBatch(int64_t batchId, } const std::vector& inArgs = dataBatch.getStreams(); - std::vector outArgs; PassType passType = parameterUpdater_->startBatch(actualBatchSize); @@ -94,8 +96,8 @@ void TrainerInternal::trainOneBatch(int64_t batchId, parameterUpdater_->getParametersRemote(); } - UpdateCallback updateCallback = - [this, showStats, ¶Stats](Parameter* para) { + UpdateCallback updateCallback = [this, showStats, ¶Stats]( + Parameter* para) { if (showStats) { //! @TODO(yuyang18) Show stats is actually a ParameterHook, refactor // it @@ -114,8 +116,8 @@ void TrainerInternal::trainOneBatch(int64_t batchId, timer.start(); #endif REGISTER_TIMER("forwardBackward"); - forwardBackwardBatch(inArgs, outArgs, passType, updateCallback, - doPipelineUpdate); + forwardBackwardBatch( + inArgs, *outArgs, passType, updateCallback, doPipelineUpdate); #ifndef PADDLE_DISABLE_TIMER timer.stop(); parameterUpdater_->setForwardbackwardTime(timer.get()); @@ -132,7 +134,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId, real cost = 0; { REGISTER_TIMER("sumCost"); - cost = Argument::sumCosts(outArgs); + cost = Argument::sumCosts(*outArgs); } if (batchId % intconfig_->log_period == 0) { @@ -145,7 +147,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId, gradientMachine_->eval(evaluator_); } - *stats_ += { actualBatchSize, cost }; + *stats_ += {actualBatchSize, cost}; { REGISTER_TIMER("finishBatch"); parameterUpdater_->finishBatch(cost); @@ -160,12 +162,11 @@ void TrainerInternal::trainOneBatch(int64_t batchId, if (intconfig_->dot_period > 0) { std::cerr << std::endl; } - LOG(INFO) << " Batch=" << batchId + 1 << " " - << *stats_ + LOG(INFO) << " Batch=" << batchId + 1 << " " << *stats_ << " Eval: " << *evaluator_ << " CurrentEval: " << *currentEvaluator_; } else if (intconfig_->dot_period > 0 && - (batchId + 1) % intconfig_->dot_period == 0) { + (batchId + 1) % intconfig_->dot_period == 0) { std::cerr << "."; } } @@ -177,13 +178,13 @@ void TrainerInternal::finishTrainPass(int passId, int batchId) { gradientMachine_->onPassEnd(); parameterUpdater_->finishPass(); evaluator_->finish(); - LOG(INFO) << " Pass=" << passId << " Batch=" << batchId - << " " << stats_->getStats(false /*without current cost*/) + LOG(INFO) << " Pass=" << passId << " Batch=" << batchId << " " + << stats_->getStats(false /*without current cost*/) << " Eval: " << *evaluator_; } -void TrainerInternal::showParameterStats(const std::vector& - paraStats) { +void 
TrainerInternal::showParameterStats( + const std::vector& paraStats) { std::vector& parameters = gradientMachine_->getParameters(); for (auto& parameter : parameters) { SetDevice device(parameter->getDeviceId()); @@ -216,18 +217,21 @@ void TrainerInternal::showParameterStats(const std::vector& void TrainerInternal::createParameterUpdater(bool testing) { const std::string& alg = config_->getOptConfig().algorithm(); parameterUpdater_.reset(ParameterUpdaterCreators::tryCreateUpdater( - alg, config_->getOptConfig(), intconfig_->local, - intconfig_->num_passes)); - if (parameterUpdater_) { return; } + alg, config_->getOptConfig(), intconfig_->local, intconfig_->num_passes)); + if (parameterUpdater_) { + return; + } if (!intconfig_->local) { if (testing && config_->getOptConfig().use_sparse_remote_updater()) { std::unique_ptr localUpdater; localUpdater.reset( new SgdLocalUpdater(config_->getOptConfig())); // do nothing - parameterUpdater_.reset(new SparseRemoteParameterUpdaterComposite( - config_->getOptConfig(), intconfig_->num_passes, testing, - std::move(localUpdater))); + parameterUpdater_.reset( + new SparseRemoteParameterUpdaterComposite(config_->getOptConfig(), + intconfig_->num_passes, + testing, + std::move(localUpdater))); } else { if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode && !intconfig_->use_old_updater) { @@ -249,21 +253,18 @@ void TrainerInternal::createParameterUpdater(bool testing) { } localUpdater.reset( - intconfig_->use_old_updater + intconfig_->use_old_updater ? new RemoteParameterUpdater( - *config_, - intconfig_->num_passes, - std::move(localUpdater)) + *config_, intconfig_->num_passes, std::move(localUpdater)) : new ConcurrentRemoteParameterUpdater( - *config_, - intconfig_->num_passes, - std::move(localUpdater))); - + *config_, intconfig_->num_passes, std::move(localUpdater))); if (config_->getOptConfig().use_sparse_remote_updater()) { - localUpdater.reset(new SparseRemoteParameterUpdaterComposite( - *config_, intconfig_->num_passes, testing, - std::move(localUpdater))); + localUpdater.reset( + new SparseRemoteParameterUpdaterComposite(*config_, + intconfig_->num_passes, + testing, + std::move(localUpdater))); } this->parameterUpdater_ = std::move(localUpdater); @@ -280,8 +281,7 @@ void TrainerInternal::createParameterUpdater(bool testing) { } else if (intconfig_->use_gpu && config_->getOptConfig().do_average_in_cpu() && config_->getOptConfig().average_window() > 0) { - parameterUpdater_.reset( - new SgdUpdaterWithCpuAverager(*config_)); + parameterUpdater_.reset(new SgdUpdaterWithCpuAverager(*config_)); } else { parameterUpdater_.reset(new SgdLocalUpdater(*config_)); } @@ -292,10 +292,10 @@ void TrainerInternal::createParameterUpdater(bool testing) { } void TrainerInternal::forwardBackwardBatch(const std::vector& inArgs, - std::vector& outArgs, - PassType& passType, - UpdateCallback updateCallback, - bool doPipelineUpdate) { + std::vector& outArgs, + PassType& passType, + UpdateCallback updateCallback, + bool doPipelineUpdate) { gradientMachine_->forwardBackward( inArgs, &outArgs, passType, doPipelineUpdate ? updateCallback : nullptr); } diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h index 17011c4d2e46fee34e8abf08279327fa747d9c0a..962d53a30e5454060e8ce864c347c37b9cc98116 100644 --- a/paddle/trainer/TrainerInternal.h +++ b/paddle/trainer/TrainerInternal.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" @@ -28,7 +27,6 @@ limitations under the License. */ #include "TrainerConfigHelper.h" #include "TrainerInternalConfig.h" - namespace paddle { /** @@ -40,12 +38,10 @@ public: struct ParaStat { real maxAbsGrad; real avgAbsGrad; - ParaStat() :maxAbsGrad(.0), avgAbsGrad(.0){ - } + ParaStat() : maxAbsGrad(.0), avgAbsGrad(.0) {} }; - TrainerInternal() { - } + TrainerInternal() {} /** * Initializes trainer internal class * @param config configuration * @param machine gradient machine * @param stats training stats * @param testing if it is in testing phase */ - void init(const std::shared_ptr &config, - const GradientMachinePtr &machine, - std::unique_ptr &&intconfig, - const std::shared_ptr &stats, + void init(const std::shared_ptr& config, + const GradientMachinePtr& machine, + std::unique_ptr&& intconfig, + const std::shared_ptr& stats, bool testing); virtual ~TrainerInternal() {} @@ -81,7 +77,9 @@ public: * @param batchId current batch id * @param dataBatch data for the batch */ - void trainOneBatch(int64_t batchId, const DataBatch& dataBatch); + void trainOneBatch(int64_t batchId, + const DataBatch& dataBatch, + std::vector* outArgs); /** * showParameterStats * @param paraStats updated parameter stats */ @@ -92,7 +90,7 @@ public: /** * getGradientMachine */ - inline const GradientMachinePtr & getGradientMachine() const { + inline const GradientMachinePtr& getGradientMachine() const { return gradientMachine_; } @@ -107,17 +105,13 @@ public: * setCurrentEvaluator * @param eval evaluator to set */ - inline void setCurrentEvaluator(Evaluator* eval) { - currentEvaluator_ = eval; - } + inline void setCurrentEvaluator(Evaluator* eval) { currentEvaluator_ = eval; } /** * setEvaluator * @param eval evaluator to set */ - inline void setEvaluator(Evaluator* eval) { - evaluator_ = eval; - } + inline void setEvaluator(Evaluator* eval) { evaluator_ = eval; } /** * forwardBackwardBatch diff --git a/paddle/trainer/TrainerInternalConfig.cpp b/paddle/trainer/TrainerInternalConfig.cpp index 4a829a4df9e345d5d6b82740deea3cd005f6432b..0dc74cb3b39309b33a1a92dfa5a45e95defb4120 100644 --- a/paddle/trainer/TrainerInternalConfig.cpp +++ b/paddle/trainer/TrainerInternalConfig.cpp @@ -14,7 +14,8 @@ limitations under the License. */ #include "TrainerInternalConfig.h" -P_DEFINE_int32(show_parameter_stats_period, 0, +P_DEFINE_int32(show_parameter_stats_period, + 0, "Whether to show parameter stats during training"); P_DEFINE_int32(dot_period, 1, "Print '.' every so many batches"); diff --git a/paddle/trainer/TrainerInternalConfig.h b/paddle/trainer/TrainerInternalConfig.h index 9b59143bade737d9cde225836b8ae634e8e1543f..b7bfd29abd729b33ca953fb20835c57cbcf3ef74 100644 --- a/paddle/trainer/TrainerInternalConfig.h +++ b/paddle/trainer/TrainerInternalConfig.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" @@ -94,9 +93,7 @@ public: * @brief get all processed samples' number * @return all processed samples' number */ - inline int64_t getNumProcessed() const { - return this->numProcessed_; - } + inline int64_t getNumProcessed() const { return this->numProcessed_; } /** * @brief same function as addCost. But it is simpler to invoke. @@ -111,7 +108,7 @@ public: * @param p a pair of parameter, first is numProcessed, second is cost.
* @return *this */ - inline TrainerStats& operator += (const std::pair& p) { + inline TrainerStats& operator+=(const std::pair& p) { this->addCost(p.first, p.second); return *this; } @@ -121,9 +118,7 @@ public: * * reset stat when constructed. */ - inline TrainerStats() { - this->reset(); - } + inline TrainerStats() { this->reset(); } /** * @brief show stats to ostream. @@ -137,7 +132,7 @@ public: os << "samples=" << this->getNumProcessed() << " AvgCost=" << this->getAvgCost(); if (withCurrentCost) { - os << " CurrentCost=" << this->getCurrentAvgCost(); + os << " CurrentCost=" << this->getCurrentAvgCost(); } } diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index 94266639f94ade6b490eb26243dd964ddedf40b9..e23e745d99c7b10fb780cb0c89e27207eefc19c1 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "paddle/utils/PythonUtil.h" #include "paddle/utils/StringUtil.h" @@ -34,7 +33,7 @@ P_DECLARE_string(rdma_tcp); using namespace paddle; // NOLINT int main(int argc, char** argv) { - // write logs instantly (never buffer log messages) +// write logs instantly (never buffer log messages) #ifdef PADDLE_USE_GLOG FLAGS_logbuflevel = -1; #endif @@ -103,6 +102,8 @@ int main(int argc, char** argv) { trainer.checkGradient(); } else if (FLAGS_job == "test") { trainer.test(); + } else if (FLAGS_job == "time") { + trainer.time(); } else { LOG(FATAL) << "Unknown job type: " << FLAGS_job; } diff --git a/paddle/trainer/tests/__init__.py b/paddle/trainer/tests/__init__.py index 7f9e87eee6037666b86420fba194624859d356b3..c90af2ee000d46a032984ee23559e7e99b49ddad 100644 --- a/paddle/trainer/tests/__init__.py +++ b/paddle/trainer/tests/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
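Given the operator+= overload above, a TrainerStats object absorbs a (numProcessed, cost) pair the same way TrainerInternal::trainOneBatch records {actualBatchSize, cost}. A small sketch (the numbers are illustrative; TrainerStats is declared in TrainerInternalConfig.h):

#include <sstream>
#include "TrainerInternalConfig.h"

paddle::TrainerStats stats;   // the constructor calls reset()
stats += {64, 12.5};          // a batch of 64 samples with summed cost 12.5
stats += {64, 11.0};
std::ostringstream os;
stats.showStats(os, /* withCurrentCost */ true);
// os.str() now reads like: samples=128 AvgCost=... CurrentCost=...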
- diff --git a/paddle/trainer/tests/config_parser_test.py b/paddle/trainer/tests/config_parser_test.py index 5ca874cec7914a20f79c2c7b1873c5bd04f60dca..c5ec315d6b01b0a5a3f73673e1756e9c06d685ba 100644 --- a/paddle/trainer/tests/config_parser_test.py +++ b/paddle/trainer/tests/config_parser_test.py @@ -17,6 +17,6 @@ from paddle.trainer.config_parser import parse_config_and_serialize if __name__ == '__main__': parse_config_and_serialize('trainer/tests/test_config.conf', '') parse_config_and_serialize( - 'trainer/tests/sample_trainer_config.conf', + 'trainer/tests/sample_trainer_config.conf', 'extension_module_name=paddle.trainer.config_parser_extension') parse_config_and_serialize('gserver/tests/pyDataProvider/trainer.conf', '') diff --git a/paddle/trainer/tests/gen_proto_data.py b/paddle/trainer/tests/gen_proto_data.py index c818a94bee7c28b0245d28dd62353d46444cb592..a3dbc10c886e183582b44fee479d5ffb074193ef 100644 --- a/paddle/trainer/tests/gen_proto_data.py +++ b/paddle/trainer/tests/gen_proto_data.py @@ -21,8 +21,7 @@ import logging import pprint logging.basicConfig( - format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', -) + format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', ) logger = logging.getLogger('paddle') logger.setLevel(logging.INFO) @@ -36,33 +35,32 @@ num_original_columns = 3 # [[-1,0], [0,0]] means previous token at column 0 and current token at # column 0 are combined as one feature. patterns = [ - [[-2,0]], - [[-1,0]], - [[0,0]], - [[1,0]], - [[2,0]], - - [[-1,0], [0,0]], - [[0,0], [1,0]], - - [[-2,1]], - [[-1,1]], - [[0,1]], - [[1,1]], - [[2,1]], - [[-2,1], [-1,1]], - [[-1,1], [0,1]], - [[0,1], [1,1]], - [[1,1], [2,1]], - - [[-2,1], [-1,1], [0,1]], - [[-1,1], [0,1], [1,1]], - [[0,1], [1,1], [2,1]], + [[-2, 0]], + [[-1, 0]], + [[0, 0]], + [[1, 0]], + [[2, 0]], + [[-1, 0], [0, 0]], + [[0, 0], [1, 0]], + [[-2, 1]], + [[-1, 1]], + [[0, 1]], + [[1, 1]], + [[2, 1]], + [[-2, 1], [-1, 1]], + [[-1, 1], [0, 1]], + [[0, 1], [1, 1]], + [[1, 1], [2, 1]], + [[-2, 1], [-1, 1], [0, 1]], + [[-1, 1], [0, 1], [1, 1]], + [[0, 1], [1, 1], [2, 1]], ] + def make_features(sequence): length = len(sequence) num_features = len(sequence[0]) + def get_features(pos): if pos < 0: return ['#B%s' % -pos] * num_features @@ -72,9 +70,10 @@ def make_features(sequence): for i in xrange(length): for pattern in patterns: - fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern]) + fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern]) sequence[i].append(fname) + ''' Source file format: Each line is for one timestep. The features are separated by space. @@ -87,6 +86,8 @@ i-th column. return a list of dict for each column ''' + + def create_dictionaries(filename, cutoff, oov_policy): def add_to_dict(sequence, dicts): num_features = len(dicts) @@ -118,7 +119,6 @@ def create_dictionaries(filename, cutoff, oov_policy): features = line.split(' ') sequence.append(features) - for i in xrange(num_features): dct = dicts[i] n = 1 if oov_policy[i] == OOV_POLICY_USE else 0 @@ -161,12 +161,9 @@ existed in dicts[i] will be assigned to id 0. if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist in dicts[i]. 
''' -def gen_proto_file( - input_file, - dicts, - oov_policy, - output_file): + +def gen_proto_file(input_file, dicts, oov_policy, output_file): def write_sequence(out, sequence): num_features = len(dicts) is_beginning = True @@ -213,8 +210,8 @@ def gen_proto_file( if patterns: slot_def = header.slot_defs.add() slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE - slot_def.dim = sum([len(dicts[i]) - for i in xrange(num_original_columns, len(dicts))]) + slot_def.dim = sum( + [len(dicts[i]) for i in xrange(num_original_columns, len(dicts))]) logger.info("feature_dim=%s" % slot_def.dim) for i in xrange(num_original_columns): @@ -242,30 +239,31 @@ def gen_proto_file( logger.info("num_sequences=%s" % num_sequences) + dict2 = { - 'B-ADJP': 0, - 'I-ADJP': 1, - 'B-ADVP': 2, - 'I-ADVP': 3, - 'B-CONJP': 4, - 'I-CONJP': 5, - 'B-INTJ': 6, - 'I-INTJ': 7, - 'B-LST': 8, - 'I-LST': 9, - 'B-NP': 10, - 'I-NP': 11, - 'B-PP': 12, - 'I-PP': 13, - 'B-PRT': 14, - 'I-PRT': 15, - 'B-SBAR': 16, - 'I-SBAR': 17, - 'B-UCP': 18, - 'I-UCP': 19, - 'B-VP': 20, - 'I-VP': 21, - 'O': 22 + 'B-ADJP': 0, + 'I-ADJP': 1, + 'B-ADVP': 2, + 'I-ADVP': 3, + 'B-CONJP': 4, + 'I-CONJP': 5, + 'B-INTJ': 6, + 'I-INTJ': 7, + 'B-LST': 8, + 'I-LST': 9, + 'B-NP': 10, + 'I-NP': 11, + 'B-PP': 12, + 'I-PP': 13, + 'B-PRT': 14, + 'I-PRT': 15, + 'B-SBAR': 16, + 'I-SBAR': 17, + 'B-UCP': 18, + 'I-UCP': 19, + 'B-VP': 20, + 'I-VP': 21, + 'O': 22 } if __name__ == '__main__': @@ -273,16 +271,9 @@ if __name__ == '__main__': cutoff += [3] * len(patterns) oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR] oov_policy += [OOV_POLICY_IGNORE] * len(patterns) - dicts = create_dictionaries( - 'trainer/tests/train.txt', cutoff, oov_policy) + dicts = create_dictionaries('trainer/tests/train.txt', cutoff, oov_policy) dicts[2] = dict2 - gen_proto_file( - 'trainer/tests/train.txt', - dicts, - oov_policy, - 'trainer/tests/train_proto.bin') - gen_proto_file( - 'trainer/tests/test.txt', - dicts, - oov_policy, - 'trainer/tests/test_proto.bin') + gen_proto_file('trainer/tests/train.txt', dicts, oov_policy, + 'trainer/tests/train_proto.bin') + gen_proto_file('trainer/tests/test.txt', dicts, oov_policy, + 'trainer/tests/test_proto.bin') diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h index a0b5c2274b20fdbce76d021326f22b3181f3d9d1..cb657d219e55c1e349ffb77a88945085b4149c78 100644 --- a/paddle/trainer/tests/picojson.h +++ b/paddle/trainer/tests/picojson.h @@ -409,7 +409,8 @@ inline std::string value::to_str() const { case number_type: { char buf[256]; double tmp; - SNPRINTF(buf, sizeof(buf), + SNPRINTF(buf, + sizeof(buf), fabs(u_.number_) < (1ULL << 53) && modf(u_.number_, &tmp) == 0 ? 
"%.f" : "%.17g", @@ -532,7 +533,8 @@ void value::_serialize(Iter oi, int indent) const { ++indent; } for (object::const_iterator i = u_.object_->begin(); - i != u_.object_->end(); ++i) { + i != u_.object_->end(); + ++i) { if (i != u_.object_->begin()) { *oi++ = ','; } @@ -983,7 +985,9 @@ inline std::string parse(value& out, Iter& pos, const Iter& last) { } template -inline Iter _parse(Context& ctx, const Iter& first, const Iter& last, +inline Iter _parse(Context& ctx, + const Iter& first, + const Iter& last, std::string* err) { input in(first, last); if (!_parse(ctx, in) && err != NULL) { @@ -1003,7 +1007,9 @@ inline Iter _parse(Context& ctx, const Iter& first, const Iter& last, } template -inline Iter parse(value& out, const Iter& first, const Iter& last, +inline Iter parse(value& out, + const Iter& first, + const Iter& last, std::string* err) { default_parse_context ctx(&out); return _parse(ctx, first, last, err); @@ -1017,8 +1023,10 @@ inline std::string parse(value& out, const std::string& s) { inline std::string parse(value& out, std::istream& is) { std::string err; - parse(out, std::istreambuf_iterator(is.rdbuf()), - std::istreambuf_iterator(), &err); + parse(out, + std::istreambuf_iterator(is.rdbuf()), + std::istreambuf_iterator(), + &err); return err; } diff --git a/paddle/trainer/tests/test.txt b/paddle/trainer/tests/test.txt index 68e7f72e3d8d4ee4e592309cfc230fad24a810d4..3ad503b34f2e1a84c632d0894f180b5cf9ac550a 100644 --- a/paddle/trainer/tests/test.txt +++ b/paddle/trainer/tests/test.txt @@ -998,4 +998,3 @@ from IN B-PP Friday NNP B-NP 's POS B-NP Tokyo NNP I-NP - diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/trainer/tests/testPyDataWrapper.py index 49bd760f4e20e2a12e5686b3193bdba2895612e4..4607bec24e1fec6f8b9996eb32fe991dbbe3ed79 100644 --- a/paddle/trainer/tests/testPyDataWrapper.py +++ b/paddle/trainer/tests/testPyDataWrapper.py @@ -21,7 +21,10 @@ import json import string -@provider(slots=[SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1), IndexSlot(3)]) +@provider(slots=[ + SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1), + IndexSlot(3) +]) def processNonSequenceData(obj, filename): with open(filename, "rb") as f: for line in f: @@ -50,6 +53,7 @@ val_randomer = lambda: random.uniform(-1.0, 1.0) seq_count_randomer = lambda: random.randrange(1, SEQUENCE_LIMIT) str_count_randomer = lambda: random.randrange(1, STRING_LIMIT) + class IDRandomer(): # A random generator, return unique id def __init__(self): self.id_set = set() @@ -61,38 +65,57 @@ class IDRandomer(): # A random generator, return unique id return idx else: return self.__call__() + + # SparseValueSlot def sparse_value_creator(_): rand = IDRandomer() return [(rand(), val_randomer()) for _ in xrange(sparse_count_randomer())] + + sparse_value = map(sparse_value_creator, range(seq_count_randomer())) + # DenseSlot def dense_creator(_): return [val_randomer() for _ in xrange(SPARSE_ID_LIMIT)] + + dense = map(dense_creator, range(seq_count_randomer())) + # SparseNonValueSlot def sparse_creator(_): rand = IDRandomer() return [rand() for _ in xrange(sparse_count_randomer())] + + sparse_nonvalue = map(sparse_creator, range(seq_count_randomer())) # IndexSlot ids = [sparse_id_randomer() for _ in range(seq_count_randomer())] + # StringSlot -def random_str(size = 8, chars=string.ascii_letters + string.digits): +def random_str(size=8, chars=string.ascii_letters + string.digits): return ''.join(random.choice(chars) for _ in range(size)) + + strs = 
[random_str(str_count_randomer()) for _ in range(seq_count_randomer())] + def processSeqAndGenerateDataInit(obj, *args, **kwargs): obj.json_filename = kwargs.get("load_data_args", "test_data.json") -@provider(slots=[SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT), - SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT), - StringSlot(SPARSE_ID_LIMIT)], - use_seq=True, init_hook=processSeqAndGenerateDataInit) + +@provider( + slots=[ + SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT), + SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT), + StringSlot(SPARSE_ID_LIMIT) + ], + use_seq=True, + init_hook=processSeqAndGenerateDataInit) def processSeqAndGenerateData(obj, name): retv = [sparse_value, dense, sparse_nonvalue, ids, strs] # Write to protoseq. @@ -104,10 +127,15 @@ def processSeqAndGenerateData(obj, name): def processSubSeqAndGenerateDataInit(obj, *args, **kwargs): obj.json_filename = kwargs.get("load_data_args", "test_data.json") -@provider(slots=[SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT), - SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT), - StringSlot(SPARSE_ID_LIMIT)], - use_seq=True, init_hook=processSubSeqAndGenerateDataInit) + +@provider( + slots=[ + SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT), + SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT), + StringSlot(SPARSE_ID_LIMIT) + ], + use_seq=True, + init_hook=processSubSeqAndGenerateDataInit) def processSubSeqAndGenerateData(obj, name): retv_json = [sparse_value, dense, sparse_nonvalue, ids, strs] retv_wrapper = [[sparse_value], [dense], [sparse_nonvalue], [ids], [strs]] @@ -116,6 +144,7 @@ def processSubSeqAndGenerateData(obj, name): json.dump(retv_json, f) yield retv_wrapper + if __name__ == "__main__": pvd = processNonSequenceData("test.txt") print pvd.getNextBatch(100) diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp index 735c5a5b27d8189195be8a720158977edc5d8c9e..03312f9e470e0f8b01e229237d25a7ac8e088c5c 100644 --- a/paddle/trainer/tests/test_Compare.cpp +++ b/paddle/trainer/tests/test_Compare.cpp @@ -52,8 +52,8 @@ void calcGradient(bool useGpu, comData& Data) { vector& inArgs = dataBatch.getStreams(); trainer.getGradientMachine()->start(trainer.getConfig(), nullptr); for (int i = 0; i < 2; ++i) { - trainer.getGradientMachine()->forwardBackward(inArgs, &Data.outArgs, - PASS_TRAIN); + trainer.getGradientMachine()->forwardBackward( + inArgs, &Data.outArgs, PASS_TRAIN); } trainer.getGradientMachine()->finish(); } diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp index 311dd333a1b1638e75ca7aaf441c441d3cf54447..a7c6862ce3362556fa60cc3309445347476e7f33 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/trainer/tests/test_CompareSparse.cpp @@ -23,7 +23,7 @@ using namespace paddle; // NOLINT using namespace std; // NOLINT static const string& configFile1 = - "trainer/tests/sample_trainer_config_qb_rnn.conf"; + "trainer/tests/sample_trainer_config_qb_rnn.conf"; P_DECLARE_bool(use_gpu); P_DECLARE_string(config); @@ -38,8 +38,9 @@ P_DECLARE_bool(local); P_DECLARE_bool(use_old_updater); P_DECLARE_bool(parallel_nn); P_DECLARE_string(config_args); -P_DEFINE_double(max_diff_ratio, 0.0f, - "max diff ratio allowed for parameters value"); +P_DEFINE_double(max_diff_ratio, + 0.0f, + "max diff ratio allowed for parameters value"); int gNumDevices = 0; @@ -53,8 +54,7 @@ std::vector trainerOnePassTest(const string& configFile, 
FLAGS_config_args = sparseUpdate ? "sparse_update=1" : "sparse_update=0"; LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount - << " configFile=" << configFile - << " sparseUpdate=" << sparseUpdate; + << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate; srand(FLAGS_seed); *ThreadLocalRand::getSeed() = FLAGS_seed; ThreadLocalRandomEngine::get().seed(FLAGS_seed); @@ -91,8 +91,12 @@ std::vector& getDenseParameters() { return denseParameters; } -void checkBuffer(real* A, const char* desA, real* B, const char* desB, - size_t len, double maxDiffRatio) { +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + double maxDiffRatio) { double maxDiff = 0; double maxValue = 0; for (size_t i = 0; i < len; ++i) { @@ -101,10 +105,8 @@ void checkBuffer(real* A, const char* desA, real* B, const char* desB, maxDiff = std::max(maxDiff, diff); } EXPECT_LE(maxDiff / maxValue, maxDiffRatio); - LOG(INFO) << " maxDiff=" << maxDiff - << " maxValue=" << maxValue - << " maxDiff/maxValue=" << maxDiff / maxValue - << "\n\n"; + LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue + << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n"; } void compareValue(const vector& parametersA, @@ -125,8 +127,12 @@ void compareValue(const vector& parametersA, LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), "para_A", paraB.getData(), "para_B", - paraA.getSize(), maxDiffRatio); + checkBuffer(paraA.getData(), + "para_A", + paraB.getData(), + "para_B", + paraA.getSize(), + maxDiffRatio); } } @@ -172,8 +178,7 @@ TEST(compareSparse, multiGradientMachine) { if (useGpu) continue; #endif FLAGS_parallel_nn = useGpu; - LOG(INFO) << " local=" << local - << " useGpu=" << useGpu; + LOG(INFO) << " local=" << local << " useGpu=" << useGpu; int trainerCount = useGpu ? 
numGpu : 2; std::vector parameters = trainerOnePassTest(configFile1, true, trainerCount, useGpu); @@ -197,8 +202,7 @@ TEST(compareSparse, NeuralNetwork) { if (useGpu) continue; #endif FLAGS_parallel_nn = useGpu; - LOG(INFO) << " local=" << local - << " useGpu=" << useGpu; + LOG(INFO) << " local=" << local << " useGpu=" << useGpu; int trainerCount = 1; std::vector parameters = trainerOnePassTest(configFile1, true, trainerCount, useGpu); diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp index d1057f2aeabd3bcc41330f1cfe72227de3837140..81320da6ac9c6e880b936a6b1e2650796bb50ff7 100644 --- a/paddle/trainer/tests/test_CompareTwoNets.cpp +++ b/paddle/trainer/tests/test_CompareTwoNets.cpp @@ -32,10 +32,12 @@ P_DECLARE_string(nics); P_DEFINE_string(config_file_a, "", "config of one network to compare"); P_DEFINE_string(config_file_b, "", "config of another network to compare"); -P_DEFINE_bool(need_high_accuracy, false, +P_DEFINE_bool(need_high_accuracy, + false, "whether need to run in double accuracy"); P_DEFINE_double( - max_diff_ratio, 0.0f, + max_diff_ratio, + 0.0f, "max diff ratio allowed for outputs and parameters (value/gradient)"); P_DECLARE_bool(thread_local_rand_use_global_seed); P_DECLARE_int32(seed); @@ -71,14 +73,18 @@ void calcGradient(ComData& data, const string configFile) { vector& inArgs = dataBatch.getStreams(); trainer.getGradientMachine()->start(trainer.getConfig(), nullptr); - trainer.getGradientMachine()->forwardBackward(inArgs, &data.outArgs, - PASS_TRAIN); + trainer.getGradientMachine()->forwardBackward( + inArgs, &data.outArgs, PASS_TRAIN); trainer.getGradientMachine()->finish(); } -void checkBuffer(real* A, const char* desA, real* B, const char* desB, - size_t len, size_t width = 1) { +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + size_t width = 1) { int nNum = 0; real maxVal = 0; for (size_t i = 0; i < len; ++i) { @@ -90,8 +96,8 @@ void checkBuffer(real* A, const char* desA, real* B, const char* desB, maxDiff = std::max(maxDiff, diff); if (diff > maxVal * FLAGS_max_diff_ratio) { nNum++; - VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] - << " " << desB << " : " << B[i] << " diff=" << diff; + VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << " " + << desB << " : " << B[i] << " diff=" << diff; } } EXPECT_EQ(0, nNum); @@ -114,8 +120,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n--------------------------------" << " Check Network Output_" << i << ":" << " -------------------------------------\n"; - checkBuffer(matA.getData(), "network A output", matB.getData(), - "network B output", matA.getElementCnt(), matA.getWidth()); + checkBuffer(matA.getData(), + "network A output", + matB.getData(), + "network B output", + matA.getElementCnt(), + matA.getWidth()); } vector& parametersA = comDataA.parameters; @@ -136,7 +146,10 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), "Network A", paraB.getData(), "Network B", + checkBuffer(paraA.getData(), + "Network A", + paraB.getData(), + "Network B", paraA.getSize()); CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); @@ -144,7 +157,10 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() << " ; size : " 
<< gradA.getSize() << " -----------"; - checkBuffer(gradA.getData(), "Network A", gradB.getData(), "Network B", + checkBuffer(gradA.getData(), + "Network A", + gradB.getData(), + "Network B", gradA.getSize()); } } diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp index 2c44da43fcd698808805480599f2c6223d120f8d..a52f2fa7e7708925dbcb173167b17bbfef93a4da 100644 --- a/paddle/trainer/tests/test_CompareTwoOpts.cpp +++ b/paddle/trainer/tests/test_CompareTwoOpts.cpp @@ -32,11 +32,13 @@ P_DECLARE_string(nics); P_DEFINE_string(config_file_a, "", "config of one network to compare"); P_DEFINE_string(config_file_b, "", "config of another network to compare"); -P_DEFINE_bool(need_high_accuracy, true, +P_DEFINE_bool(need_high_accuracy, + true, "whether need to run in double accuracy (recommended)"); P_DEFINE_double( - max_diff_ratio, 0.0f, - "max diff ratio allowed for outputs and parameters (value/gradient)"); + max_diff_ratio, + 0.0f, + "max diff ratio allowed for outputs and parameters (value/gradient)"); struct ComData { vector outArgs; @@ -62,8 +64,12 @@ void calcGradient(ComData& data, const string configFile) { trainer.train(); } -void checkBuffer(real* A, const char* desA, real* B, const char* desB, - size_t len, size_t width = 1) { +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + size_t width = 1) { int nNum = 0; for (size_t i = 0; i < len; ++i) { real diff = fabs(A[i] - B[i]); @@ -94,8 +100,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n--------------------------------" << " Check Network Output_" << i << ":" << " -------------------------------------\n"; - checkBuffer(matA.getData(), "network A output", matB.getData(), - "network B output", matA.getElementCnt(), matA.getWidth()); + checkBuffer(matA.getData(), + "network A output", + matB.getData(), + "network B output", + matA.getElementCnt(), + matA.getWidth()); } vector& parametersA = comDataA.parameters; @@ -116,7 +126,10 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), "Network A", paraB.getData(), "Network B", + checkBuffer(paraA.getData(), + "Network A", + paraB.getData(), + "Network B", paraA.getSize()); CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); @@ -124,7 +137,10 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() << " ; size : " << gradA.getSize() << " -----------"; - checkBuffer(gradA.getData(), "Network A", gradB.getData(), "Network B", + checkBuffer(gradA.getData(), + "Network A", + gradB.getData(), + "Network B", gradA.getSize()); } } diff --git a/paddle/trainer/tests/test_Prediction.cpp b/paddle/trainer/tests/test_Prediction.cpp index 1c7f93666b8dfd0307797fc5e20b05b355c75a38..6db33439b319e84e99e828246ca672fa8274e4bf 100644 --- a/paddle/trainer/tests/test_Prediction.cpp +++ b/paddle/trainer/tests/test_Prediction.cpp @@ -20,7 +20,8 @@ limitations under the License. 
*/ P_DECLARE_string(config); P_DECLARE_string(config_args); -P_DEFINE_string(merger, "./paddle_merge_model", +P_DEFINE_string(merger, + "./paddle_merge_model", "path to paddle_merge_model binary"); using namespace paddle; // NOLINT @@ -120,8 +121,10 @@ TEST(GradientMachine, create) { rand() / (real)RAND_MAX; // NOLINT TODO(yuyang): use rand_r } } - MatrixPtr input = Matrix::create(numSamples, inputDim, - /* trans */ false, FLAGS_use_gpu); + MatrixPtr input = Matrix::create(numSamples, + inputDim, + /* trans */ false, + FLAGS_use_gpu); input->copyFrom(cpuInput); inArgs[0].value = input; gradientMachine1->forward(inArgs, &outArgs, PASS_TEST); @@ -139,8 +142,8 @@ TEST(GradientMachine, create) { gradientMachine3->forward(inArgs, &outArgs2, PASS_TEST); out2.copyFrom(*outArgs2[0].value); - checkBuffer(out1.getData(), out2.getData(), - out2.getHeight() * out2.getWidth()); + checkBuffer( + out1.getData(), out2.getData(), out2.getHeight() * out2.getWidth()); cmd = " rm -rf " + modelDir + "/*"; LOG(INFO) << "cmd " << cmd; diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp index 49332b877db646fdcd7cd3b11ec96bac64dd2d6d..e53291386c6b553e26248dae75e321d4b7246823 100644 --- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp +++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef PADDLE_NO_PYTHON #include #include diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp index ad2a715ef89c6f4c4b509e1a8b816699b709c59d..900c05af851aede67253535228d75d211dee6a85 100644 --- a/paddle/trainer/tests/test_Trainer.cpp +++ b/paddle/trainer/tests/test_Trainer.cpp @@ -33,7 +33,9 @@ P_DECLARE_string(config); P_DECLARE_int32(gpu_id); P_DECLARE_bool(allow_only_one_model_on_one_gpu); -void checkGradientTest(const string& configFile, bool useGpu, bool parallel, +void checkGradientTest(const string& configFile, + bool useGpu, + bool parallel, int trainerCount = 1) { FLAGS_use_gpu = useGpu; FLAGS_parallel_nn = parallel; @@ -94,7 +96,7 @@ TEST(checkGradient, multi) { TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); } TEST(checkGradient, chunk) { -#if defined(__APPLE__) || defined (__OSX__) +#if defined(__APPLE__) || defined(__OSX__) EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py")); #else EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py")); diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp index 4554b94485f99f1fea1ebef8f5ae8a59b630d106..da2954d1664fc18cb78e6217807ff9799d220f7f 100644 --- a/paddle/trainer/tests/test_TrainerOnePass.cpp +++ b/paddle/trainer/tests/test_TrainerOnePass.cpp @@ -41,12 +41,13 @@ public: } }; - - int gNumDevices = 0; -void trainerOnePassTest(const string& configFile, bool useGpu, bool parallel, - int trainerCount = 1, double averageWindow = 0.0f, +void trainerOnePassTest(const string& configFile, + bool useGpu, + bool parallel, + int trainerCount = 1, + double averageWindow = 0.0f, bool doAverageInCpu = false) { FLAGS_use_gpu = useGpu; FLAGS_parallel_nn = parallel; @@ -164,13 +165,13 @@ double checkRemoteParameterUpdater(TrainerForTest& trainer) { const vector& inArgs = dataBatch.getStreams(); vector outArgs; - UpdateCallback updateCallback = - [parameterUpdater, parameterCheck](Parameter* 
para) { - parameterCheck[para->getID()] - ->getBuf(PARAMETER_GRADIENT) - ->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); - parameterUpdater->update(para); - }; + UpdateCallback updateCallback = [parameterUpdater, + parameterCheck](Parameter* para) { + parameterCheck[para->getID()] + ->getBuf(PARAMETER_GRADIENT) + ->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); + parameterUpdater->update(para); + }; parameterUpdater->startPass(); parameterUpdaterCheck->startPass(); @@ -178,8 +179,8 @@ double checkRemoteParameterUpdater(TrainerForTest& trainer) { for (int i = 0; i < config.opt_config().num_batches_per_get_parameter() * 2; ++i) { PassType passType = parameterUpdater->startBatch(actualBatchSize); - gradientMachine->forwardBackward(inArgs, &outArgs, passType, - updateCallback); + gradientMachine->forwardBackward( + inArgs, &outArgs, passType, updateCallback); parameterUpdater->finishBatch(0); parameterUpdaterCheck->startBatch(actualBatchSize); @@ -191,7 +192,7 @@ double checkRemoteParameterUpdater(TrainerForTest& trainer) { double sum = 0.0f; for (size_t i = 0; i != parameters.size(); ++i) { - real* v1, *v2; + real *v1, *v2; CpuVector trainerPara(parameters[i]->getSize()); trainerPara.copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE)); if (!FLAGS_use_gpu) { @@ -217,8 +218,10 @@ double checkRemoteParameterUpdater(TrainerForTest& trainer) { return sum; } -void checkRemoteParameterUpdaterTest(const string& configFile, bool useGpu, - bool parallel, int trainerCount = 1, +void checkRemoteParameterUpdaterTest(const string& configFile, + bool useGpu, + bool parallel, + int trainerCount = 1, bool useOldUpdater = false, int num_batches_per_get_parameter = 1) { FLAGS_use_gpu = useGpu; diff --git a/paddle/trainer/tests/test_gen_dict.txt b/paddle/trainer/tests/test_gen_dict.txt index 91a84146180e0135b37ee8c76508e588412c2870..1000f90057824bf665b32fe47a7f78e7a0077e7b 100644 --- a/paddle/trainer/tests/test_gen_dict.txt +++ b/paddle/trainer/tests/test_gen_dict.txt @@ -6,4 +6,4 @@ 5 6 7 -8 \ No newline at end of file +8 diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp index fcee318d16e00428bda447e80575dbf1b027102d..49e8a97ad057246addf29274dd9c436d1481de91 100644 --- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp +++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp @@ -51,8 +51,10 @@ void checkOutput(const string& expRetFile) { } } -void prepareInArgs(vector& inArgs, const size_t batchSize, - bool useGpu, bool hasSubseq) { +void prepareInArgs(vector& inArgs, + const size_t batchSize, + bool useGpu, + bool hasSubseq) { inArgs.clear(); // sentence id Argument sentId; @@ -87,7 +89,9 @@ void prepareInArgs(vector& inArgs, const size_t batchSize, inArgs.emplace_back(dummyInput); } -void testGeneration(const string& configFile, bool useGpu, bool hasSubseq, +void testGeneration(const string& configFile, + bool useGpu, + bool hasSubseq, const string& expRetFile) { FLAGS_use_gpu = useGpu; auto config = std::make_shared(configFile); @@ -114,8 +118,10 @@ TEST(RecurrentGradientMachine, test_generation) { #else const auto useGpuConfs = {true, false}; #endif - auto testGen = [&](const string& configFile, bool hasSubseq, - const string& expRetFile, bool beam_search) { + auto testGen = [&](const string& configFile, + bool hasSubseq, + const string& expRetFile, + bool beam_search) { FLAGS_config_args = beam_search ? 
"beam_search=1" : "beam_search=0"; for (auto useGpu : useGpuConfs) { testGeneration(configFile, useGpu, hasSubseq, expRetFile); @@ -126,7 +132,9 @@ TEST(RecurrentGradientMachine, test_generation) { // In hierarchical RNN, beam search and one way search are only in inner-RNN, // outer-RNN will concat the generated inner-results (first for beam search) // from inner-RNN. Thus, they have the same outer-results. - testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", + testGen(NEST_CONFIG_FILE, + true, + expectFile + ".nest", false); // no beam search testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true); // beam search } diff --git a/paddle/trainer/tests/train.txt b/paddle/trainer/tests/train.txt index 8d9b15dcf5bba477c2c6f732806a47b0aa6e098a..2313aee987ba71ba7ea779d3cf7705478e7fbde2 100644 --- a/paddle/trainer/tests/train.txt +++ b/paddle/trainer/tests/train.txt @@ -4998,4 +4998,3 @@ However RB B-ADVP the DT B-NP disclosure NN I-NP of IN B-PP - diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp index cbc738a839d08a83ff8498315e29427697c930c7..82c5b84e5960753d5ec4c35bd667a8e43269e9e1 100644 --- a/paddle/utils/BarrierStat.cpp +++ b/paddle/utils/BarrierStat.cpp @@ -20,19 +20,21 @@ limitations under the License. */ #include "paddle/utils/BarrierStat.h" #include "paddle/utils/Flags.h" -P_DEFINE_bool(log_barrier_abstract, true, +P_DEFINE_bool(log_barrier_abstract, + true, "if true, show abstract of barrier performance"); -P_DEFINE_int32(log_barrier_lowest_nodes, 5, +P_DEFINE_int32(log_barrier_lowest_nodes, + 5, "how many lowest node will be logged"); -P_DEFINE_bool(log_barrier_show_log, false, // for performance tuning insight +P_DEFINE_bool(log_barrier_show_log, + false, // for performance tuning insight "if true, always show barrier abstract even with little gap"); namespace paddle { -std::ostream &operator<<(std::ostream &output, BarrierStatBase &stat) { +std::ostream &operator<<(std::ostream &output, const BarrierStatBase &stat) { if (FLAGS_log_barrier_abstract) { - std::lock_guard guard( - const_cast(stat).lock_); + std::lock_guard guard(stat.lock_); stat.showAbstract(output); } return output; @@ -136,7 +138,7 @@ void BarrierEndStat::reset(bool clearRawData) { totAbstract_.minDelta = UINT64_MAX; } -void BarrierEndStat::showAbstract(std::ostream &output) { +void BarrierEndStat::showAbstract(std::ostream &output) const { // do not support the case "<=2 pserver" if (numConnThreads_ <= 2 || !totSamples_) { return; @@ -144,7 +146,8 @@ void BarrierEndStat::showAbstract(std::ostream &output) { // duplicate freq info std::vector outputAbstract = abstract_; - std::sort(outputAbstract.begin(), outputAbstract.end(), + std::sort(outputAbstract.begin(), + outputAbstract.end(), [](const struct Abstract &a, const struct Abstract &b) { return a.freq > b.freq; }); @@ -272,7 +275,7 @@ void BarrierDeltaStat::reset(bool clearRawData) { totAbstract_.minDelta = UINT64_MAX; } -void BarrierDeltaStat::showAbstract(std::ostream &output) { +void BarrierDeltaStat::showAbstract(std::ostream &output) const { // do not support the case "<=2 pserver" if (numConnThreads_ <= 2 || !totSamples_) { return; @@ -280,7 +283,8 @@ void BarrierDeltaStat::showAbstract(std::ostream &output) { // duplicate freq info std::vector outputAbstract = abstract_; - std::sort(outputAbstract.begin(), outputAbstract.end(), + std::sort(outputAbstract.begin(), + outputAbstract.end(), [](const struct Abstract &a, const struct Abstract &b) { return a.freq > b.freq; }); diff --git a/paddle/utils/BarrierStat.h 
b/paddle/utils/BarrierStat.h index 22d6cc9bcec5ec4e655a216ddf5873e47b86fa38..661340ad275365ab567175d4280abdab18444fac 100644 --- a/paddle/utils/BarrierStat.h +++ b/paddle/utils/BarrierStat.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -218,11 +217,12 @@ public: } protected: - virtual void showAbstract(std::ostream &output) {} - friend std::ostream &operator<<(std::ostream &output, BarrierStatBase &stat); + virtual void showAbstract(std::ostream &output) const {} + friend std::ostream &operator<<(std::ostream &output, + const BarrierStatBase &stat); protected: - std::mutex lock_; + mutable std::mutex lock_; std::mutex abstractLock_; // see note on updaterStat // each freqency for each barrier trainer std::vector abstract_; @@ -262,7 +262,7 @@ protected: * log_barrier_abstract, log_barrier_lowest_nodes, log_barrier_threshold * control details. */ - virtual void showAbstract(std::ostream &output); + virtual void showAbstract(std::ostream &output) const; private: std::unique_ptr timeVector_; @@ -286,7 +286,7 @@ public: virtual bool checkPassBarrier() { return timeVector_->empty(); } protected: - virtual void showAbstract(std::ostream &outPut); + virtual void showAbstract(std::ostream &outPut) const; private: // store delta time in uint64_t, eg BP time of all trainers @@ -304,44 +304,44 @@ private: // nodes. // end barrier -#define __REGISTER_BARRIER_TIMER_SERVER(set, statName, numConnThreads, \ - trainerId, ...) \ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - struct timeval cur; \ - gettimeofday(&cur, nullptr); \ - __stat->updateStat(cur, trainerId); \ - } \ +#define __REGISTER_BARRIER_TIMER_SERVER( \ + set, statName, numConnThreads, trainerId, ...) \ + do { \ + if (numConnThreads > 2) { \ + std::string internalName = \ + std::string(statName) + std::string(__VA_ARGS__); \ + BarrierStatPtr __stat = \ + (set).getStat(numConnThreads, internalName, BARRIER_END); \ + struct timeval cur; \ + gettimeofday(&cur, nullptr); \ + __stat->updateStat(cur, trainerId); \ + } \ } while (0); // end barrier with user-defined timer -#define __REGISTER_BARRIER_TIMER_SERVER_SET(set, statName, numConnThreads, \ - trainerId, cur, ...) \ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - __stat->updateStat(cur, trainerId); \ - } \ +#define __REGISTER_BARRIER_TIMER_SERVER_SET( \ + set, statName, numConnThreads, trainerId, cur, ...) \ + do { \ + if (numConnThreads > 2) { \ + std::string internalName = \ + std::string(statName) + std::string(__VA_ARGS__); \ + BarrierStatPtr __stat = \ + (set).getStat(numConnThreads, internalName, BARRIER_END); \ + __stat->updateStat(cur, trainerId); \ + } \ } while (0); // delta barrier -#define __REGISTER_BARRIER_DELTA_SERVER_SET(set, statName, numConnThreads, \ - trainerId, delta, ...) 
\ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_DELTA); \ - __stat->updateStat(delta, trainerId); \ - } \ +#define __REGISTER_BARRIER_DELTA_SERVER_SET( \ + set, statName, numConnThreads, trainerId, delta, ...) \ + do { \ + if (numConnThreads > 2) { \ + std::string internalName = \ + std::string(statName) + std::string(__VA_ARGS__); \ + BarrierStatPtr __stat = \ + (set).getStat(numConnThreads, internalName, BARRIER_DELTA); \ + __stat->updateStat(delta, trainerId); \ + } \ } while (0); // check end barrier @@ -373,10 +373,10 @@ private: */ // try to capture which trainer is slowest node in sync-sgd at pserver. -#define REGISTER_SLOW_NODES_PROBE(set, statName, numConnThreads, trainerId, \ - ...) \ - __REGISTER_BARRIER_TIMER_SERVER((set), statName, numConnThreads, trainerId, \ - __VA_ARGS__) +#define REGISTER_SLOW_NODES_PROBE( \ + set, statName, numConnThreads, trainerId, ...) \ + __REGISTER_BARRIER_TIMER_SERVER( \ + (set), statName, numConnThreads, trainerId, __VA_ARGS__) // try to check if all threads or trainers have passed barriers for data // accuracy. #define CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \ @@ -384,12 +384,12 @@ private: #ifdef PADDLE_DISABLE_TIMER -#define REGISTER_BARRIER_TIMER_SERVER(set, statName, numConnThreads, \ - trainerId, ...) -#define REGISTER_BARRIER_TIMER_SERVER_SET(set, statName, numConnThreads, \ - trainerId, cur, ...) -#define REGISTER_BARRIER_DELTA_SERVER_SET(set, statName, numConnThreads, \ - trainerId, cur, ...) +#define REGISTER_BARRIER_TIMER_SERVER( \ + set, statName, numConnThreads, trainerId, ...) +#define REGISTER_BARRIER_TIMER_SERVER_SET( \ + set, statName, numConnThreads, trainerId, cur, ...) +#define REGISTER_BARRIER_DELTA_SERVER_SET( \ + set, statName, numConnThreads, trainerId, cur, ...) #else @@ -397,10 +397,10 @@ private: * sensing barrier time distribution for all parallelization threads. * it provides low API for slow node check(REGISTER_SLOW_NODES_PROBE) */ -#define REGISTER_BARRIER_TIMER_SERVER(set, statName, numConnThreads, \ - trainerId, ...) \ - __REGISTER_BARRIER_TIMER_SERVER((set), statName, numConnThreads, trainerId, \ - __VA_ARGS__) +#define REGISTER_BARRIER_TIMER_SERVER( \ + set, statName, numConnThreads, trainerId, ...) \ + __REGISTER_BARRIER_TIMER_SERVER( \ + (set), statName, numConnThreads, trainerId, __VA_ARGS__) /* * sensing barrier time distribution for all parallelization threads. @@ -409,18 +409,18 @@ private: * time distribution * for receiving data. */ -#define REGISTER_BARRIER_TIMER_SERVER_SET(set, statName, numConnThreads, \ - trainerId, cur, ...) \ - __REGISTER_BARRIER_TIMER_SERVER_SET((set), statName, numConnThreads, \ - trainerId, cur, __VA_ARGS__) +#define REGISTER_BARRIER_TIMER_SERVER_SET( \ + set, statName, numConnThreads, trainerId, cur, ...) \ + __REGISTER_BARRIER_TIMER_SERVER_SET( \ + (set), statName, numConnThreads, trainerId, cur, __VA_ARGS__) // try to capture time delta from all trainers, such as forwardBackward time // which implies // computation fluctuation -#define REGISTER_BARRIER_DELTA_SERVER_SET(set, statName, numConnThreads, \ - trainerId, delta, ...) \ - __REGISTER_BARRIER_DELTA_SERVER_SET((set), statName, numConnThreads, \ - trainerId, delta, __VA_ARGS__) +#define REGISTER_BARRIER_DELTA_SERVER_SET( \ + set, statName, numConnThreads, trainerId, delta, ...) 
\ + __REGISTER_BARRIER_DELTA_SERVER_SET( \ + (set), statName, numConnThreads, trainerId, delta, __VA_ARGS__) #endif // DISABLE_TIMER } // namespace paddle diff --git a/paddle/utils/ClassRegistrar.h b/paddle/utils/ClassRegistrar.h index 0c7747ac77a118e794a4b0d46d10b9cc1a2d15f5..ee58ccb2ad42ac9e5380e3a80fe0044965eab083 100644 --- a/paddle/utils/ClassRegistrar.h +++ b/paddle/utils/ClassRegistrar.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -63,16 +62,16 @@ public: // Create a class instance of type @type using args BaseClass* createByType(const std::string& type, CreateArgs... args) { ClassCreator creator; - CHECK(mapGet(type, creatorMap_, &creator)) - << "Unknown class type: " << type; + CHECK(mapGet(type, creatorMap_, &creator)) << "Unknown class type: " + << type; return creator(args...); } template inline void forEachType(T callback) { - for (auto it = creatorMap_.begin(); it != creatorMap_.end(); ++it) { - callback(it->first); - } + for (auto it = creatorMap_.begin(); it != creatorMap_.end(); ++it) { + callback(it->first); + } } protected: diff --git a/paddle/utils/CommandLineParser.cpp b/paddle/utils/CommandLineParser.cpp index 8edcad5747b419387a933b74a2b477ea82382054..307e304bb03d79fa9a640ece9c84845919b0d9c4 100644 --- a/paddle/utils/CommandLineParser.cpp +++ b/paddle/utils/CommandLineParser.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "CommandLineParser.h" #ifndef PADDLE_USE_GFLAGS #include "paddle/utils/StringUtil.h" @@ -31,7 +30,6 @@ static constexpr int kStatusOK = 0; static constexpr int kStatusInvalid = 1; static constexpr int kStatusNotFound = 2; - /** * \brief: Convert a string to any type value. * @@ -48,13 +46,16 @@ template <> bool StringToValue(const std::string& content, bool* value) { std::string tmp = content; - std::transform(tmp.begin(), tmp.end(), tmp.begin(), [](char in) -> char { - if (in <= 'Z' && in >= 'A') { - return in - ('Z' - 'z'); - } else { - return in; - } - }); // tolower. + std::transform(tmp.begin(), + tmp.end(), + tmp.begin(), + [](char in) -> char { + if (in <= 'Z' && in >= 'A') { + return in - ('Z' - 'z'); + } else { + return in; + } + }); // tolower. if (tmp == "true" || tmp == "1") { *value = true; @@ -121,20 +122,16 @@ int ParseArgument(const std::string& argument, std::string* extraInfo) { * parse '--flag_name', '-flag_name' as true; '--noflag_name', '-noflag_name' as * false */ -static int ParseBoolArgumentExtra( - const std::string& argument, std::string* extraInfo) { +static int ParseBoolArgumentExtra(const std::string& argument, + std::string* extraInfo) { (void)(extraInfo); // unused extraInfo, just make api same. //! @warning: The order and content of prefixes is DESIGNED for parsing //! command line. The length of prefixes are 1, 2, 3, 4. The parse logic takes //! use of this fact. DO NOT CHANGE IT without reading how to parse command //! below. 
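[Editor's note] The warning above stresses that the prefix table is order-sensitive; the following standalone sketch (parseBoolFlag is hypothetical, not part of this patch) shows one way such prefix matching can resolve a boolean flag. Trying the longer "no" prefixes first avoids mistaking "--noname" for the positive form:

#include <string>
#include <utility>
#include <vector>

// Hypothetical helper: matches "--name"/"-name" as true and
// "--noname"/"-noname" as false.
static bool parseBoolFlag(const std::string& arg,
                          const std::string& name,
                          bool* value) {
  static const std::vector<std::pair<std::string, bool>> prefixes = {
      {"--no", false}, {"-no", false}, {"--", true}, {"-", true}};
  for (const auto& p : prefixes) {
    if (arg == p.first + name) {
      *value = p.second;
      return true;
    }
  }
  return false;  // this argument is not the flag `name`
}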
- static const std::vector > prefixes = { - {"-", true}, - {"--", true}, - {"-no", false}, - {"--no", false} - }; + static const std::vector> prefixes = { + {"-", true}, {"--", true}, {"-no", false}, {"--no", false}}; for (flags_internal::CommandLineFlagRegistry::Command& command : flags_internal::CommandLineFlagRegistry::Instance()->commands) { @@ -153,7 +150,6 @@ static int ParseBoolArgumentExtra( return kStatusNotFound; } - /** * \brief: Print command line arguments' usage with type T. */ @@ -170,12 +166,9 @@ static void PrintTypeUsage() { } } -template +template static void PrintTypeUsages() { - int unused[] = { - 0, - (PrintTypeUsage(), 0) ... - }; + int unused[] = {0, (PrintTypeUsage(), 0)...}; (void)(unused); } /** @@ -190,7 +183,8 @@ static void PrintUsageAndExit(const char* argv0) { /** * \brief: Print the error flags, usage, and exit. */ -static void PrintParseError(const std::string& name, const char* actualInput, +static void PrintParseError(const std::string& name, + const char* actualInput, const char* arg0) { std::cerr << "Parse command flag " << name << " error! User input is " << actualInput << std::endl; @@ -211,7 +205,7 @@ void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) { PrintParseError(extra, argv[i], argv[0]); \ } - ParseArgumentWithType(bool); // NOLINT + ParseArgumentWithType(bool); // NOLINT ParseArgumentWithType(int32_t); ParseArgumentWithType(double); // NOLINT ParseArgumentWithType(int64_t); diff --git a/paddle/utils/CommandLineParser.h b/paddle/utils/CommandLineParser.h index d18675ffa30d7f36ee470c35a93e522c68bbfdda..c46567913e253bdda645f129449773040c0ec93d 100644 --- a/paddle/utils/CommandLineParser.h +++ b/paddle/utils/CommandLineParser.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #ifndef PADDLE_USE_GFLAGS #include "DisableCopy.h" @@ -72,7 +71,8 @@ struct CommandLineFlagRegister { * \param [inout] val: The command line argument instance, FLAGS_xxx. * \param [in] desc: The command line helper message. */ - CommandLineFlagRegister(const std::string& name, T* val, + CommandLineFlagRegister(const std::string& name, + T* val, const std::string desc) { CommandLineFlagRegistry::Instance()->commands.push_back( {name, val, desc, *val}); @@ -83,7 +83,8 @@ struct CommandLineFlagRegister { * \brief: Define a command line arguments. * * \param type: The variable type, such as int, double, etc. - * \param name: The variable name. The command line argument is '--name', the variable + * \param name: The variable name. The command line argument is '--name', the + *variable *is 'FLAGS_name' * \param default_value: The default value of command line argument. * \param text: The description in command line argument. diff --git a/paddle/utils/CompilerMacros.h b/paddle/utils/CompilerMacros.h new file mode 100644 index 0000000000000000000000000000000000000000..4236d750c4d8bf722fdf3e371dc95b2d9aa8223d --- /dev/null +++ b/paddle/utils/CompilerMacros.h @@ -0,0 +1,17 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define ATTR_NORETURN __attribute__((noreturn)) diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/utils/CustomStackTrace.cpp index 232a478ecd93a7dcb7da7b02a5a1af37a1d1bc43..8740fe662ea21ce93c7c0d9505cdeb75975b3020 100644 --- a/paddle/utils/CustomStackTrace.cpp +++ b/paddle/utils/CustomStackTrace.cpp @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "CustomStackTrace.h" #include "CommandLineParser.h" #include -P_DEFINE_bool(layer_stack_error_only_current_thread, +P_DEFINE_bool( + layer_stack_error_only_current_thread, true, "Dump current thread or whole process layer stack when signal error " "occurred. true means only dump current thread layer stack"); @@ -33,21 +33,23 @@ void installLayerStackTracer() { if (!gLayerStackTrace.empty()) { size_t curTid = -1UL; std::hash hasher; - gLayerStackTrace.dump([&curTid, &hasher](std::thread::id tid, - bool* isForwarding, - const std::string& layerName) { - if (curTid != hasher(tid)) { - if (curTid != -1UL) { - std::cerr << std::endl; - } - curTid = hasher(tid); - std::cerr << "Thread [" << tid << "] "; - if (isForwarding) { - std::cerr << (*isForwarding ? "Forwarding ": "Backwarding "); - } - } - std::cerr << layerName << ", "; - }, FLAGS_layer_stack_error_only_current_thread); + gLayerStackTrace.dump( + [&curTid, &hasher](std::thread::id tid, + bool* isForwarding, + const std::string& layerName) { + if (curTid != hasher(tid)) { + if (curTid != -1UL) { + std::cerr << std::endl; + } + curTid = hasher(tid); + std::cerr << "Thread [" << tid << "] "; + if (isForwarding) { + std::cerr << (*isForwarding ? "Forwarding " : "Backwarding "); + } + } + std::cerr << layerName << ", "; + }, + FLAGS_layer_stack_error_only_current_thread); std::cerr << std::endl; } std::cerr.write(data, sz); diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h index 774c4db2b9be40c38286ef1248bf77746949fd6b..878e14eb5fcf870bf6c29758a1b9a297c13ce730 100644 --- a/paddle/utils/CustomStackTrace.h +++ b/paddle/utils/CustomStackTrace.h @@ -24,13 +24,13 @@ limitations under the License. */ namespace paddle { /** - * A ThreadLocal stack for tracing train/test process. - * (More details of ThreadLocal can be find + * A ThreadLocal stack for tracing train/test process. + * (More details of ThreadLocal can be find * in the comments of ThreadLocal class.) - * + * * For example. * @code{.cpp} - * + * * paddle::CustomStackTrace stack; * for (auto& layer : layers){ * stack.push(layer->getName()); @@ -48,7 +48,7 @@ namespace paddle { * @endcode */ template -class CustomStackTrace{ +class CustomStackTrace { public: /** * @brief Pop out an item from the top of the stack if item == top. @@ -87,7 +87,6 @@ public: return true; } - /** * @brief DumpCallback Type. It will be invoked many times by dump method. * @@ -96,8 +95,8 @@ public: * The third parameter is the item in stack. 
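[Editor's note] A usage sketch of the dump callback documented above; the layer name and the only-current-thread flag are illustrative, and the two-argument dump() call mirrors the one in CustomStackTrace.cpp earlier in this patch:

#include <iostream>
#include <thread>

void dumpLayerTrace(paddle::CustomStackTrace<std::string>& trace) {
  trace.dump(
      [](std::thread::id tid, bool* isForwarding, const std::string& layer) {
        // tid: owning thread; isForwarding may be null; layer: stack item.
        std::cerr << "[" << tid << "] " << layer
                  << (isForwarding && *isForwarding ? " (forward)" : "")
                  << "\n";
      },
      /* onlyCurrentThread = */ true);
}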
*/ typedef std::function DumpCallback; + bool* /*isPushing*/, + const T& /*item*/)> DumpCallback; /** * Dump all thread stack, and all stack will be cleared. @@ -160,25 +159,23 @@ private: * @brief Get thread local stack reference. */ std::stack& stack() { - return this->getThreadLocal(this->logStack_, - this->stackBuffers_); + return this->getThreadLocal(this->logStack_, this->stackBuffers_); } /** * @brief Get thread local pushing flag. */ bool& pushing() { - return this->getThreadLocal(this->isPushing_, - this->pushingBuffers_); + return this->getThreadLocal(this->isPushing_, this->pushingBuffers_); } private: mutable std::mutex mtx_; - std::unordered_map* > stackBuffers_; - std::unordered_map pushingBuffers_; + std::unordered_map*> stackBuffers_; + std::unordered_map pushingBuffers_; ThreadLocal isPushing_; - ThreadLocal > logStack_; + ThreadLocal> logStack_; }; extern CustomStackTrace gLayerStackTrace; diff --git a/paddle/utils/DisableCopy.h b/paddle/utils/DisableCopy.h index 964daa237beb3085bc78404c6585e6fab16dc27b..e991c07cdf68dac2bdf7fd66de03a292a3bec3c8 100644 --- a/paddle/utils/DisableCopy.h +++ b/paddle/utils/DisableCopy.h @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once /** * Disable copy macro. */ -#define DISABLE_COPY(CLASS_NAME)\ - CLASS_NAME(CLASS_NAME &&) = delete; \ +#define DISABLE_COPY(CLASS_NAME) \ + CLASS_NAME(CLASS_NAME &&) = delete; \ CLASS_NAME(const CLASS_NAME &other) = delete; \ - CLASS_NAME& operator=(const CLASS_NAME &other) = delete + CLASS_NAME &operator=(const CLASS_NAME &other) = delete diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/Excepts.cpp index 9123508fc78d002a9fc5fd0e7e9da8ddec975d6f..b2fad3ac9dd6477e388185d95ebd49c8f0da4c84 100644 --- a/paddle/utils/Excepts.cpp +++ b/paddle/utils/Excepts.cpp @@ -27,28 +27,28 @@ int feenableexcept(unsigned int excepts) { static fenv_t fenv; unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; - if ( fegetenv (&fenv) ) return -1; + if (fegetenv(&fenv)) return -1; old_excepts = fenv.__control & FE_ALL_EXCEPT; // unmask fenv.__control &= ~new_excepts; - fenv.__mxcsr &= ~(new_excepts << 7); + fenv.__mxcsr &= ~(new_excepts << 7); - return ( fesetenv (&fenv) ? -1 : old_excepts ); + return (fesetenv(&fenv) ? -1 : old_excepts); } int fedisableexcept(unsigned int excepts) { static fenv_t fenv; unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; - if ( fegetenv (&fenv) ) return -1; + if (fegetenv(&fenv)) return -1; old_excepts = fenv.__control & FE_ALL_EXCEPT; // mask fenv.__control |= new_excepts; - fenv.__mxcsr |= new_excepts << 7; + fenv.__mxcsr |= new_excepts << 7; - return ( fesetenv (&fenv) ? -1 : old_excepts ); + return (fesetenv(&fenv) ? -1 : old_excepts); } #endif diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index b2b5a5949e59cb7e65eb0db7573adae8e50f80a8..6fae24e1b58c5296019cfaefe97905c3e8632210 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Flags.h" #ifdef PADDLE_ONLY_CPU @@ -22,7 +21,8 @@ P_DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); #endif P_DEFINE_bool( - parallel_nn, false, + parallel_nn, + false, "Whether to use multi-threads to calculate one neural network." 
"If it was set false, use gpu_id specify which gpu core to use" "(the device property in the trainer config file will be ingored)." @@ -32,39 +32,48 @@ P_DEFINE_int32(trainer_count, 1, "Defined how many trainers to train"); P_DEFINE_int32(gpu_id, 0, "Which gpu core to use"); P_DEFINE_int32(port, 20134, "Listening port for pserver"); P_DEFINE_int32(data_server_port, 21134, "Listening port for dserver"); -P_DEFINE_int32(ports_num, 1, +P_DEFINE_int32(ports_num, + 1, "The ports number for parameter send," " increment based on default port number"); -P_DEFINE_int32(ports_num_for_sparse, 0, +P_DEFINE_int32(ports_num_for_sparse, + 0, "The ports number for parameter send," " increment based on default (port + ports_num)"); P_DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers"); P_DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol"); P_DEFINE_int32( - trainer_id, 0, + trainer_id, + 0, "For distributed training, each trainer must be given an unique id" " ranging from 0 to num_trainers-1. Trainer 0 is the master" " trainer"); P_DEFINE_int32(num_gradient_servers, 1, "number of gradient servers"); P_DEFINE_string(comment, "", "A string for commenting this training task"); -P_DEFINE_string(load_missing_parameter_strategy, "fail", +P_DEFINE_string(load_missing_parameter_strategy, + "fail", "which operation to take on load model fails. support " "fail/rand/zero only."); P_DEFINE_int32(log_period, 100, "Log progress every so many batches"); -P_DEFINE_int32(log_period_server, 500, +P_DEFINE_int32(log_period_server, + 500, "Log progress every so many batches at pserver end"); P_DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad"); -P_DEFINE_int32(enable_parallel_vector, 0, +P_DEFINE_int32(enable_parallel_vector, + 0, "threshold for enable parallel vector"); -P_DEFINE_bool(loadsave_parameters_in_pserver, false, +P_DEFINE_bool(loadsave_parameters_in_pserver, + false, "load and save parameters in pserver. " "only work while parameter set sparse_remote_update."); -P_DEFINE_int32(beam_size, 1, +P_DEFINE_int32(beam_size, + 1, "Beam size used in generating most probable output sequences."); P_DEFINE_bool(show_layer_stat, false, "show the statistics of each layer"); P_DEFINE_string(predict_file, "", "File name for saving predict result"); P_DEFINE_bool(prev_batch_state, false, "batch is continue with next batch"); -P_DEFINE_string(init_model_path, "", +P_DEFINE_string(init_model_path, + "", "Path of the initial model parameters." "If it was set, start_pass will be ignored."); diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h index b23a29eff9069117a64bfa46d8930a9a43510949..dda60c3f965abd8575677c785b21b058b3400ee5 100644 --- a/paddle/utils/Flags.h +++ b/paddle/utils/Flags.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "CommandLineParser.h" diff --git a/paddle/utils/GlobalConstants.cpp b/paddle/utils/GlobalConstants.cpp index 8ed6471e4e85de6d1d012660242e2eae05139ec5..d769cd1ee7d4403f9fddbe91d2afec2c986d6b18 100644 --- a/paddle/utils/GlobalConstants.cpp +++ b/paddle/utils/GlobalConstants.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "GlobalConstants.h" namespace paddle { diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h index 8818b014f80be92f1b7b6907739c3d36bcaa7466..4c74c17a50c8cdbc18a075a58f97efc6b3330deb 100644 --- a/paddle/utils/GlobalConstants.h +++ b/paddle/utils/GlobalConstants.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -20,9 +19,9 @@ namespace paddle { namespace enumeration_wrapper { enum PassType { - PASS_TRAIN, // Train pass - PASS_TEST, // Test pass - PASS_GC, // Gradient Check pass + PASS_TRAIN, // Train pass + PASS_TEST, // Test pass + PASS_GC, // Gradient Check pass PASS_METRIC, // pass for generate template output with no drop rate. // pass for metric learning training with metric learning error, only used // when we are doing KNN evaluation. @@ -81,7 +80,7 @@ enum ParameterType { } // namespace enumeration_wrapper //! explicit import enum into paddle namespace. -using namespace enumeration_wrapper; // NOLINT +using namespace enumeration_wrapper; // NOLINT class TrainAlgorithm { public: diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h index 1fc0363d34597c9447996479aaf771e46d0ba600..5990e1657021611437e8fe730147dfaf207c800d 100644 --- a/paddle/utils/Locks.h +++ b/paddle/utils/Locks.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -26,7 +25,7 @@ namespace paddle { /** * A simple read-write lock. - * The RWlock allows a number of readers or at most one writer + * The RWlock allows a number of readers or at most one writer * at any point in time. * The RWlock disable copy. * @@ -37,7 +36,7 @@ namespace paddle { * * Use lock_shared() to lock on read mode, other thread can get * it by using the same method lock_shared(). - * + * * Unlock: * * Use unlock() to unlock the lock. @@ -68,13 +67,13 @@ protected: }; /** - * The ReadLockGuard is a read mode RWLock - * using RAII management mechanism. + * The ReadLockGuard is a read mode RWLock + * using RAII management mechanism. */ class ReadLockGuard { public: /** - * @brief Construct Function. Lock on rwlock in read mode. + * @brief Construct Function. Lock on rwlock in read mode. */ explicit ReadLockGuard(RWLock& rwlock) : rwlock_(&rwlock) { rwlock_->lock_shared(); @@ -82,7 +81,7 @@ public: /** * @brief Destruct Function. - * @note This method just unlock the read mode rwlock, + * @note This method just unlock the read mode rwlock, * won't destroy the lock. */ ~ReadLockGuard() { rwlock_->unlock(); } @@ -120,16 +119,15 @@ class Semaphore { public: //! Disable copy & assign Semaphore(const Semaphore& other) = delete; - Semaphore& operator= (const Semaphore&& other) = delete; + Semaphore& operator=(const Semaphore&& other) = delete; //! Enable move. - Semaphore(Semaphore&& other): m(std::move(other.m)) { - } + Semaphore(Semaphore&& other) : m(std::move(other.m)) {} public: /** - * @brief Construct Function. - * @param[in] initValue the initial value of the + * @brief Construct Function. + * @param[in] initValue the initial value of the * semaphore, default 0. 
*/ explicit Semaphore(int initValue = 0); @@ -137,22 +135,23 @@ public: ~Semaphore(); /** - * @brief The same as wait(), except if the decrement can not + * @brief The same as wait(), except if the decrement can not * be performed until ts return false install of blocking. - * @param[in] ts an absolute timeout in seconds and nanoseconds + * @param[in] ts an absolute timeout in seconds and nanoseconds * since the Epoch 1970-01-01 00:00:00 +0000(UTC). - * @return ture if the decrement proceeds before ts, + * @return true if the decrement proceeds before ts, * else return false. */ bool timeWait(struct timespec* ts); /** - * @brief decrement the semaphore. If the semaphore's value is 0, then call blocks. + * @brief decrement the semaphore. If the semaphore's value is 0, then call + * blocks. */ void wait(); /** - * @brief increment the semaphore. If the semaphore's value + * @brief increment the semaphore. If the semaphore's value * greater than 0, wake up a thread blocked in wait(). */ void post(); @@ -178,9 +177,9 @@ public: ~ThreadBarrier(); /** - * @brief . - * If there were count - 1 threads waiting before, - * then wake up all the count - 1 threads and continue run together. + * @brief . + * If there were count - 1 threads waiting before, + * then wake up all the count - 1 threads and continue running together. * Else block the thread until waked by other thread . */ void wait(); @@ -218,12 +217,12 @@ public: /** * @brief wait until pred return ture. - * @tparam Predicate c++ concepts, describes a function object - * that takes a single iterator argument - * that is dereferenced and used to + * @tparam Predicate c++ concepts, describes a function object + * that takes a single iterator argument + * that is dereferenced and used to * return a value testable as a bool. - * @note pred shall not apply any non-constant function - * through the dereferenced iterator. + * @note pred shall not apply any non-constant function + * through the dereferenced iterator.
*/ template void wait(Predicate pred) { diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp index a0644940b5402af295f8abcd0e9cd8badcac7616..14303bd4c747db2c10ee24b1601f709a79174850 100644 --- a/paddle/utils/Logging.cpp +++ b/paddle/utils/Logging.cpp @@ -91,8 +91,8 @@ static inline int env2index(const char* envName, } static bool gLogToStderr = env2bool("PLOG_LOGTOSTDERR", true); -static const std::vector gLevelName = {"INFO", "WARNING", "ERROR", - "FATAL"}; +static const std::vector gLevelName = { + "INFO", "WARNING", "ERROR", "FATAL"}; static int gMinLogLevel = env2int("PLOG_MINLOGLEVEL", env2index("PLOG_MINLOGLEVEL", gLevelName, 0)); @@ -134,7 +134,7 @@ static void initializeLogFds(char* argv0) { gLogInited = true; } -static void (*gFailureFunctionPtr)() __attribute__((noreturn)) = abort; +static void (*gFailureFunctionPtr)() ATTR_NORETURN = abort; LogMessage::LogMessage(const char* fname, int line, int severity) : fname_(fname), line_(line), severity_(severity) {} @@ -143,11 +143,19 @@ LogMessage::~LogMessage() { this->generateLogMessage(); } void LogMessage::generateLogMessage() { if (!gLogInited) { - fprintf(stderr, "%c %s:%d] %s\n", "IWEF"[severity_], fname_, line_, + fprintf(stderr, + "%c %s:%d] %s\n", + "IWEF"[severity_], + fname_, + line_, str().c_str()); } else { for (auto& fd : gLogFds[this->severity_]) { - dprintf(fd, "%c %s:%d] %s\n", "IWEF"[severity_], fname_, line_, + dprintf(fd, + "%c %s:%d] %s\n", + "IWEF"[severity_], + fname_, + line_, str().c_str()); } } @@ -167,11 +175,9 @@ void initializeLogging(int argc, char** argv) { } namespace logging { -void setMinLogLevel(int level) { - paddle::internal::gMinLogLevel = level; -} +void setMinLogLevel(int level) { paddle::internal::gMinLogLevel = level; } -void installFailureFunction(void (*callback)()) { +void installFailureFunction(void (*callback)() ATTR_NORETURN) { paddle::internal::gFailureFunctionPtr = callback; } @@ -191,13 +197,11 @@ void initializeLogging(int argc, char** argv) { } namespace logging { -void setMinLogLevel(int level) { - FLAGS_minloglevel = level; -} +void setMinLogLevel(int level) { FLAGS_minloglevel = level; } void installFailureFunction(void (*callback)()) { google::InstallFailureFunction(callback); } -void installFailureWriter(void(*callback)(const char*, int)) { +void installFailureWriter(void (*callback)(const char*, int)) { google::InstallFailureWriter(callback); } } // namespace logging diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h index 7fdfa3240c1de71ca8cd4c4b7e772b6767b43672..e9029b421fa3b68845a54194f4cfa69439a99a0c 100644 --- a/paddle/utils/Logging.h +++ b/paddle/utils/Logging.h @@ -23,6 +23,7 @@ limitations under the License. */ #include #ifndef PADDLE_USE_GLOG +#include "CompilerMacros.h" //! TODO(yuyang18): Move this utility macro into some global header. #define PP_CAT(a, b) PP_CAT_I(a, b) @@ -31,11 +32,11 @@ limitations under the License. */ /** * Generate Unique Variable Name, Usefully in macro. - * @SEE http://stackoverflow.com/questions/1082192/how-to-generate-random-variable-names-in-c-using-macros + * @SEE + * http://stackoverflow.com/questions/1082192/how-to-generate-random-variable-names-in-c-using-macros */ #define UNIQUE_NAME(base) PP_CAT(base, __LINE__) - namespace paddle { //! Log levels. @@ -168,13 +169,13 @@ void setMinLogLevel(int level); * @brief Install Log(Fatal) failure function. Default is abort(); * @param callback: The failure function. 
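[Editor's note] A sketch of a callback satisfying the ATTR_NORETURN contract this hunk introduces; dieQuietly is hypothetical. With GNU attributes the noreturn marker goes on a declaration, not on the definition:

#include <cstdlib>

static void dieQuietly() ATTR_NORETURN;
static void dieQuietly() { std::_Exit(1); }  // must never return
// Then: paddle::logging::installFailureFunction(dieQuietly);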
*/ -void installFailureFunction(void (*callback)()); +void installFailureFunction(void (*callback)() ATTR_NORETURN); /** * @brief installFailureWriter * @note: not implemented currently. */ -inline void installFailureWriter(void(*callback)(const char*, int)) { +inline void installFailureWriter(void (*callback)(const char*, int)) { (void)(callback); // unused callback. } } // namespace logging @@ -186,7 +187,7 @@ void initializeLogging(int argc, char** argv); namespace logging { void setMinLogLevel(int level); void installFailureFunction(void (*callback)()); -void installFailureWriter(void(*callback)(const char*, int)); +void installFailureWriter(void (*callback)(const char*, int)); } // namespace logging } #endif // PADDLE_USE_GLOG diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp index 90e5093f96ea4e892b7f2b1f2baa1bf1d6c85c05..7f17a825228ef56be7b8678bf003e57388d4b0bf 100644 --- a/paddle/utils/PythonUtil.cpp +++ b/paddle/utils/PythonUtil.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PythonUtil.h" #include #include @@ -33,7 +32,8 @@ int executeCMD(const char* cmd, char* result) { strncpy(ps, cmd, kExecuteCMDBufLength); if ((ptr = popen(ps, "r")) != NULL) { size_t count = fread(bufPs, 1, kExecuteCMDBufLength, ptr); - memcpy(result, bufPs, + memcpy(result, + bufPs, count - 1); // why count-1: remove the '\n' at the end result[count] = 0; pclose(ptr); @@ -71,15 +71,14 @@ std::string callPythonFunc(const std::string& moduleName, #else - static std::recursive_mutex g_pyMutex; PyGuard::PyGuard() : guard_(g_pyMutex) {} - -static void printPyErrorStack(std::ostream& os, bool withEndl = false, +static void printPyErrorStack(std::ostream& os, + bool withEndl = false, bool withPyPath = true) { - PyObject * ptype, *pvalue, *ptraceback; + PyObject *ptype, *pvalue, *ptraceback; PyErr_Fetch(&ptype, &pvalue, &ptraceback); PyErr_NormalizeException(&ptype, &pvalue, &ptraceback); PyErr_Clear(); @@ -91,10 +90,8 @@ static void printPyErrorStack(std::ostream& os, bool withEndl = false, } PyTracebackObject* obj = (PyTracebackObject*)ptraceback; - os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) - <<" : " << (pvalue == NULL ? "" - : PyString_AsString( - PyObject_Str(pvalue))); + os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) << " : " + << (pvalue == NULL ? 
"" : PyString_AsString(PyObject_Str(pvalue))); if (withEndl) { os << std::endl; } @@ -104,8 +101,8 @@ static void printPyErrorStack(std::ostream& os, bool withEndl = false, } while (obj != NULL) { int line = obj->tb_lineno; - const char* filename = PyString_AsString( - obj->tb_frame->f_code->co_filename); + const char* filename = + PyString_AsString(obj->tb_frame->f_code->co_filename); os << " " << filename << " : " << line; if (withEndl) { os << std::endl; @@ -143,7 +140,8 @@ std::string callPythonFunc(const std::string& moduleName, } PyObjectPtr createPythonClass( - const std::string& moduleName, const std::string& className, + const std::string& moduleName, + const std::string& className, const std::vector& args, const std::map& kwargs) { PyGuard guard; @@ -164,21 +162,18 @@ PyObjectPtr createPythonClass( PyObjectPtr kwargsObjectList(PyDict_New()); for (auto& x : kwargs) { PyObjectPtr pyArg(Py_BuildValue("s#", x.second.c_str(), x.second.length())); - PyDict_SetItemString(kwargsObjectList.get(), x.first.c_str(), - pyArg.release()); + PyDict_SetItemString( + kwargsObjectList.get(), x.first.c_str(), pyArg.release()); } - PyObjectPtr pyInstance(PyInstance_New(pyClass.get(), argsObjectList.release(), - kwargsObjectList.release())); + PyObjectPtr pyInstance(PyInstance_New( + pyClass.get(), argsObjectList.release(), kwargsObjectList.release())); CHECK_PY(pyInstance) << "Create class " << className << " failed."; return pyInstance; } - namespace py { -char* repr(PyObject* obj) { - return PyString_AsString(PyObject_Repr(obj)); -} +char* repr(PyObject* obj) { return PyString_AsString(PyObject_Repr(obj)); } std::string getPyCallStack() { std::ostringstream os; @@ -186,7 +181,7 @@ std::string getPyCallStack() { return os.str(); } -PyObjectPtr import(const std::string &moduleName) { +PyObjectPtr import(const std::string& moduleName) { auto module = PyImport_ImportModule(moduleName.c_str()); CHECK_PY(module) << "Import " << moduleName << "Error"; return PyObjectPtr(module); diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index 00fc177022ac343a5760e57bcbcabf18f697bd4d..65677d90101a0ee2e62c8ac45c50b88326e169e1 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #ifndef PADDLE_NO_PYTHON @@ -83,8 +82,7 @@ PyObjectPtr createPythonClass(const std::string& moduleName, const std::vector& args, const std::map& kwargs); -#define CHECK_PY(x)\ - CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() +#define CHECK_PY(x) CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() namespace py { PyObjectPtr import(const std::string& moduleName); @@ -101,13 +99,13 @@ template T castInt(PyObject* obj, bool* ok = nullptr) { if (PyLong_Check(obj)) { if (ok) *ok = true; - return (T) PyLong_AsUnsignedLong(obj); + return (T)PyLong_AsUnsignedLong(obj); } else if (PyInt_Check(obj)) { if (ok) *ok = true; - return (T) PyInt_AsLong(obj); + return (T)PyInt_AsLong(obj); } else { if (ok) *ok = false; - return (T) 0; + return (T)0; } } @@ -116,14 +114,12 @@ T castInt(PyObject* obj, bool* ok = nullptr) { * * Just like toString method in java. */ -char *repr(PyObject* obj); +char* repr(PyObject* obj); /** * Invoke repr of python object. 
*/ -inline char *repr(const PyObjectPtr &obj) { - return repr(obj.get()); -} +inline char* repr(const PyObjectPtr& obj) { return repr(obj.get()); } /** * Get Python Error Stack String. @@ -137,8 +133,7 @@ std::string getPyCallStack(); */ class ObjectHelper { public: - explicit ObjectHelper(const PyObjectPtr& obj): obj_(obj) { - } + explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {} /** * get attribute @@ -211,15 +206,13 @@ public: CHECK(PySequence_Check(seq_)); } - explicit SequenceHelper(PyObject* seq): seq_(seq) { + explicit SequenceHelper(PyObject* seq) : seq_(seq) { CHECK(PySequence_Check(seq_)); } - inline size_t size() const { - return (size_t) PySequence_Size(seq_); - } + inline size_t size() const { return (size_t)PySequence_Size(seq_); } - inline PyObject* operator[] (size_t i) const { + inline PyObject* operator[](size_t i) const { return PySequence_Fast_GET_ITEM(seq_, i); } @@ -260,9 +253,9 @@ private: class DictHelper { public: - explicit DictHelper(PyObject* d): dict_(d) {} + explicit DictHelper(PyObject* d) : dict_(d) {} - explicit DictHelper(const PyObjectPtr& d): dict_(d.get()) {} + explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {} void set(const std::string& key, PyObject* item) { PyDict_SetItemString(dict_, key.c_str(), item); @@ -274,17 +267,15 @@ public: void setStringList(const std::string& key, const std::vector& items) { - auto * list = PyList_New(items.size()); - for (size_t i=0; i < items.size(); ++i) { + auto* list = PyList_New(items.size()); + for (size_t i = 0; i < items.size(); ++i) { PyList_SetItem(list, i, PyString_FromString(items[i].c_str())); } this->set(key, list); } private: - inline void checkDict() { - CHECK(PyDict_Check(this->dict_)); - } + inline void checkDict() { CHECK(PyDict_Check(this->dict_)); } PyObject* dict_; }; @@ -298,7 +289,7 @@ inline static bool isCallable(const PyObjectPtr& obj) { */ class CallableHelper { public: - explicit CallableHelper(const PyObjectPtr& obj): obj_(obj) { + explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) { CHECK(py::isCallable(obj_)); } @@ -308,21 +299,17 @@ public: * reset args, and create new tuple. * @param sz args size. */ - void setArgsSize(size_t sz) { - args.reset(PyTuple_New(sz)); - } + void setArgsSize(size_t sz) { args.reset(PyTuple_New(sz)); } /** * Get args sequence. User can set/get by SequenceHelper. */ - SequenceHelper getArgs() { - return SequenceHelper(args); - } + SequenceHelper getArgs() { return SequenceHelper(args); } /** * Call python method, return an object. */ - PyObject* operator() () { + PyObject* operator()() { PyGuard guard; return PyObject_Call(obj_.get(), args.get(), kwargs.get()); } diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h index f952cf58778dee0565a8e88ef0015d51dc295428..58d17e86c432b90a6b3240dd5528146a24b72184 100644 --- a/paddle/utils/Queue.h +++ b/paddle/utils/Queue.h @@ -142,12 +142,9 @@ public: */ bool waitNotEmptyFor(int seconds) { std::unique_lock lock(queueLock_); - return queueCV_.wait_for( - lock, - std::chrono::seconds(seconds), - [this] { - return numElements_ != 0; - }); + return queueCV_.wait_for(lock, + std::chrono::seconds(seconds), + [this] { return numElements_ != 0; }); } private: @@ -190,7 +187,7 @@ template class BlockingQueue { public: /** - * @brief Construct Function. + * @brief Construct Function. * @param[in] capacity the max numer of elements the queue can have. */ explicit BlockingQueue(size_t capacity) : capacity_(capacity) {} @@ -198,9 +195,9 @@ public: /** * @brief enqueue an element into Queue. 
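[Editor's note] The bounded producer/consumer behaviour documented here, as a sketch; the capacity and element type are illustrative, and the dequeue() counterpart is assumed from the rest of this header:

void boundedPingPong() {
  paddle::BlockingQueue<int> q(/* capacity = */ 2);
  q.enqueue(1);              // size() < capacity: returns at once
  q.enqueue(2);              // queue now full; a third enqueue would block
  int front = q.dequeue();   // frees a slot, waking any blocked producer
  (void)front;
}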
* @param[in] x The enqueue element, pass by reference . - * @note This method is thread-safe, and will wake up another thread + * @note This method is thread-safe, and will wake up another thread * who was blocked because of the queue is empty. - * @note If it's size() >= capacity before enqueue, + * @note If it's size() >= capacity before enqueue, * this method will block and wait until size() < capacity. */ void enqueue(const T& x) { @@ -229,7 +226,7 @@ public: /** * Return size of queue. * - * @note This method is thread safe. + * @note This method is thread safe. * The size of the queue won't change until the method return. */ size_t size() { diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h index 00e5aaec2babfde5cc95b6afad8713e685ffa52a..4051145d9246639fce5d041103c1211a939eddca 100644 --- a/paddle/utils/Stat.h +++ b/paddle/utils/Stat.h @@ -93,7 +93,8 @@ public: return ret.first->second; } - BarrierStatPtr getStat(uint16_t numConnThreads, const std::string& name, + BarrierStatPtr getStat(uint16_t numConnThreads, + const std::string& name, BarrierStatType bType); void deleteStat(const std::string& name); @@ -204,8 +205,10 @@ protected: class TimerOnce { public: - TimerOnce(Stat* stat, const char* info = "", - uint64_t threshold = -1, bool autoStart = true, + TimerOnce(Stat* stat, + const char* info = "", + uint64_t threshold = -1, + bool autoStart = true, uint64_t startStamp = 0) : stat_(stat), info_(info), timer_(autoStart), threshold_(threshold) { if (!autoStart) { @@ -261,21 +264,21 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1, #define REGISTER_TIMER_SET(statName, start, ...) \ static StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ - TimerOnce __timerOnce(__stat.get(), "", registerTimerArg1(__VA_ARGS__), \ - false, start); + TimerOnce __timerOnce( \ + __stat.get(), "", registerTimerArg1(__VA_ARGS__), false, start); // dynmaic timer, support to discriminate runtime entity, used in pserver -#define REGISTER_TIMER_DYNAMIC(statName, ...) \ - StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ +#define REGISTER_TIMER_DYNAMIC(statName, ...) \ + StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ TimerOnce __timerOnce(__stat.get(), "", registerTimerArg1(__VA_ARGS__)); -#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...) \ - StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ - TimerOnce __timerOnce(__stat.get(), "", registerTimerArg1(__VA_ARGS__), \ - false, start); +#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...) \ + StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ + TimerOnce __timerOnce( \ + __stat.get(), "", registerTimerArg1(__VA_ARGS__), false, start); -#define REGISTER_TIMER_INFO(statName, info) \ - static StatPtr __stat = globalStat.getStat(statName); \ +#define REGISTER_TIMER_INFO(statName, info) \ + static StatPtr __stat = globalStat.getStat(statName); \ TimerOnce __timerOnce(__stat.get(), info, 10 * 1000000LU /*threshold*/); #endif // DISABLE_TIMER diff --git a/paddle/utils/StringUtil.h b/paddle/utils/StringUtil.h index 50301a19be46bf608cf072d3f47335abbb830bc9..8b44dad19231781623a0a65d02b24ac1cf9e4523 100644 --- a/paddle/utils/StringUtil.h +++ b/paddle/utils/StringUtil.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -68,8 +67,6 @@ inline T to(const std::string& s) { return v; } - - } // namespace str #undef DEFINE_STRING_CONVERSION diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h index f6c826a1eeb656ff852c70f70b85c3b00a6a5e8b..ade0ee496f94f6165f35dd1a0a37618df8fae585 100644 --- a/paddle/utils/Thread.h +++ b/paddle/utils/Thread.h @@ -57,7 +57,8 @@ public: void join() { thread_->join(); } /** - * @brief Define what to be done on this thread through override this function. + * @brief Define what is to be done on this thread by overriding this + * function. */ virtual void run() = 0; @@ -155,10 +156,9 @@ public: /** * @brief Construct Function. No thread will be created. */ - SyncThreadPool() - : jobStartBarrier_(0), - jobFinishBarrier_(0) - { LOG(FATAL) << "Not implemented"; } + SyncThreadPool() : jobStartBarrier_(0), jobFinishBarrier_(0) { + LOG(FATAL) << "Not implemented"; + } /** * @brief Construct Fucntion. Create numWorkers of threads in the pool. @@ -191,7 +191,8 @@ public: /** * @brief Execute a job using all the theads in the pool. * @param[in] jobFunc The function to be executed. - * @param[in] ownerFunc Owner thread can do something in owerFunc when job executing. + * @param[in] ownerFunc Owner thread can do something in ownerFunc while the + * job is executing. * @note For the ownerFunc, tid=getNumThreads(). */ void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) { @@ -316,7 +317,8 @@ protected: * * Force stop: * - * Use forceStop() to exit forcibly even though there are remaining jobs in the + * Use forceStop() to exit forcibly even though there are remaining jobs in + * the * job queue. */ template @@ -426,7 +428,8 @@ protected: /** * @brief Do the jobs in the job queue sequentianlly * and enqueue the result into the result queue. - * @note A nullptr will be enqueued into the resulte queue, when a worker finished. + * @note A nullptr will be enqueued into the result queue when a worker + * finishes. */ virtual void run() { while (true) { @@ -492,7 +495,9 @@ public: } ~AsyncThreadPool() { - if (!stopping_) { stop(); } + if (!stopping_) { + stop(); + } } /** @@ -501,7 +506,7 @@ public: void stop() { stopping_ = true; for (size_t i = 0; i < workers_.size(); i++) { - jobs_.enqueue([]{}); + jobs_.enqueue([] {}); } for (auto& worker : workers_) { worker->join(); @@ -526,7 +531,7 @@ public: * asynchronously. * Call std::future::get() when the execturation result is needed; */ - template + template auto addJob(F&& f, Args&&... args) -> std::future::type> { CHECK(!stopping_) << "AsyncThreadPool is closed"; @@ -535,7 +540,7 @@ public: auto task = std::make_shared>( std::bind(std::forward(f), std::forward(args)...)); auto res = task->get_future(); - jobs_.enqueue([task]{ (*task)(); }); + jobs_.enqueue([task] { (*task)(); }); return res; } @@ -551,15 +556,15 @@ public: * * @note *results* may need to be carefully cleared before *addBatchJobs()*. */ - template - void addBatchJobs(const std::vector &jobs, - std::vector::type> &results) { + template + void addBatchJobs(const std::vector& jobs, + std::vector::type>& results) { typedef typename std::result_of::type T; static_assert(!std::is_same::value, - "should pass a non-void function as job"); + "should pass a non-void function as job"); - std::vector > resFuts; - for (const auto &job : jobs) { + std::vector> resFuts; + for (const auto& job : jobs) { resFuts.emplace_back(addJob(job)); } for (auto& fut : resFuts) { @@ -572,13 +577,16 @@ public: * @tparam F don't need to have a return value.
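[Editor's note] A usage sketch for the batch-submission API above; the jobs and result type are illustrative, not part of this patch:

#include <functional>
#include <vector>

void runBatch(paddle::AsyncThreadPool& pool) {
  std::vector<std::function<int()>> jobs = {[] { return 1; },
                                            [] { return 2; }};
  std::vector<int> results;          // cleared beforehand, as the @note advises
  pool.addBatchJobs(jobs, results);  // blocks until every job's future is ready
}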
* @param[in] jobs a vector of executable objection. */ - template - void addBatchJobs(const std::vector &jobs) { + template + void addBatchJobs(const std::vector& jobs) { CHECK(!stopping_) << "AsyncThreadPool is closed"; - std::vector > tmpRes; + std::vector> tmpRes; for (const auto& job : jobs) { - tmpRes.emplace_back(addJob([&job]{ job(); return true; })); + tmpRes.emplace_back(addJob([&job] { + job(); + return true; + })); } for (auto& res : tmpRes) { @@ -604,4 +612,4 @@ private: bool stopping_; }; // class AsyncThreadPool -} // namespace paddle +} // namespace paddle diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp index 0f948f1029af85c97d2564a089b7bf878244643c..49d4b1526537def9b8183934faa971402f3678aa 100644 --- a/paddle/utils/ThreadLocal.cpp +++ b/paddle/utils/ThreadLocal.cpp @@ -16,7 +16,8 @@ limitations under the License. */ #include "ThreadLocal.h" #include "CommandLineParser.h" -P_DEFINE_bool(thread_local_rand_use_global_seed, false, +P_DEFINE_bool(thread_local_rand_use_global_seed, + false, "Whether to use global seed in thread local rand."); namespace paddle { diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index b91e4ad5472cab4f48f1eb59304aa7c0cf3f621f..06c8b392af23f81ab48042cb4d24a40b1c50275d 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -91,9 +90,7 @@ public: /** * Implicit conversion to T* */ - operator T*() { - return get(); - } + operator T*() { return get(); } private: static void dataDestructor(void* p) { delete (T*)p; } diff --git a/paddle/utils/TypeDefs.h b/paddle/utils/TypeDefs.h index e02fd62b53823f8bc84b957b4fa62aeb62346c0d..e8be779bea255eec71057495d1253ed92c2256c3 100644 --- a/paddle/utils/TypeDefs.h +++ b/paddle/utils/TypeDefs.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once namespace paddle { diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index 45251213d2d7930947f39d4730245ca8f7dfddc8..bc727cfa74cdfb51b36259bd08733804578f6d66 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Util.h" #include @@ -54,7 +53,8 @@ P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)"); #include P_DEFINE_int32(profile_signal, 12, "signal for switch google profiler"); -P_DEFINE_string(profile_data_file, "gperf.prof", +P_DEFINE_string(profile_data_file, + "gperf.prof", "file for storing profile data"); static void profilerSwitch(int signalNumber) { @@ -94,19 +94,19 @@ static void installProfilerSwitch() {} namespace paddle { pid_t getTID() { - #if defined(__APPLE__) || defined(__OSX__) - // syscall is deprecated: first deprecated in macOS 10.12. - // syscall is unsupported; - // syscall pid_t tid = syscall(SYS_thread_selfid); - uint64_t tid; - pthread_threadid_np(NULL, &tid); - #else - #ifndef __NR_gettid - #define __NR_gettid 224 - #endif - pid_t tid = syscall(__NR_gettid); - #endif - CHECK_NE(tid, -1); +#if defined(__APPLE__) || defined(__OSX__) + // syscall is deprecated: first deprecated in macOS 10.12. 
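[Editor's note] runInitFunctions(), later in this hunk, sorts the registry by descending priority, so higher priorities run first; a minimal sketch (the lambdas are illustrative):

void registerExampleInits() {
  paddle::registerInitFunction([] { LOG(INFO) << "runs second"; },
                               /*priority=*/0);
  paddle::registerInitFunction([] { LOG(INFO) << "runs first"; },
                               /*priority=*/10);
  // runInitFunctions() then executes both exactly once, highest priority first.
}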
+ // syscall is unsupported; + // syscall pid_t tid = syscall(SYS_thread_selfid); + uint64_t tid; + pthread_threadid_np(NULL, &tid); +#else +#ifndef __NR_gettid +#define __NR_gettid 224 +#endif + pid_t tid = syscall(__NR_gettid); +#endif + CHECK_NE((int)tid, -1); return tid; } @@ -126,22 +126,25 @@ void registerInitFunction(std::function func, int priority) { } void runInitFunctions() { - std::call_once(g_onceFlag, []() { - LOG(INFO) << "Calling runInitFunctions"; - if (g_initFuncs) { - std::sort(g_initFuncs->begin(), g_initFuncs->end(), - [](const PriorityFuncPair& x, const PriorityFuncPair& y) { - return x.first > y.first; - }); - for (auto& f : *g_initFuncs) { - f.second(); - } - delete g_initFuncs; - g_initFuncs = nullptr; - } - g_initialized = true; - LOG(INFO) << "Call runInitFunctions done."; - }); + std::call_once( + g_onceFlag, + []() { + LOG(INFO) << "Calling runInitFunctions"; + if (g_initFuncs) { + std::sort(g_initFuncs->begin(), + g_initFuncs->end(), + [](const PriorityFuncPair& x, const PriorityFuncPair& y) { + return x.first > y.first; + }); + for (auto& f : *g_initFuncs) { + f.second(); + } + delete g_initFuncs; + g_initFuncs = nullptr; + } + g_initialized = true; + LOG(INFO) << "Call runInitFunctions done."; + }); } void initMain(int argc, char** argv) { @@ -282,7 +285,7 @@ void mkDir(const char* filename) { } } -void mkDirRecursively(const char *dir) { +void mkDirRecursively(const char* dir) { struct stat sb; if (!stat(dir, &sb)) return; @@ -303,7 +306,6 @@ void loadFileList(const std::string& fileListFileName, } } - double getMemoryUsage() { FILE* fp = fopen("/proc/meminfo", "r"); CHECK(fp) << "failed to fopen /proc/meminfo"; @@ -363,7 +365,9 @@ size_t calculateServiceNum(const std::string& pservers, int ports_num) { return hosts.size() * ports_num; } -void memcpyWithCheck(void* dest, const void* src, size_t num, +void memcpyWithCheck(void* dest, + const void* src, + size_t num, const void* srcEnd) { int minus = (char*)srcEnd - (char*)src - num; CHECK_LE(0, minus) << "memcpyWithCheck: copy " << num @@ -378,7 +382,7 @@ hl_activation_mode_t hlActiveType(const std::string& type) { return HL_ACTIVATION_RELU; } else if (type == "tanh") { return HL_ACTIVATION_TANH; - } else if (type == "linear") { + } else if (type == "linear" || type == "") { return HL_ACTIVATION_LINEAR; } else { LOG(FATAL) << "Do not support activation type " << type; diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h index 2adb626c83f94c7c5d7a8d53653a46090e19e7b7..ed38f8fa60b3716c12e755b047557c1409fa767c 100644 --- a/paddle/utils/Util.h +++ b/paddle/utils/Util.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -47,7 +46,8 @@ limitations under the License. */ */ #define FOR_EACH(iterator_name, container) \ for (auto iterator_name = (container).begin(), e = (container).end(); \ - iterator_name != e; ++iterator_name) + iterator_name != e; \ + ++iterator_name) /** * Loop over the elements in a container in reverse order @@ -60,8 +60,8 @@ limitations under the License. */ */ #define FOR_EACH_R(iterator_name, container) \ for (auto iterator_name = (container).rbegin(), e = (container).rend(); \ - iterator_name != e; ++iterator_name) - + iterator_name != e; \ + ++iterator_name) namespace paddle { @@ -77,11 +77,11 @@ pid_t getTID(); * \f] */ inline constexpr size_t findLastSet(size_t x) { - return std::is_same::value ? - (x ? 
8 * sizeof(x) - __builtin_clz(x) : 0) - : (std::is_same::value ? // NOLINT - (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) - : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0)); + return std::is_same::value + ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0) + : (std::is_same::value // NOLINT + ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) + : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0)); } /** @@ -95,7 +95,6 @@ inline int mod(int a, int b) { return r >= 0 ? r : r + b; } - /** * find the value given a key k from container c. * If the key can be found, the value is stored in *value @@ -120,7 +119,7 @@ static bool contains(const Container& container, const T& val) { /** * pop and get the front element of a container */ -template +template typename Container::value_type pop_get_front(Container& c) { typename Container::value_type v; swap(v, c.front()); @@ -207,7 +206,6 @@ protected: int devId_; }; - /** * Enables direct access to memory allocations on a peer device(d2). * input: @@ -250,7 +248,6 @@ private: bool syncFlag_; }; - inline bool useGpu(int deviceId) { return FLAGS_parallel_nn ? (deviceId >= 0 ? true : false) : FLAGS_use_gpu; } @@ -328,7 +325,9 @@ T readT(char*& p, const char* pEnd) { return v; } -void memcpyWithCheck(void* dest, const void* src, size_t num, +void memcpyWithCheck(void* dest, + const void* src, + size_t num, const void* srcEnd); /** @@ -338,7 +337,6 @@ void memcpyWithCheck(void* dest, const void* src, size_t num, class SyncThreadPool; SyncThreadPool* getGlobalSyncThreadPool(); - namespace path { // directory separator @@ -363,7 +361,8 @@ std::string dirname(const std::string& path); std::string join(const std::string& part1, const std::string& part2); template -std::string join(const std::string& part1, const std::string& part2, +std::string join(const std::string& part1, + const std::string& part2, Args... args) { return join(join(part1, part2), args...); } @@ -392,8 +391,8 @@ public: std::call_once(onceFlag_, [&] { invokeThreadId_ = curThreadId; }); CHECK_EQ(invokeThreadId_, curThreadId) << "This method should invoke in " - "same thread, but first invoked in " << invokeThreadId_ - << " current invoked in " << curThreadId; + "same thread, but first invoked in " + << invokeThreadId_ << " current invoked in " << curThreadId; } private: @@ -447,28 +446,23 @@ private: * @brief The ScopedCallbacks class is a callback invoker when object is * created and destroyed. */ -template +template class ScopedCallbacks { public: - ScopedCallbacks(CallbackType enter, - CallbackType exit, - Args& ... args) - : exit_(std::bind(exit, args...)) { + ScopedCallbacks(CallbackType enter, CallbackType exit, Args&... args) + : exit_(std::bind(exit, args...)) { enter(args...); } ScopedCallbacks(const ScopedCallbacks& other) = delete; - ScopedCallbacks& operator = (const ScopedCallbacks& other) = delete; + ScopedCallbacks& operator=(const ScopedCallbacks& other) = delete; - ~ScopedCallbacks() { - exit_(); - } + ~ScopedCallbacks() { exit_(); } private: std::function exit_; }; - /** * std compatible allocator with memory alignment. * @tparam T type of allocator elements. 
@@ -537,8 +531,7 @@ public: return nullptr; } if (n > max_size()) { - throw std::length_error( - "AlignAllocator::allocate() - Int Overflow."); + throw std::length_error("AlignAllocator::allocate() - Int Overflow."); } void* r = nullptr; CHECK_EQ(posix_memalign(&r, Alignment * 8, sizeof(T) * n), 0); @@ -558,7 +551,6 @@ private: AlignedAllocator& operator=(const AlignedAllocator&); // disable }; - class Deprecated { public: explicit Deprecated(const std::string& msg = "") { diff --git a/paddle/utils/Version.cpp b/paddle/utils/Version.cpp index b59b78f5707bd4a7ee9f8073927f55c0c9ef0398..e706983918b4a865f6674a34083ef0143bd6e185 100644 --- a/paddle/utils/Version.cpp +++ b/paddle/utils/Version.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Version.h" #include "Flags.h" @@ -34,18 +33,22 @@ void printVersion(std::ostream& os) { #ifndef PADDLE_VERSION #define PADDLE_VERSION "unknown" #endif - os << "paddle version: " << PADDLE_VERSION << std::endl << std::boolalpha - << "\t" << "withGpu: " << version::isWithGpu() << std::endl - << "\t" << "withAvx: " << version::isWithAvx() << std::endl - << "\t" << "withPyDataProvider: " << version::isWithPyDataProvider() - << std::endl - << "\t" << "withTimer: " << version::isWithTimer() << std::endl - << "\t" << "withFpga: " << version::isWithFpga() << std::endl - << "\t" << "real byte size: "<< version::sizeofReal() << std::endl - << std::endl; + os << "paddle version: " << PADDLE_VERSION << std::endl + << std::boolalpha << "\t" + << "withGpu: " << version::isWithGpu() << std::endl + << "\t" + << "withAvx: " << version::isWithAvx() << std::endl + << "\t" + << "withPyDataProvider: " << version::isWithPyDataProvider() << std::endl + << "\t" + << "withTimer: " << version::isWithTimer() << std::endl + << "\t" + << "withFpga: " << version::isWithFpga() << std::endl + << "\t" + << "real byte size: " << version::sizeofReal() << std::endl + << std::endl; } - void printVersion() { if (FLAGS_version) { printVersion(std::cout); diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h index e6655fa75dabfeec99bc2157b8c9a1e9e4f19263..e6c799644ee7f88e4e90eec565d1bab2bc9faed7 100644 --- a/paddle/utils/Version.h +++ b/paddle/utils/Version.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include #include "TypeDefs.h" @@ -35,7 +34,6 @@ namespace paddle { * real byte size: 4 */ - namespace version { /** @@ -44,7 +42,6 @@ namespace version { */ void printVersion(); - void printVersion(std::ostream& os); /** * @brief isWithGpu @@ -75,7 +72,6 @@ constexpr bool isWithPyDataProvider() { #endif } - /** * @brief isWithTimer * @return true if paddle compiled with timer. @@ -116,25 +112,19 @@ constexpr bool isWithFpga() { * @brief sizeofReal * @return return the byte size of real */ -constexpr size_t sizeofReal() { - return sizeof(real); -} +constexpr size_t sizeofReal() { return sizeof(real); } /** * @brief isPaddleUseDouble * @return true if paddle compiled with double precision. 
*/ -constexpr bool isPaddleUseDouble() { - return sizeofReal() == sizeof(double); -} +constexpr bool isPaddleUseDouble() { return sizeofReal() == sizeof(double); } /** * @brief isPaddleUseFloat * @return true if paddle compiled with float precision */ -constexpr bool isPaddleUseFloat() { - return sizeofReal() == sizeof(float); -} +constexpr bool isPaddleUseFloat() { return sizeofReal() == sizeof(float); } } // namespace version diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp index 347ae64c26dfdfcdaff62886481c20e9c4c7cfec..93016daeaea644ca44499fdc6024ec8deac57ca8 100644 --- a/paddle/utils/arch/linux/Locks.cpp +++ b/paddle/utils/arch/linux/Locks.cpp @@ -22,26 +22,19 @@ public: sem_t sem; }; -Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) { +Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { sem_init(&m->sem, 0, initValue); } -Semaphore::~Semaphore() { - sem_destroy(&m->sem); -} +Semaphore::~Semaphore() { sem_destroy(&m->sem); } bool Semaphore::timeWait(struct timespec* ts) { return (0 == sem_timedwait(&m->sem, ts)); } -void Semaphore::wait() { - sem_wait(&m->sem); -} - -void Semaphore::post() { - sem_post(&m->sem); -} +void Semaphore::wait() { sem_wait(&m->sem); } +void Semaphore::post() { sem_post(&m->sem); } class SpinLockPrivate { public: @@ -51,25 +44,20 @@ public: char padding_[64 - sizeof(pthread_spinlock_t)]; }; -SpinLock::SpinLock():m(new SpinLockPrivate()) {} - +SpinLock::SpinLock() : m(new SpinLockPrivate()) {} SpinLock::~SpinLock() { delete m; } -void SpinLock::lock() { - pthread_spin_lock(&m->lock_); -} +void SpinLock::lock() { pthread_spin_lock(&m->lock_); } -void SpinLock::unlock() { - pthread_spin_unlock(&m->lock_); -} +void SpinLock::unlock() { pthread_spin_unlock(&m->lock_); } class ThreadBarrierPrivate { public: pthread_barrier_t barrier_; }; -ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate()) { +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate()) { pthread_barrier_init(&m->barrier_, nullptr, count); } @@ -78,8 +66,6 @@ ThreadBarrier::~ThreadBarrier() { delete m; } -void ThreadBarrier::wait() { - pthread_barrier_wait(&m->barrier_); -} +void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); } } // namespace paddle diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp index b3ec454976520be10995bd8399b7ce838e3fa824..ae563a6afd29b6315d9c6609474faddbfaaded14 100644 --- a/paddle/utils/arch/osx/Locks.cpp +++ b/paddle/utils/arch/osx/Locks.cpp @@ -22,20 +22,16 @@ namespace paddle { class SemaphorePrivate { public: - ~SemaphorePrivate() { - dispatch_release(sem); - } + ~SemaphorePrivate() { dispatch_release(sem); } dispatch_semaphore_t sem; }; -Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) { +Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { m->sem = dispatch_semaphore_create(initValue); } -Semaphore::~Semaphore() { - delete m; -} +Semaphore::~Semaphore() { delete m; } bool Semaphore::timeWait(timespec *ts) { dispatch_time_t tm = dispatch_walltime(ts, 0); @@ -46,9 +42,7 @@ void Semaphore::wait() { dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER); } -void Semaphore::post() { - dispatch_semaphore_signal(m->sem); -} +void Semaphore::post() { dispatch_semaphore_signal(m->sem); } class SpinLockPrivate { public: @@ -56,17 +50,15 @@ public: char padding_[64 - sizeof(lock_)]; // Padding to cache line size }; -SpinLock::SpinLock(): m(new SpinLockPrivate()) {} +SpinLock::SpinLock() : m(new SpinLockPrivate()) {} 
SpinLock::~SpinLock() { delete m; } void SpinLock::lock() { - while (m->lock_.test_and_set(std::memory_order_acquire)) {} -} - -void SpinLock::unlock() { - m->lock_.clear(std::memory_order_release); + while (m->lock_.test_and_set(std::memory_order_acquire)) { + } } +void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); } class ThreadBarrierPrivate { public: @@ -75,7 +67,7 @@ public: int count_; int tripCount_; - inline explicit ThreadBarrierPrivate(int cnt):count_(0), tripCount_(cnt) { + inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) { CHECK_NE(cnt, 0); CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); CHECK_GE(pthread_cond_init(&cond_, 0), 0); @@ -106,7 +98,7 @@ public: } }; -ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} ThreadBarrier::~ThreadBarrier() { delete m; } void ThreadBarrier::wait() { m->wait(); } diff --git a/paddle/utils/enable_virtualenv.py b/paddle/utils/enable_virtualenv.py index 99d822a4145cca3f5ae35c4cf144210f35460827..ccfaa7c147b2ce25cb6007aa04cfc33961b7e10b 100644 --- a/paddle/utils/enable_virtualenv.py +++ b/paddle/utils/enable_virtualenv.py @@ -1,10 +1,12 @@ import os + def __activate_virtual_env__(): - __path__ = os.getenv('VIRTUAL_ENV') - if __path__ is None: - return - __script__ = os.path.join(__path__, 'bin', 'activate_this.py') - execfile(__script__, {'__file__': __script__}) + __path__ = os.getenv('VIRTUAL_ENV') + if __path__ is None: + return + __script__ = os.path.join(__path__, 'bin', 'activate_this.py') + execfile(__script__, {'__file__': __script__}) + __activate_virtual_env__() diff --git a/paddle/utils/tests/test_CommandLineParser.cpp b/paddle/utils/tests/test_CommandLineParser.cpp index d5f6018864cb9f14dd3006facebf82c66e909736..5ecfb2b4f511e63eac21a5eae3829532f6860d66 100644 --- a/paddle/utils/tests/test_CommandLineParser.cpp +++ b/paddle/utils/tests/test_CommandLineParser.cpp @@ -63,10 +63,15 @@ TEST(CommandLineParser, defaultValue) { } TEST(CommandLineParser, normal) { - char* argv[] = { - cc("test_program"), cc("--i2=32"), cc("--str1=abc"), - cc("--b2=1"), cc("-b1=False"), cc("--d2=.34"), - cc("--d1=0"), cc("--l1=-12345678901234"), cc("-ul2=3212")}; + char* argv[] = {cc("test_program"), + cc("--i2=32"), + cc("--str1=abc"), + cc("--b2=1"), + cc("-b1=False"), + cc("--d2=.34"), + cc("--d1=0"), + cc("--l1=-12345678901234"), + cc("-ul2=3212")}; int argc = sizeof(argv) / sizeof(char*); paddle::ParseCommandLineFlags(&argc, argv); ASSERT_EQ(argc, 1); @@ -104,9 +109,6 @@ int main(int argc, char** argv) { #else -int main(int argc, char** argv) { - return 0; -} +int main(int argc, char** argv) { return 0; } #endif - diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp index 3e665021471cb3c179b13960dcc9f2284a0d664c..3bfb381ed93659feebcc567a04b2a095dc94dfa8 100644 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -22,11 +22,12 @@ limitations under the License. 
*/ P_DEFINE_int32(test_thread_num, 10, "testing thread number"); -void testNormalImpl(const std::function&, - size_t, size_t, - paddle::ThreadBarrier&, - paddle::ThreadBarrier&)>& callback) { +void testNormalImpl( + const std::function&, + size_t, + size_t, + paddle::ThreadBarrier&, + paddle::ThreadBarrier&)>& callback) { paddle::CustomStackTrace tracer; paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1); paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1); @@ -35,10 +36,13 @@ void testNormalImpl(const std::function> threads; threads.reserve(FLAGS_test_thread_num); - for (int32_t i=0; i < FLAGS_test_thread_num; ++i) { - threads.emplace_back(new std::thread([&tracer, &countDown, &layerSize, - &startBarrier, &doneBarrier, - &callback]{ + for (int32_t i = 0; i < FLAGS_test_thread_num; ++i) { + threads.emplace_back(new std::thread([&tracer, + &countDown, + &layerSize, + &startBarrier, + &doneBarrier, + &callback] { callback(tracer, countDown, layerSize, startBarrier, doneBarrier); })); } @@ -55,18 +59,19 @@ void testNormalImpl(const std::function& tracer, - size_t countDown, size_t layerSize, - paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){ + size_t countDown, + size_t layerSize, + paddle::ThreadBarrier& start, + paddle::ThreadBarrier& finish) { while (countDown-- > 0) { start.wait(); - for (size_t i=0; i < layerSize; ++i) { + for (size_t i = 0; i < layerSize; ++i) { tracer.push("layer_" + std::to_string(i)); } tracer.pop(""); - for (size_t i=0; i < layerSize; ++i) { + for (size_t i = 0; i < layerSize; ++i) { tracer.pop("layer_" + std::to_string(layerSize - 1 - i)); } finish.wait(); @@ -75,12 +80,14 @@ TEST(CustomStackTrace, normalTrain) { } TEST(CustomStackTrace, normalTest) { - testNormalImpl([] (paddle::CustomStackTrace& tracer, - size_t countDown, size_t layerSize, - paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){ + testNormalImpl([](paddle::CustomStackTrace& tracer, + size_t countDown, + size_t layerSize, + paddle::ThreadBarrier& start, + paddle::ThreadBarrier& finish) { while (countDown-- > 0) { start.wait(); - for (size_t i=0; i < layerSize; ++i) { + for (size_t i = 0; i < layerSize; ++i) { tracer.push("layer_" + std::to_string(i)); } tracer.clear(); // in forward test, tracer will clear after forward. diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp index c19c98614e6a7d6285990aa19849131579f7307b..d39a190961a96906eef2b510cb3538c639d5df5c 100644 --- a/paddle/utils/tests/test_CustomStackTracePrint.cpp +++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp @@ -18,7 +18,7 @@ limitations under the License. 
*/ int main(int argc, char** argv) { paddle::initMain(argc, argv); - for (size_t i=0; i < 1000; ++i) { + for (size_t i = 0; i < 1000; ++i) { paddle::gLayerStackTrace.push("layer_" + std::to_string(i)); if (i == 998) { throw "Unhandle exception"; diff --git a/paddle/utils/tests/test_Logging.cpp b/paddle/utils/tests/test_Logging.cpp index a9382de6da4ef5b425afa4a8d76652d7506d8e72..9f477fab14a2abde93505a05fc4c9ccd3d6426b6 100644 --- a/paddle/utils/tests/test_Logging.cpp +++ b/paddle/utils/tests/test_Logging.cpp @@ -54,7 +54,7 @@ TEST(Logging, Check) { auto pcheckDown = [&] { P_CHECK(a == b); }; ASSERT_DEATH(pcheckDown(), - "F .*test_Logging.cpp:[0-9]+] Check failed: a == b "); + "F .*test_Logging.cpp:[0-9]+] Check failed: a == b "); P_CHECK_LE(a, b); P_CHECK_LT(a, b); @@ -157,8 +157,6 @@ int main(int argc, char** argv) { #else -int main(int, char**) { - return 0; -} +int main(int, char**) { return 0; } #endif diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp index ebc84e0f52d823bf4799d08ff8ea6a036e131f66..77d281962cfeaa3cc951a72eddf4f37b619c5691 100644 --- a/paddle/utils/tests/test_SpinLock.cpp +++ b/paddle/utils/tests/test_SpinLock.cpp @@ -21,17 +21,18 @@ limitations under the License. */ P_DEFINE_int32(test_thread_num, 100, "testing thread number"); -void testNormalImpl(size_t thread_num, const std::function - & callback) { +void testNormalImpl( + size_t thread_num, + const std::function& callback) { paddle::SpinLock mutex; std::vector threads; threads.reserve(thread_num); size_t count = 0; for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back([&thread_num, &count, &mutex, &callback]{ - callback(thread_num, count, mutex); - }); + threads.emplace_back([&thread_num, &count, &mutex, &callback] { + callback(thread_num, count, mutex); + }); } for (auto& thread : threads) { thread.join(); @@ -41,12 +42,13 @@ void testNormalImpl(size_t thread_num, const std::function } TEST(ThreadSpinLock, normalTest) { - for (auto &thread_num : {10, 30, 50 , 100 , 300, 1000}) { - testNormalImpl(thread_num, [](size_t thread_num, - size_t& count, paddle::SpinLock& mutex){ - std::lock_guard lock(mutex); - ++count; - }); + for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) { + testNormalImpl( + thread_num, + [](size_t thread_num, size_t& count, paddle::SpinLock& mutex) { + std::lock_guard lock(mutex); + ++count; + }); } } diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp index b8636709e9b42c7baa5d0106492ab6c0782ed6d4..2c699b791ffad8ed680c5537005aac7dad832f41 100644 --- a/paddle/utils/tests/test_StringUtils.cpp +++ b/paddle/utils/tests/test_StringUtils.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/StringUtil.h" #include diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp index bf4e2753458e08a0b20a33663d8b8175919852b0..154db5d9c616d4817b933c82587834f5ce2d0f8e 100644 --- a/paddle/utils/tests/test_Thread.cpp +++ b/paddle/utils/tests/test_Thread.cpp @@ -20,7 +20,7 @@ using paddle::AsyncThreadPool; // NOLINT TEST(AsyncThreadPool, addJob) { AsyncThreadPool pool(8); - auto a = pool.addJob([]{ return 1; }); + auto a = pool.addJob([] { return 1; }); auto b = pool.addJob([] { return true; }); auto c = pool.addJob([] { return false; }); @@ -36,10 +36,7 @@ TEST(AsyncThreadPool, addBatchJob) { std::vector jobs; for (int i = 0; i < 10000; i++) { - jobs.emplace_back( - [&] { - counter++; - }); + jobs.emplace_back([&] { counter++; }); } pool.addBatchJobs(jobs); @@ -55,13 +52,16 @@ TEST(AsyncThreadPool, multiThreadAddBatchJob) { int counter = 0; const int numMonitors = 300; const int numSlaves = 300; - std::vector moniterJobs(numMonitors, [&] { - std::vector slaveJobs(numSlaves, - [mut, &counter] { - std::lock_guard lk(*mut); - counter++; - }); - levelTwoPool.addBatchJobs(slaveJobs); + std::vector moniterJobs( + numMonitors, + [&] { + std::vector slaveJobs( + numSlaves, + [mut, &counter] { + std::lock_guard lk(*mut); + counter++; + }); + levelTwoPool.addBatchJobs(slaveJobs); }); levelOnePool.addBatchJobs(moniterJobs); ASSERT_EQ(counter, numMonitors * numSlaves); @@ -70,13 +70,10 @@ TEST(AsyncThreadPool, multiThreadAddBatchJob) { TEST(AsyncThreadPool, addBatchJobWithResults) { AsyncThreadPool pool(100); - std::vector > jobs; + std::vector> jobs; const int numJobs = 100; for (int i = 0; i < numJobs; i++) { - jobs.emplace_back( - [i]{ - return i; - }); + jobs.emplace_back([i] { return i; }); } std::vector res; diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp index 90bd6c21bc8e5ac05b248a0517f9e4fb43d04054..20b9babd94cf4e6a475daece349c871bd606d83d 100644 --- a/paddle/utils/tests/test_ThreadBarrier.cpp +++ b/paddle/utils/tests/test_ThreadBarrier.cpp @@ -22,42 +22,44 @@ limitations under the License. 
*/ P_DEFINE_int32(test_thread_num, 100, "testing thread number"); -void testNormalImpl(size_t thread_num, - const std::function&, - paddle::ThreadBarrier&)>& callback) { - std::mutex mutex; - std::set tids; - paddle::ThreadBarrier barrier(thread_num); +void testNormalImpl( + size_t thread_num, + const std::function&, + paddle::ThreadBarrier&)>& callback) { + std::mutex mutex; + std::set tids; + paddle::ThreadBarrier barrier(thread_num); - std::vector threads; - threads.reserve(thread_num); - for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back([&thread_num, &mutex, - &tids, &barrier, &callback]{ - callback(thread_num, mutex, tids, barrier); + std::vector threads; + threads.reserve(thread_num); + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&thread_num, &mutex, &tids, &barrier, &callback] { + callback(thread_num, mutex, tids, barrier); }); - } + } - for (auto& thread : threads) { - thread.join(); - } + for (auto& thread : threads) { + thread.join(); + } } TEST(ThreadBarrier, normalTest) { - for (auto &thread_num : {10, 30, 50 , 100 , 300, 1000}) { + for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) { testNormalImpl(thread_num, - [](size_t thread_num, std::mutex& mutex, - std::set& tids, - paddle::ThreadBarrier& barrier){ - { - std::lock_guard guard(mutex); - tids.insert(std::this_thread::get_id()); - } - barrier.wait(); - // Check whether all threads reach this point or not - CHECK_EQ(tids.size(), thread_num); - }); + [](size_t thread_num, + std::mutex& mutex, + std::set& tids, + paddle::ThreadBarrier& barrier) { + { + std::lock_guard guard(mutex); + tids.insert(std::this_thread::get_id()); + } + barrier.wait(); + // Check whether all threads reach this point or not + CHECK_EQ(tids.size(), thread_num); + }); } } diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 25e36f9c4c1687aec46ca7202d1ba8a6e0088fec..c835cfd5221c8579b383c0a6f0b2f0f554eac6d2 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -92,7 +92,7 @@ message PoolConfig { optional uint32 start = 4; // Defines the stride size between successive pooling squares. - required uint32 stride = 5; + required uint32 stride = 5 [default = 1]; // The size of output feature map. required uint32 output_x = 6; @@ -105,19 +105,27 @@ message PoolConfig { optional uint32 padding = 8 [default = 0]; // if not set, use size_x - optional uint32 size_y = 9 [default = 0]; + optional uint32 size_y = 9; // if not set, use stride - optional uint32 stride_y = 10 [default = 0]; + optional uint32 stride_y = 10; // if not set, use output_x - optional uint32 output_y = 11 [default = 0]; + optional uint32 output_y = 11; // if not set, use img_size - optional uint32 img_size_y = 12 [default = 0]; + optional uint32 img_size_y = 12; // if not set, use padding - optional uint32 padding_y = 13 [default = 0]; + optional uint32 padding_y = 13; +} + +message SppConfig { + required string pool_type = 1; + required uint32 pyramid_height = 2; + required uint32 channels = 3; + required uint32 img_size = 4; + optional uint32 img_size_y = 5; } message NormConfig { @@ -170,6 +178,15 @@ message BlockExpandConfig { required uint32 img_size_y = 11; } +message MaxOutConfig { + required uint32 channels = 1; + required uint32 groups = 2; + + // The size of input feature map. 
+ required uint32 img_size_x = 3; + required uint32 img_size_y = 4; +} + message ProjectionConfig { required string type = 1; required string name = 2; @@ -187,6 +204,9 @@ message ProjectionConfig { // For IdentityOffsetProjection optional uint64 offset = 11 [default = 0]; + + // For pool + optional PoolConfig pool_conf = 12; } message OperatorConfig { @@ -203,6 +223,15 @@ message OperatorConfig { optional int32 num_filters = 7; } +message BilinearInterpConfig { + // The size of input feature map. + optional uint32 img_size_x = 1; + optional uint32 img_size_y = 2; + // The size of output feature map. + required uint32 out_size_x = 3; + required uint32 out_size_y = 4; + required uint32 num_channels = 5; +} message ImageConfig { // The image data dimensionality. @@ -225,6 +254,9 @@ message LayerInputConfig { // If the input layer has multi-output. // Set the argument name. optional string input_layer_argument = 9; + optional BilinearInterpConfig bilinear_interp_conf = 10; + optional MaxOutConfig maxout_conf = 11; + optional SppConfig spp_conf = 12; } message LayerConfig { @@ -245,7 +277,7 @@ sinclude(`ModelConfigLayer.proto.m4') // (which is how convnets are usually trained). Setting this to // false will untie the biases, yielding a separate bias for // every location at which the filter is applied. - optional bool shared_biases = 8; + optional bool shared_biases = 8 [default = false]; // Valid values are ones that divide the area of the output // grid in this convolutional layer. For example if this layer @@ -369,6 +401,18 @@ sinclude(`ModelConfigLayer.proto.m4') // use to compute moving mean and variance. optional real moving_average_fraction = 47 [default = 0.9]; + + // bias size + optional uint32 bias_size = 48 [default = 0]; + + // this parameter can be used as a user-defined parameter when necessary, + // without changing the proto file. + // e.g., when a new layer with a user-defined parameter is implemented, + // it can be used to pass that parameter, without modifying the proto file. + // string type is used for flexibility: different types can be converted + // to string and reinterpreted in the user's own layer implementation. + optional string user_arg = 49; + } message EvaluatorConfig { diff --git a/proto/TrainerConfig.proto.m4 b/proto/TrainerConfig.proto.m4 index a42ff88d54b5e445e7cfadc7467c1bc7d8c7ef26..3b0e24f90bed8cdf0e102c12d2a4a041c17a8447 100644 --- a/proto/TrainerConfig.proto.m4 +++ b/proto/TrainerConfig.proto.m4 @@ -130,7 +130,7 @@ message OptimizationConfig { }; message TrainerConfig { - required ModelConfig model_config = 1; + optional ModelConfig model_config = 1; optional DataConfig data_config = 2; required OptimizationConfig opt_config = 3; optional DataConfig test_data_config = 4; diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 7f9e87eee6037666b86420fba194624859d356b3..c90af2ee000d46a032984ee23559e7e99b49ddad 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - diff --git a/python/paddle/proto/__init__.py b/python/paddle/proto/__init__.py index 7f9e87eee6037666b86420fba194624859d356b3..cd6a59ecbb0952e89f34b11678a60ad300585979 100644 --- a/python/paddle/proto/__init__.py +++ b/python/paddle/proto/__init__.py @@ -12,3 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from paddle.proto.TrainerConfig_pb2 import OptimizationConfig, TrainerConfig +from paddle.proto.ModelConfig_pb2 import ModelConfig diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index 53409b746d811a3d73188a613c6b121e71955552..0c577ec657bc6d35c41e55ed5ab6adb80ab2c37c 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -18,9 +18,8 @@ import collections import functools import itertools -logging.basicConfig( - format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]" - " %(message)s") +logging.basicConfig(format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]" + " %(message)s") class SequenceType(object): @@ -132,8 +131,10 @@ class InputOrderWrapper(object): def __call__(self, obj, filename): for item in self.generator(obj, filename): if isinstance(item, dict): - yield [item.get(input_name, None) for input_name in - self.input_order] + yield [ + item.get(input_name, None) + for input_name in self.input_order + ] else: yield item @@ -162,8 +163,8 @@ class CheckWrapper(object): yield items except AssertionError as e: self.logger.warning( - "Item (%s) is not fit the input type with error %s" - % (repr(item), repr(e))) + "Item (%s) is not fit the input type with error %s" % + (repr(item), repr(e))) if self.check_fail_continue: continue @@ -202,13 +203,17 @@ class CheckWrapper(object): callback(each) -def provider(input_types=None, should_shuffle=None, pool_size=-1, +def provider(input_types=None, + should_shuffle=None, + pool_size=-1, min_pool_size=-1, can_over_batch_size=True, calc_batch_size=None, cache=CacheType.NO_CACHE, - check=False, check_fail_continue=False, - init_hook=None, **kwargs): + check=False, + check_fail_continue=False, + init_hook=None, + **kwargs): """ Provider decorator. Use it to make a function into PyDataProvider2 object. In this function, user only need to get each sample for some train/test @@ -318,9 +323,9 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, "Could not recognize should_shuffle (%s), " "just use default value of should_shuffle." " Please set should_shuffle to bool value or " - "something in %s" % ( - repr(self.should_shuffle), - repr(true_table + false_table))) + "something in %s" % + (repr(self.should_shuffle), + repr(true_table + false_table))) self.should_shuffle = None self.pool_size = pool_size @@ -351,8 +356,7 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, self.generator = InputOrderWrapper(self.generator, self.input_order) if self.check: - self.generator = CheckWrapper(self.generator, - self.slots, + self.generator = CheckWrapper(self.generator, self.slots, check_fail_continue, self.logger) @@ -368,4 +372,3 @@ def deserialize_args(args): :return: """ return cPickle.loads(args) - diff --git a/python/paddle/trainer/PyDataProviderWrapper.py b/python/paddle/trainer/PyDataProviderWrapper.py index c4b907af54699f31b3792fce423bc7251634e0da..90b684a000017fc03c8c33f829aaa64a5f769e45 100644 --- a/python/paddle/trainer/PyDataProviderWrapper.py +++ b/python/paddle/trainer/PyDataProviderWrapper.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """ This module provide a wrapper(decorator) to wrap a data process method into a PyDataProvider. Some examples are shown `here `_. 
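The module docstring above points at external examples; as a self-contained illustration, a minimal provider for this wrapper might look like the sketch below. The slot classes, dimensions, and file layout are assumptions for illustration (only SlotType and SparseNonValueSlot are visible in this patch), not part of the change itself:

    from paddle.trainer.PyDataProviderWrapper import provider, DenseSlot, IndexSlot

    # Assumed layout: one 784-d dense feature slot plus a 10-way integer label.
    @provider(slots=[DenseSlot(784), IndexSlot(10)], use_seq=False)
    def process(obj, file_name):
        # Yield one sample per line: feature values followed by the label.
        for line in open(file_name):
            fields = line.split()
            yield [float(x) for x in fields[:-1]], int(fields[-1])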
@@ -47,6 +46,7 @@ except ImportError: import io + class SlotType(object): # Just a hint for user. pass @@ -83,6 +83,7 @@ class SparseNonValueSlot(SlotType): - **SubSeq**: [[[int, int, ...], [int, ....], ...] , \ [[int, int, ...], [int, ....], ...] , ...] """ + def __init__(self, dim): """ :param dim: slot dimension @@ -294,8 +295,9 @@ class GeneralPyDataProvider: fn = "%s_%d" % (self.profile_filename, self.profile_count) sortby = "cumulative" with open(fn, "w") as f: - pstats.Stats(self.profiler, stream=f).sort_stats( - sortby).print_stats() + pstats.Stats( + self.profiler, + stream=f).sort_stats(sortby).print_stats() self.logger.info("saving profile to file %s" % fn) self.profile_count += 1 self.logger.info("resetting profile") @@ -453,9 +455,10 @@ class GeneralPyDataProvider: seq_stream.flush() subseq_stream.flush() - return "".join([self.int_packer.pack(current_batch_size), - data_bytes.getvalue(), - seq_bytes.getvalue(), subseq_bytes.getvalue()]) + return "".join([ + self.int_packer.pack(current_batch_size), data_bytes.getvalue(), + seq_bytes.getvalue(), subseq_bytes.getvalue() + ]) finally: data_stream.close() @@ -516,7 +519,7 @@ class GeneralPyDataProvider: self.data_pool[idx]) idx -= 1 - ret_list += self.data_pool[self.data_pool_idx: idx + 1] + ret_list += self.data_pool[self.data_pool_idx:idx + 1] # for speed reason, just shift left index, not delete data actually. self.data_pool_idx = idx + 1 @@ -537,8 +540,8 @@ class GeneralPyDataProvider: if self.max_pool_size == 0: for i in xrange(min(self.file_count, len(self.generators))): self.data_pool += list(self.generators[i]) - self.generators = self.generators[ - min(self.file_count, len(self.generators)):] + self.generators = self.generators[min(self.file_count, + len(self.generators)):] self.max_pool_size = len(self.data_pool) else: while len(self.data_pool) < self.max_pool_size and len( @@ -562,9 +565,15 @@ def default_init_hook(cls, *args, **kwargs): del cls, args, kwargs -def provider(slots=None, use_seq=False, should_shuffle=True, pool_size=1, - can_over_batch_size=True, calc_batch_size=lambda data: 1, - debug=False, init_hook=default_init_hook, profile_filename=None): +def provider(slots=None, + use_seq=False, + should_shuffle=True, + pool_size=1, + can_over_batch_size=True, + calc_batch_size=lambda data: 1, + debug=False, + init_hook=default_init_hook, + profile_filename=None): """ The decorator for PyDataProvider. User should use this to create Provider class. User should only concern how to read sample from file. @@ -663,7 +672,7 @@ def provider(slots=None, use_seq=False, should_shuffle=True, pool_size=1, def __init__(self, *file_list, **kwargs): logging.basicConfig( format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]" - " %(message)s") + " %(message)s") self.logger = logging.getLogger("") if debug: diff --git a/python/paddle/trainer/__init__.py b/python/paddle/trainer/__init__.py index 7f9e87eee6037666b86420fba194624859d356b3..c90af2ee000d46a032984ee23559e7e99b49ddad 100644 --- a/python/paddle/trainer/__init__.py +++ b/python/paddle/trainer/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
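For comparison, the newer PyDataProvider2 interface reformatted earlier in this patch drops the slot classes in favor of input_types. A comparable sketch under the same assumed file layout (dense_vector and integer_value are assumed to be the module's input type helpers; the sizes remain illustrative):

    from paddle.trainer.PyDataProvider2 import provider, dense_vector, integer_value

    @provider(input_types=[dense_vector(784), integer_value(10)],
              should_shuffle=True,
              pool_size=1024)
    def process(settings, filename):
        # Same assumed layout: dense features, then an integer label.
        for line in open(filename):
            fields = line.split()
            yield [float(x) for x in fields[:-1]], int(fields[-1])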
- diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index c1e74c7a2d8f7448429edcdbc2ec7c32f6cedd57..9db42bf172a77ff0972107dd26eed3882bf5906e 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -13,7 +13,6 @@ # limitations under the License. from __future__ import print_function - ''' The following functions are available in the config file: @@ -101,50 +100,45 @@ except Exception as e: raise logging.basicConfig( - format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', -) + format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', ) logger = logging.getLogger('paddle') logger.setLevel(logging.INFO) __real_print__ = print -print=logger.info +print = logger.info # from layer type name to layer class g_layer_type_map = {} + # Initialize global variables. We use this function so that we can # call parse_config() multiple times def init_config_environment( - g_default_momentum = None, - g_default_decay_rate = None, - g_default_initial_mean = 0., - g_default_initial_std = 0.01, - g_default_num_batches_regularization = None, - g_default_initial_strategy = 0, - g_default_initial_smart = False, - g_default_gradient_clipping_threshold = None, - g_default_device = None, - g_default_update_hooks = None, - g_default_compact_func = None, - - g_config = TrainerConfig(), - g_layer_map = {}, - g_parameter_map = {}, - - g_extended_config_funcs = {}, + g_default_momentum=None, + g_default_decay_rate=None, + g_default_initial_mean=0., + g_default_initial_std=0.01, + g_default_num_batches_regularization=None, + g_default_initial_strategy=0, + g_default_initial_smart=False, + g_default_gradient_clipping_threshold=None, + g_default_device=None, + g_default_update_hooks=None, + g_default_compact_func=None, + g_config=TrainerConfig(), + g_layer_map={}, + g_parameter_map={}, + g_extended_config_funcs={}, # store command args of paddle_trainer - g_command_config_args = {}, + g_command_config_args={}, # Used for PyDataProvider to avoid duplicate module name - g_py_module_name_list = [], - - g_current_submodel = None, - g_root_submodel = None, - g_submodel_map = {}, - g_submodel_stack = [], - - g_add_submodel_suffix = False, - ): + g_py_module_name_list=[], + g_current_submodel=None, + g_root_submodel=None, + g_submodel_map={}, + g_submodel_stack=[], + g_add_submodel_suffix=False, ): for k, v in locals().iteritems(): globals()[k] = copy.deepcopy(v) @@ -161,43 +155,54 @@ def config_assert(b, msg): if not b: logger.fatal(msg) + g_config_funcs = {} + # decorator for indicating a function which can be used in config file def config_func(func): g_config_funcs[func.func_name] = func return func + # decorator for indicating a class which can be used in config file def config_class(cls): g_config_funcs[cls.__name__] = cls return cls + # decorator for indicating a class for a layer type def config_layer(layer_type): def wrap(cls): g_config_funcs[cls.__name__] = cls g_layer_type_map[layer_type] = cls return cls + return wrap + def gen_parameter_name(layer_name, input_index): return '_%s.w%d' % (layer_name, input_index) + def gen_bias_parameter_name(layer_name): return '_%s.wbias' % layer_name + def default(x, default_value): return default_value if x is None else x + class Cfg(object): def add_keys(self, locals): for k, v in locals.iteritems(): if not k.startswith('_'): self.__setattr__(k, v) + # functions available in config file + # Define the name of the input layers of the NeuralNetwork. 
# The type of these layers must be "data". # These layers will be provided with the DataBatch obtained @@ -216,9 +221,10 @@ def Inputs(*args): if g_current_submodel is g_root_submodel: g_config.model_config.input_layer_names.append(name) + @config_func def HasInputsSet(): - return len(g_config.model_config.input_layer_names) != 0 + return len(g_current_submodel.input_layer_names) != 0 # Define the name of the output layers of the NeuralNetwork. @@ -244,7 +250,7 @@ def SubModelBegin(name): global g_current_submodel, g_root_submodel, g_submodel_stack g_submodel_stack.append(g_current_submodel) - name = MakeLayerNameInParentSubmodel(name) #rename in nested submodel + name = MakeLayerNameInParentSubmodel(name) #rename in nested submodel config_assert(name not in g_submodel_map, 'Duplicated submodel name: %s' % name) @@ -254,36 +260,42 @@ def SubModelBegin(name): g_submodel_map[name] = sub_model g_current_submodel = sub_model + @config_func -def SubModelEnd(name = None): +def SubModelEnd(name=None): global g_current_submodel, g_root_submodel, g_submodel_stack - config_assert(g_current_submodel is not g_root_submodel, "submodel not begin") + config_assert(g_current_submodel is not g_root_submodel, + "submodel not begin") if name is not None: - config_assert(g_current_submodel.name == MakeLayerNameInParentSubmodel(name), - "submodel name error") + config_assert( + g_current_submodel.name == MakeLayerNameInParentSubmodel(name), + "submodel name error") g_current_submodel = g_submodel_stack.pop() + def MakeLayerNameInParentSubmodel(name): suffix = "" if len(g_submodel_stack) > 1: suffix = "@" + g_submodel_stack[-1].name return name + suffix + def GetLayerBaseName(name): return name.split('@')[0] -def MakeLayerNameInSubmodel(name, submodel_name = None): + +def MakeLayerNameInSubmodel(name, submodel_name=None): global g_current_submodel global g_add_submodel_suffix - if (submodel_name is None - and not g_add_submodel_suffix - and not g_current_submodel.is_recurrent_layer_group): + if (submodel_name is None and not g_add_submodel_suffix and + not g_current_submodel.is_recurrent_layer_group): return name if submodel_name is None: submodel_name = g_current_submodel.name return name + "@" + submodel_name + # Define a recurrent layer group begin with RecurrentLayerGroupBegin # and end with RecurrentLayerGroupEnd. 
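The comment above sketches the recurrent-group API whose definitions are reformatted in the hunks that follow. As a rough usage sketch (layer names are hypothetical and the per-frame layers are elided; see the signatures below for the full argument list):

    RecurrentLayerGroupBegin("rnn_group",
                             in_links=["input_seq"],
                             out_links=["rnn_out"],
                             seq_reversed=False)
    # ... per-frame layers are defined here; memories declared inside the
    # group carry state from one frame to the next ...
    RecurrentLayerGroupEnd("rnn_group")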
# A recurrent layer group forward/backward one frame after previous frame @@ -332,8 +344,10 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, if in_links_count == 0: in_links_has_subseq = has_subseq else: - config_assert(in_links_has_subseq == has_subseq, - "The sequence type of in_links should be the same in RecurrentLayerGroup") + config_assert( + in_links_has_subseq == has_subseq, + "The sequence type of in_links should be the same in RecurrentLayerGroup" + ) in_links_count += 1 layer_name = MakeLayerNameInParentSubmodel(name) layer = g_layer_map[layer_name] @@ -347,6 +361,7 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, pair.link_name = MakeLayerNameInSubmodel(name) pair.has_subseq = has_subseq + @config_func def RecurrentLayerGroupSetOutLink(link): if isinstance(link, basestring): @@ -363,8 +378,7 @@ def RecurrentLayerGroupSetOutLink(link): def RecurrentLayerGroupSetGenerator(generator=None): - generator.eos_layer_name = MakeLayerNameInSubmodel( - generator.eos_layer_name) + generator.eos_layer_name = MakeLayerNameInSubmodel(generator.eos_layer_name) g_current_submodel.generator.CopyFrom(generator) @@ -375,21 +389,18 @@ def RecurrentLayerGroupBegin(name, generator=None, target_inlinkname="", seq_reversed=False): - RecurrentLayerGroupWithoutOutLinksBegin(name, - in_links, - seq_reversed, + RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, seq_reversed, target_inlinkname) for link in out_links: RecurrentLayerGroupSetOutLink(link) - if generator is not None: RecurrentLayerGroupSetGenerator(generator) - config_assert(len(in_links) == 0, - "no in_links should be passed to generator") - config_assert(len(out_links) >= 1, - "one or more than one out_links should be passed to generator") - + config_assert( + len(in_links) == 0, "no in_links should be passed to generator") + config_assert( + len(out_links) >= 1, + "one or more than one out_links should be passed to generator") @config_func @@ -397,9 +408,10 @@ def RecurrentLayerGroupEnd(name): global g_current_submodel config_assert(g_current_submodel.is_recurrent_layer_group, "RecurrentLayerGroup not begin") - for pair in g_current_submodel.memories: #check exist + for pair in g_current_submodel.memories: #check exist layer = g_layer_map[pair.layer_name] - config_assert(layer is not None, "memory declare wrong name:%s" % pair.layer_name) + config_assert(layer is not None, + "memory declare wrong name:%s" % pair.layer_name) memory_link = g_layer_map[pair.link_name] config_assert(layer.size == memory_link.size, "memory declare wrong size:%d" % memory_link.size) @@ -418,12 +430,14 @@ def RecurrentLayerGroupEnd(name): else: GatherAgentLayer(name=agent_name, size=layer.size) + # Define the model type # currently, the paddle supports "nn", "recurrent_nn", "recursive_nn" and "multi_nn" @config_func def model_type(name): g_config.model_config.type = name + @config_class class Bias(Cfg): def __init__( @@ -441,10 +455,10 @@ class Bias(Cfg): sparse_remote_update=None, gradient_clipping_threshold=None, is_static=None, - is_shared=None, - ): + is_shared=None, ): self.add_keys(locals()) + # Define one input for a layer @config_class class Input(Cfg): @@ -465,28 +479,32 @@ class Input(Cfg): sparse_update=None, gradient_clipping_threshold=None, conv=None, + bilinear_interp=None, norm=None, pool=None, image=None, block_expand=None, + maxout=None, + spp=None, format=None, nnz=None, is_static=None, is_shared=None, update_hooks=None, - input_layer_argument=None, - ): + input_layer_argument=None, ): self.add_keys(locals()) self.input_layer_name = 
MakeLayerNameInSubmodel(input_layer_name) + # Define a projection for mixed layer @config_class class Projection(Input): - type = None # subclass should set it correctly + type = None # subclass should set it correctly + def __init__( self, input_layer_name, - size = 0, # projection output size + size=0, # projection output size parameter_name=None, learning_rate=None, momentum=None, @@ -506,8 +524,7 @@ class Projection(Input): is_static=None, is_shared=None, update_hooks=None, - input_layer_argument=None, - ): + input_layer_argument=None, ): self.add_keys(locals()) self.input_layer_name = MakeLayerNameInSubmodel(input_layer_name) @@ -521,8 +538,10 @@ class Projection(Input): # to indicate using the size from Layer config def calc_output_size(self, input_layer_config): return self.size + def calc_parameter_size(self, input_size, output_size): raise NotimplementedError + def calc_parameter_dims(self, input_size, output_size): raise NotimplementedError @@ -533,31 +552,32 @@ class IdentityProjection(Projection): def calc_output_size(self, input_layer_config): return input_layer_config.size + def calc_parameter_size(self, input_size, output_size): return 0 + def calc_parameter_dims(self, input_size, output_size): return [] + # Like IdentityProjection, but layer size may be smaller than input size, # the projection selects dimensions [offset, offset+layer_size) from input @config_class class IdentityOffsetProjection(Projection): type = 'identity_offset' - def __init__( - self, - input_layer_name, - offset, - **xargs): - super(IdentityOffsetProjection, self).__init__( - input_layer_name, **xargs) + def __init__(self, input_layer_name, offset, **xargs): + super(IdentityOffsetProjection, self).__init__(input_layer_name, + **xargs) self.proj_conf.offset = offset def calc_parameter_size(self, input_size, output_size): return 0 + def calc_parameter_dims(self, input_size, output_size): return [] + # DotMulProjection performs element-wise multiplication with weight @config_class class DotMulProjection(Projection): @@ -565,49 +585,68 @@ class DotMulProjection(Projection): def calc_output_size(self, input_layer_config): return input_layer_config.size + def calc_parameter_size(self, input_size, output_size): return output_size + def calc_parameter_dims(self, input_size, output_size): return [1, output_size] + +# ScalingProjection +@config_class +class ScalingProjection(Projection): + type = 'scaling' + + def calc_output_size(self, input_layer_config): + return input_layer_config.size + + def calc_parameter_size(self, input_size, output_size): + return 1 + + def calc_parameter_dims(self, input_size, output_size): + return [1, 1] + + @config_class class TableProjection(Projection): type = 'table' def calc_parameter_size(self, input_size, output_size): return input_size * output_size + def calc_parameter_dims(self, input_size, output_size): return [input_size, output_size] + @config_class class FullMatrixProjection(Projection): type = 'fc' def calc_parameter_size(self, input_size, output_size): return input_size * output_size + def calc_parameter_dims(self, input_size, output_size): return [input_size, output_size] + @config_class class TransposedFullMatrixProjection(Projection): type = 'trans_fc' def calc_parameter_size(self, input_size, output_size): return input_size * output_size + def calc_parameter_dims(self, input_size, output_size): return [output_size, input_size] + @config_class class ContextProjection(Projection): type = 'context' - def __init__( - self, - input_layer_name, - context_start, -
context_length, - trainable_padding, - **xargs): + def __init__(self, input_layer_name, context_start, context_length, + trainable_padding, **xargs): super(ContextProjection, self).__init__(input_layer_name, **xargs) self.proj_conf.context_start = context_start self.proj_conf.context_length = context_length @@ -631,14 +670,51 @@ class ContextProjection(Projection): _total_pad = 0 +@config_class +class ConvProjection(Projection): + type = 'conv' + + def __init__(self, + input_layer_name, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvProjection, self).__init__(input_layer_name, **xargs) + + if num_filters is not None: + self.proj_conf.num_filters = num_filters + + parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf, + num_filters) + # TODO: support rectangle input + self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x + **2) * num_filters + + def calc_output_size(self, input_layer_config): + return self.proj_conf.output_size + + def calc_parameter_size(self, input_size, output_size): + co = self.proj_conf.num_filters + ci = self.proj_conf.conv_conf.channels + fh = self.proj_conf.conv_conf.filter_size + fw = self.proj_conf.conv_conf.filter_size_y + return co * ci * fh * fw + + def calc_bias_size(self): + return self.proj_conf.num_filters + + def calc_parameter_dims(self, input_size, output_size): + return None + + # Define a operator for mixed layer @config_class class Operator(Cfg): - type = None # subclass should set it correctly + type = None # subclass should set it correctly + def __init__( self, - input_layer_names, - ): + input_layer_names, ): self.add_keys(locals()) self.operator_conf = OperatorConfig() self.operator_conf.type = self.type @@ -649,16 +725,13 @@ class Operator(Cfg): def calc_output_size(self, input_sizes): return 0 + @config_class class DotMulOperator(Operator): type = 'dot_mul' - def __init__( - self, - input_layer_names, - scale=None, - **xargs): - super(DotMulOperator, self).__init__( - input_layer_names, **xargs) + + def __init__(self, input_layer_names, scale=None, **xargs): + super(DotMulOperator, self).__init__(input_layer_names, **xargs) if scale is not None: self.operator_conf.dotmul_scale = scale @@ -674,25 +747,24 @@ class DotMulOperator(Operator): return input_sizes[0] - @config_class class ConvOperator(Operator): type = 'conv' - def __init__( - self, - input_layer_names, - num_filters=None, - conv_conf=None, - **xargs): - super(ConvOperator, self).__init__( - input_layer_names, **xargs) + + def __init__(self, + input_layer_names, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvOperator, self).__init__(input_layer_names, **xargs) if num_filters is not None: self.operator_conf.num_filters = num_filters parse_conv(conv_conf, MakeLayerNameInSubmodel(input_layer_names[0]), - self.operator_conf.conv_conf) - self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x ** 2) * num_filters + self.operator_conf.conv_conf, num_filters) + self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x + **2) * num_filters config_assert(len(input_layer_names) == 2, "Conv is binary operator") @@ -703,29 +775,36 @@ class ConvOperator(Operator): # please refer to the comments in proto/ModelConfig.proto @config_class class Conv(Cfg): - def __init__( - self, - filter_size, - channels, - padding = None, - stride = None, - groups = None, - filter_channels = None, - output_x = None, - img_size = None, - caffe_mode = True, - filter_size_y = None, - padding_y = None, - stride_y = None): + def __init__(self, 
+ filter_size, + channels, + padding=None, + stride=None, + groups=None, + filter_channels=None, + output_x=None, + img_size=None, + caffe_mode=True, + filter_size_y=None, + padding_y=None, + stride_y=None): self.add_keys(locals()) if filter_size_y is None: - self.filter_size_y = filter_size + self.filter_size_y = filter_size if padding_y is None: - self.padding_y = padding + self.padding_y = padding if stride_y is None: - self.stride_y = stride + self.stride_y = stride if output_x is not None: - config_assert(output_x <= 0) + config_assert(output_x <= 0) + + +# please refer to the comments in proto/ModelConfig.proto +@config_class +class BilinearInterp(Cfg): + def __init__(self, out_size_x=None, out_size_y=None, num_channels=None): + self.add_keys(locals()) + # please refer to the comments in proto/ModelConfig.proto @config_class @@ -735,56 +814,68 @@ class Pool(Cfg): pool_type, channels, size_x, - size_y = None, - img_width = None, - start = None, - stride = None, - stride_y = None, - padding = None, - padding_y = None): + size_y=None, + img_width=None, + start=None, + stride=None, # 1 by default in protobuf + stride_y=None, + padding=None, # 0 by default in protobuf + padding_y=None): + self.add_keys(locals()) + + +# please refer to the comments in proto/ModelConfig.proto +@config_class +class SpatialPyramidPool(Cfg): + def __init__(self, pool_type, pyramid_height, channels, img_width=None): + self.add_keys(locals()) + # please refer to the comments in proto/ModelConfig.proto @config_class class Norm(Cfg): - def __init__( - self, - norm_type, - channels, - size, - scale, - pow, - output_x = None, - img_size = None, - blocked = None): + def __init__(self, + norm_type, + channels, + size, + scale, + pow, + output_x=None, + img_size=None, + blocked=None): self.add_keys(locals()) + # please refer to the comments in proto/ModelConfig.proto @config_class class Image(Cfg): - def __init__( - self, - channels, - img_size = None): + def __init__(self, channels, img_size=None): self.add_keys(locals()) + @config_class class BlockExpand(Cfg): - def __init__( - self, - channels, - padding_x = 0, - padding_y = 0, - stride_x = 0, - stride_y = 0, - block_x = 0, - block_y = 0, - img_size_x = 0, - img_size_y = 0, - output_x = 0, - output_y = 0): + def __init__(self, + channels, + padding_x=0, + padding_y=0, + stride_x=0, + stride_y=0, + block_x=0, + block_y=0, + img_size_x=0, + img_size_y=0, + output_x=0, + output_y=0): self.add_keys(locals()) + +@config_class +class MaxOut(Cfg): + def __init__(self, channels, groups, img_size_x=0, img_size_y=0): + self.add_keys(locals()) + + def DataBase(async_load_data=False, constant_slots=None, data_ratio=1, @@ -798,23 +889,23 @@ def DataBase(async_load_data=False, if constant_slots: data_config.constant_slots.extend(constant_slots) - data_config.data_ratio=data_ratio - data_config.is_main_data=is_main_data + data_config.data_ratio = data_ratio + data_config.is_main_data = is_main_data - usage_ratio=default(usage_ratio, settings_deprecated["usage_ratio"]) + usage_ratio = default(usage_ratio, settings_deprecated["usage_ratio"]) config_assert(usage_ratio >= 0 and usage_ratio <= 1, "The range of usage_ratio is [0, 1]") data_config.usage_ratio = usage_ratio return data_config + @config_func -def SimpleData( - files=None, - feat_dim=None, - context_len=None, - buffer_capacity=None, - **xargs): +def SimpleData(files=None, + feat_dim=None, + context_len=None, + buffer_capacity=None, + **xargs): data_config = DataBase(**xargs) data_config.type = 'simple' data_config.files =
files @@ -825,31 +916,36 @@ def SimpleData( data_config.buffer_capacity = buffer_capacity return data_config + @config_func -def PyData( - files=None, - type=None, - file_group_queue_capacity=None, - load_data_module=None, - load_data_object=None, - load_data_args="", - load_file_count=None, - constant_slots=None, - load_thread_num=None, - **xargs): +def PyData(files=None, + type=None, + file_group_queue_capacity=None, + load_data_module=None, + load_data_object=None, + load_data_args="", + load_file_count=None, + constant_slots=None, + load_thread_num=None, + **xargs): data_config = DataBase(**xargs) data_config.type = 'py' if load_data_module in g_py_module_name_list: + def get_path(module): m = __import__(load_data_module) return os.path.split(os.path.realpath(m.__file__))[0] + # python C-api is not thread safe, one module can only be imported once, # so here we need to copy the module with different names if it has to be # imported several times. - module_new_name = "%s_copy_%d" % (load_data_module, len(g_py_module_name_list)) + module_new_name = "%s_copy_%d" % (load_data_module, + len(g_py_module_name_list)) g_py_module_name_list.append(module_new_name) - module_path = "%s/%s.py" % (get_path(load_data_module), load_data_module) - new_module_path = "%s/%s.py" % (get_path(load_data_module), module_new_name) + module_path = "%s/%s.py" % (get_path(load_data_module), + load_data_module) + new_module_path = "%s/%s.py" % (get_path(load_data_module), + module_new_name) if os.path.isfile(module_path) == False: raise Exception("File %s does not exist." % module_path) shutil.copy2(module_path, new_module_path) @@ -874,15 +970,15 @@ def PyData( data_config.constant_slots.extend(constant_slots) return data_config + @config_func -def ProtoData( - files=None, - type=None, - file_group_queue_capacity=None, - load_file_count=None, - constant_slots=None, - load_thread_num=None, - **xargs): +def ProtoData(files=None, + type=None, + file_group_queue_capacity=None, + load_file_count=None, + constant_slots=None, + load_thread_num=None, + **xargs): data_config = DataBase(**xargs) if type is None: data_config.type = 'proto' @@ -903,25 +999,24 @@ def ProtoData( data_config.constant_slots.extend(constant_slots) return data_config + # real data for training is actually provided by "sub_data" data providers. @config_func -def MultiData( - sub_data=[] - ): +def MultiData(sub_data=[]): data_config = DataConfig() data_config.type = 'multi' data_config.sub_data_configs.extend(sub_data) return data_config + @config_func -def Data( - type, - files=None, - feat_dim=None, - slot_dims=None, - context_len=None, - buffer_capacity=None, - **xargs): +def Data(type, + files=None, + feat_dim=None, + slot_dims=None, + context_len=None, + buffer_capacity=None, + **xargs): data_config = DataBase(**xargs) data_config.type = type @@ -957,56 +1052,109 @@ def TestData(data_config, async_load_data=None): " Data definition") g_config.test_data_config.async_load_data = async_load_data + +def parse_bilinear(bilinear, input_layer_name, bilinear_conf): + bilinear_conf.out_size_x = bilinear.out_size_x + bilinear_conf.out_size_y = bilinear.out_size_y + bilinear_conf.num_channels = bilinear.num_channels + + +''' +caffe_mode: compute the output size using floor instead of ceil, + which is consistent with Caffe's and cuDNN's convention.
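+ For example, img_size=10, filter_size=3, padding=1, stride=2 gives + (2*1 + 10 - 3)/2 = 4.5, so caffe_mode returns 1 + floor(4.5) = 5 while the + default returns 1 + ceil(4.5) = 6.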
+''' + + +def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode): + output = (2 * padding + img_size - filter_size) / float(stride) + if caffe_mode: + return 1 + int(math.floor(output)) + else: + return 1 + int(math.ceil(output)) + + +''' +Calculate image_size based on output_size for convolution. +It is the reverse function of cnn_output_size. +''' + + +def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode): + if caffe_mode: + img_size = (output_size - 1) * stride + filter_size - 2 * padding + else: + img_size = (output_size - 2) * stride + filter_size - 2 * padding + 1 + return img_size + + def parse_pool(pool, input_layer_name, pool_conf): pool_conf.pool_type = pool.pool_type - config_assert(pool.pool_type in ['max-projection', 'avg-projection', - 'cudnn-max-pool', 'cudnn-avg-pool'], - "pool-type %s is not in " + config_assert(pool.pool_type in [ + 'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool' + ], "pool-type %s is not in " "['max-projection', 'avg-projection', " - "'cudnn-max-pool', 'cudnn-avg-pool']" - % pool.pool_type) + "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type) pool_conf.channels = pool.channels pool_conf.size_x = pool.size_x pool_conf.stride = pool.stride pool_conf.size_y = default(pool.size_y, pool_conf.size_x) - pool_conf.stride_y = default(pool.stride_y, pool_conf.stride); + pool_conf.stride_y = default(pool.stride_y, pool_conf.stride) img_pixels = g_layer_map[input_layer_name].size / pool.channels # the img_width may be removed, # and it can be calculated automatically later. - pool_conf.img_size = default(pool.img_width, int(img_pixels ** 0.5)) + pool_conf.img_size = default(pool.img_width, int(img_pixels**0.5)) pool_conf.img_size_y = img_pixels / pool_conf.img_size config_assert(pool_conf.img_size * pool_conf.img_size_y == img_pixels, - "Incorrect input image size %d for input image pixels %d" - % (pool_conf.img_size, img_pixels)) + "Incorrect input image size %d for input image pixels %d" % + (pool_conf.img_size, img_pixels)) config_assert(not pool.start, "start is deprecated in pooling.") if pool.padding is not None: pool_conf.padding = pool.padding - pool_conf.padding_y = default(pool.padding_y, pool_conf.padding) - pool_conf.output_x = int(math.ceil((pool_conf.img_size + \ - 2*pool_conf.padding - pool_conf.size_x) / \ - float(pool_conf.stride))) + 1 - pool_conf.output_y = int(math.ceil((pool_conf.img_size_y + \ - 2*pool_conf.padding_y - pool_conf.size_y) / \ - float(pool_conf.stride_y))) + 1 + pool_conf.padding_y = default(pool.padding_y, pool_conf.padding) + pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x, + pool_conf.padding, pool_conf.stride, + False) + pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y, + pool_conf.padding_y, + pool_conf.stride_y, False) + + +def parse_spp(spp, input_layer_name, spp_conf): + spp_conf.pool_type = spp.pool_type + config_assert(spp.pool_type in ['max-projection', 'avg-projection'], + "pool-type %s is not in " + "['max-projection', 'avg-projection']" % spp.pool_type) + spp_conf.pyramid_height = spp.pyramid_height + spp_conf.channels = spp.channels + + img_pixels = g_layer_map[input_layer_name].size / spp_conf.channels + + spp_conf.img_size = default(spp.img_width, int(img_pixels**0.5)) + spp_conf.img_size_y = img_pixels / spp_conf.img_size + config_assert(spp_conf.img_size * spp_conf.img_size_y == img_pixels, + "Incorrect input image size %d for input image pixels %d" % + (spp_conf.img_size, img_pixels)) + def
parse_image(image, input_layer_name, image_conf): image_conf.channels = image.channels image_pixels = g_layer_map[input_layer_name].size / image_conf.channels - image_conf.img_size = int(image_pixels ** 0.5) - config_assert((image_conf.img_size ** 2) == image_pixels, - "Incorrect input image size %d for input image pixels %d" - % (image_conf.img_size, image_pixels)) + image_conf.img_size = int(image_pixels**0.5) + config_assert((image_conf.img_size**2) == image_pixels, + "Incorrect input image size %d for input image pixels %d" % + (image_conf.img_size, image_pixels)) + def parse_norm(norm, input_layer_name, norm_conf): norm_conf.norm_type = norm.norm_type config_assert(norm.norm_type in ['rnorm', 'cmrnorm-projection'], - "norm-type %s is not in [rnorm, 'cmrnorm-projection']" - % norm.norm_type) + "norm-type %s is not in ['rnorm', 'cmrnorm-projection']" % + norm.norm_type) norm_conf.channels = norm.channels norm_conf.size = norm.size norm_conf.scale = norm.scale @@ -1014,20 +1162,24 @@ def parse_norm(norm, input_layer_name, norm_conf): norm_conf.blocked = norm.blocked img_pixels = g_layer_map[input_layer_name].size / norm.channels - norm_conf.img_size = int(img_pixels ** 0.5) - config_assert((norm_conf.img_size ** 2) == img_pixels, - "Incorrect input image size %d for input image pixels %d" - % (norm_conf.img_size, img_pixels)) + norm_conf.img_size = int(img_pixels**0.5) + config_assert((norm_conf.img_size**2) == img_pixels, + "Incorrect input image size %d for input image pixels %d" % + (norm_conf.img_size, img_pixels)) norm_conf.output_x = norm_conf.img_size if norm.norm_type in ['cmrnorm-projection']: norm_conf.scale /= norm.size else: - norm_conf.scale /= norm.size ** 2 + norm_conf.scale /= norm.size**2 + + ''' caffe_mode: compute the output size using floor instead of ceil, which is consistent with Caffe's and cuDNN's convention.
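+trans: when True, parse the conv as a transposed (backward) convolution: + filter_channels is num_filters / groups, and img_size is recovered from + the input's output_x via cnn_image_size.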
''' -def parse_conv(conv, input_layer_name, conv_conf): + + +def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False): conv_conf.filter_size = conv.filter_size conv_conf.filter_size_y = conv.filter_size_y conv_conf.channels = conv.channels @@ -1036,25 +1188,38 @@ def parse_conv(conv, input_layer_name, conv_conf): conv_conf.stride = conv.stride conv_conf.stride_y = conv.stride_y conv_conf.groups = conv.groups - conv_conf.filter_channels = conv.channels / conv.groups conv_conf.caffe_mode = conv.caffe_mode - img_pixels = g_layer_map[input_layer_name].size / conv.channels - print('channels=%d size=%d'%(conv.channels, - g_layer_map[input_layer_name].size)) - conv_conf.img_size = int(img_pixels ** 0.5) - config_assert((conv_conf.img_size ** 2) == img_pixels, - ("Input layer %s: Incorrect input image size %d for input " - + "image pixels %d") - % (input_layer_name, conv_conf.img_size, img_pixels)) - if conv.caffe_mode: - conv_conf.output_x = \ - 1 + int(math.floor((2 * conv.padding + conv_conf.img_size \ - - conv.filter_size) / float(conv.stride))) + if not trans: + conv_conf.filter_channels = conv.channels / conv.groups + + img_pixels = g_layer_map[input_layer_name].size / conv.channels + print('channels=%d size=%d' % (conv.channels, + g_layer_map[input_layer_name].size)) + conv_conf.img_size = int(img_pixels**0.5) + config_assert((conv_conf.img_size**2) == img_pixels, ( + "Input layer %s: Incorrect input image size %d for input " + + "image pixels %d") % + (input_layer_name, conv_conf.img_size, img_pixels)) + + conv_conf.output_x = cnn_output_size( + conv_conf.img_size, conv_conf.filter_size, conv_conf.padding, + conv_conf.stride, conv_conf.caffe_mode) else: - conv_conf.output_x = \ - 1 + int(math.ceil((2 * conv.padding + conv_conf.img_size \ - - conv.filter_size) / float(conv.stride))) + conv_conf.filter_channels = num_filters / conv.groups + + outputSize = g_layer_map[input_layer_name].size / conv.channels + print('channels=%d size=%d' % (conv.channels, + g_layer_map[input_layer_name].size)) + conv_conf.output_x = int(outputSize**0.5) + config_assert((conv_conf.output_x**2) == outputSize, ( + "Input layer %s: Incorrect input image size %d for input " + + "image pixels %d") % + (input_layer_name, conv_conf.output_x, outputSize)) + conv_conf.img_size = cnn_image_size( + conv_conf.output_x, conv_conf.filter_size, conv_conf.padding, + conv_conf.stride, conv_conf.caffe_mode) + def parse_block_expand(block_expand, input_layer_name, block_expand_conf): block_expand_conf.channels = block_expand.channels @@ -1069,18 +1234,24 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf): if block_expand_conf.img_size_x == 0: block_expand_conf.output_x = 0 else: - block_expand_conf.output_x = \ - 1 + \ - int(math.ceil((2 * block_expand.padding_x + block_expand.img_size_x \ - - block_expand.block_x) / float(block_expand.stride_x))) + block_expand_conf.output_x = cnn_output_size( + block_expand.img_size_x, block_expand.block_x, + block_expand.padding_x, block_expand.stride_x, False) if block_expand_conf.img_size_y == 0: - block_expand_conf.output_y = 0 + block_expand_conf.output_y = 0 else: - block_expand_conf.output_y = \ - 1 + \ - int(math.ceil((2 * block_expand.padding_y + block_expand.img_size_y \ - - block_expand.block_y) / float(block_expand.stride_y))) + block_expand_conf.output_y = cnn_output_size( + block_expand.img_size_y, block_expand.block_y, + block_expand.padding_y, block_expand.stride_y, False) + + +def parse_maxout(maxout, input_layer_name, maxout_conf): + 
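# copy the maxout settings into the proto conf; maxout takes the element-wise + # max over each group of channels, so the resulting layer size is + # input size / groups (see MaxOutLayer below). +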
maxout_conf.channels = maxout.channels + maxout_conf.groups = maxout.groups + maxout_conf.img_size_x = maxout.img_size_x + maxout_conf.img_size_y = maxout.img_size_y + # Define an evaluator @config_func @@ -1088,15 +1259,14 @@ def Evaluator( name, type, inputs, - chunk_scheme = None, - num_chunk_types = None, - classification_threshold = None, - positive_label = None, - dict_file = None, - result_file = None, - num_results = None, - delimited = None, - ): + chunk_scheme=None, + num_chunk_types=None, + classification_threshold=None, + positive_label=None, + dict_file=None, + result_file=None, + num_results=None, + delimited=None, ): evaluator = g_config.model_config.evaluators.add() evaluator.type = type evaluator.name = MakeLayerNameInSubmodel(name) @@ -1125,19 +1295,20 @@ def Evaluator( if delimited is not None: evaluator.delimited = delimited + class LayerBase(object): def __init__( self, name, type, - size, # size can be 0. In this case, subclass should set it. + size, # size can be 0. In this case, subclass should set it. inputs, device=None, active_type="", drop_rate=0., coeff=None): config_assert('@' not in name, - "layer name: %s contain special character @" % name) + "layer name: %s contains special character @" % name) global g_current_submodel name = MakeLayerNameInSubmodel(name) @@ -1176,8 +1347,8 @@ class LayerBase(object): if type_of(input) == str: input_layer_name = input input_config = Input( - input_layer_name = input, - parameter_name = gen_parameter_name(name, input_index)) + input_layer_name=input, + parameter_name=gen_parameter_name(name, input_index)) input_layer_name = input_config.input_layer_name elif isinstance(input, Input): input_layer_name = input.input_layer_name @@ -1186,16 +1357,15 @@ class LayerBase(object): input_config.parameter_name = \ gen_parameter_name(name, input_index) elif isinstance(input, Operator): - self.operators.append(input); + self.operators.append(input) input.operator_conf.input_indices.append(input_index) input_config = Input(input.input_layer_names[0]) input_layer_name = input_config.input_layer_name else: - raise ValueError( - 'Wrong type for inputs: %s' % type_of(input)) + raise ValueError('Wrong type for inputs: %s' % type_of(input)) config_assert(input_layer_name in g_layer_map, - "Unknown input layer '%s' for layer %s" - % (input_layer_name, name)) + "Unknown input layer '%s' for layer %s" % + (input_layer_name, name)) self.inputs[input_index] = input_config layer_input = self.config.inputs.add() layer_input.input_layer_name = input_config.input_layer_name @@ -1207,26 +1377,26 @@ class LayerBase(object): g_current_submodel.layer_names.append(self.config.name) - def get_input_layer(self, input_index): return g_layer_map[self.config.inputs[input_index].input_layer_name] # will return the bias created if not *for_self* def create_bias_parameter( self, - bias, # True/False or BiasCfg + bias, # True/False or BiasCfg size, - dims = None, - for_self = True, # whether create bias for layer self - ): + dims=None, + for_self=True, # whether to create bias for layer self + ): if size == 0: return if dims is None: dims = [1, size] - config_assert(type_of(bias) == bool or type_of(bias) == Bias, - 'Incorrect type for bias: %s' % type_of(bias)) + config_assert( + type_of(bias) == bool or type_of(bias) == Bias, + 'Incorrect type for bias: %s' % type_of(bias)) if type_of(bias) == bool: if bias: @@ -1241,7 +1411,8 @@ class LayerBase(object): Parameter( bias.parameter_name, size, - self.config.device if self.config.HasField('device') else None, +
self.config.device + if self.config.HasField('device') else None, dims, bias.learning_rate, bias.momentum, @@ -1253,22 +1424,21 @@ class LayerBase(object): initial_smart=bias.initial_smart, num_batches_regularization=bias.num_batches_regularization, sparse_remote_update=bias.sparse_remote_update, - gradient_clipping_threshold=bias.gradient_clipping_threshold, + gradient_clipping_threshold=bias. + gradient_clipping_threshold, is_static=bias.is_static, - is_shared=bias.is_shared, - ) + is_shared=bias.is_shared, ) if for_self: self.config.bias_parameter_name = bias.parameter_name else: return bias.parameter_name - def create_input_parameter( - self, - input_index, - size, - dims=None, - sparse = None, - format = None): + def create_input_parameter(self, + input_index, + size, + dims=None, + sparse=None, + format=None): if dims is None: # TODO(yuyang18): print warning and callstack here! dims = list() @@ -1283,12 +1453,12 @@ class LayerBase(object): if input_config.parameter_name in g_parameter_map: para = g_parameter_map[input_config.parameter_name] - config_assert(size == para.size, ('Shared parameter "%s" does not ' - + 'have same size: %s vs. %s') + config_assert(size == para.size, ( + 'Shared parameter "%s" does not ' + 'have same size: %s vs. %s') % (input_config.parameter_name, para.size, size)) - config_assert(dims == para.dims, ('Shared parameter "%s" does not ' - + 'have same dims: %s vs. %s') + config_assert(dims == para.dims, ( + 'Shared parameter "%s" does not ' + 'have same dims: %s vs. %s') % (input_config.parameter_name, para.dims, dims)) return @@ -1308,13 +1478,13 @@ class LayerBase(object): num_batches_regularization=input_config.num_batches_regularization, sparse_remote_update=input_config.sparse_remote_update, sparse_update=input_config.sparse_update, - gradient_clipping_threshold=input_config.gradient_clipping_threshold, + gradient_clipping_threshold=input_config. 
+ gradient_clipping_threshold, sparse=sparse, format=format, is_static=input_config.is_static, is_shared=input_config.is_shared, - update_hooks=input_config.update_hooks - ) + update_hooks=input_config.update_hooks) def set_layer_size(self, size): if self.config.size == 0: @@ -1324,27 +1494,18 @@ class LayerBase(object): 'Different inputs result in ' + 'different layer size at layer %s' % self.config.name) + @config_layer('multi_class_cross_entropy_with_selfnorm') class MultiClassCrossEntropySelfNormCostLayer(LayerBase): - def __init__( - self, - name, - inputs, - softmax_selfnorm_alpha=0.1, - **xargs): - super(MultiClassCrossEntropySelfNormCostLayer, self).__init__(name, - 'multi_class_cross_entropy_with_selfnorm', 0, inputs, **xargs) + def __init__(self, name, inputs, softmax_selfnorm_alpha=0.1, **xargs): + super(MultiClassCrossEntropySelfNormCostLayer, self).__init__( + name, 'multi_class_cross_entropy_with_selfnorm', 0, inputs, **xargs) self.config.softmax_selfnorm_alpha = softmax_selfnorm_alpha + @config_layer('fc') class FCLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - bias=True, - **xargs): + def __init__(self, name, size, inputs, bias=True, **xargs): super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) @@ -1358,22 +1519,23 @@ class FCLayer(LayerBase): else: sparse = None - self.create_input_parameter(input_index, psize, dims, sparse, format) + self.create_input_parameter(input_index, psize, dims, sparse, + format) self.create_bias_parameter(bias, self.config.size) + @config_layer('selective_fc') class SelectiveFCLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - bias=True, - selective_fc_pass_generation=False, - has_selected_colums=True, - selective_fc_full_mul_ratio=0.02, - selective_fc_parallel_plain_mul_thread_num=None, - **xargs): + def __init__(self, + name, + size, + inputs, + bias=True, + selective_fc_pass_generation=False, + has_selected_colums=True, + selective_fc_full_mul_ratio=0.02, + selective_fc_parallel_plain_mul_thread_num=None, + **xargs): super(SelectiveFCLayer, self).__init__( name, 'selective_fc', size, inputs=inputs, **xargs) # user MUST know if selective fc is used in training, @@ -1394,8 +1556,8 @@ class SelectiveFCLayer(LayerBase): input_num = len(self.inputs) if has_selected_colums: config_assert(input_num >= 2, - ("if indices of selected columns are not specified, " - "selective_fc Layer has at least two inputs")) + ("if indices of selected columns are not specified, " + "the selective_fc layer must have at least two inputs")) input_num -= 1 for input_index in xrange(input_num): @@ -1408,26 +1570,23 @@ class SelectiveFCLayer(LayerBase): if sparse: psize = self.inputs[input_index].nnz - self.create_input_parameter( - input_index, psize, dims, sparse, format) + self.create_input_parameter(input_index, psize, dims, sparse, + format) self.create_bias_parameter(bias, self.config.size) + @config_layer('print') class PrintLayer(LayerBase): - def __init__( - self, - name, - inputs): + def __init__(self, name, inputs): super(PrintLayer, self).__init__(name, 'print', 0, inputs) + @config_layer('data') class DataLayer(LayerBase): - def __init__( - self, - name, - size, - device=None): - super(DataLayer, self).__init__(name, 'data' , size, inputs=[], device=device) + def __init__(self, name, size, device=None): + super(DataLayer, self).__init__( + name, 'data', size, inputs=[], device=device) + ''' DataNormLayer: A layer for data
normalization @@ -1455,14 +1614,11 @@ Note: min-max: y = (x-min)/(max-min) decimal-scaling: y = x/10^j, where j is the smallest integer such that max(|y|)<1 ''' + + @config_layer('data_norm') class DataNormLayer(LayerBase): - def __init__( - self, - name, - inputs, - data_norm_strategy="z-score", - device=None): + def __init__(self, name, inputs, data_norm_strategy="z-score", device=None): super(DataNormLayer, self).__init__( name, 'data_norm', 0, inputs=inputs, device=device) self.config.data_norm_strategy = data_norm_strategy @@ -1474,15 +1630,12 @@ class DataNormLayer(LayerBase): self.inputs[0].is_static = True self.create_input_parameter(0, para_size, para_dims) + @config_layer('prelu') class ParameterReluLayer(LayerBase): layer_type = 'prelu' - def __init__( - self, - name, - inputs, - partial_sum = 1, - **args): + + def __init__(self, name, inputs, partial_sum=1, **args): super(ParameterReluLayer, self).__init__( name, self.layer_type, 0, inputs=inputs, **args) config_assert(len(self.inputs) == 1) @@ -1491,17 +1644,18 @@ class ParameterReluLayer(LayerBase): self.set_layer_size(input_layer.size) self.create_input_parameter(0, input_layer.size / partial_sum) + @config_layer('conv') class ConvLayerBase(LayerBase): layer_type = 'conv' - def __init__( - self, - name, - inputs=[], - bias=True, - num_filters=None, - shared_biases=False, - **xargs): + + def __init__(self, + name, + inputs=[], + bias=True, + num_filters=None, + shared_biases=False, + **xargs): super(ConvLayerBase, self).__init__( name, self.layer_type, 0, inputs=inputs, **xargs) @@ -1518,7 +1672,7 @@ class ConvLayerBase(LayerBase): config_assert(use_gpu, "cudnn_conv only support GPU") if (use_gpu == 1 and self.layer_type != "exconv" and - (parallel_nn == 0 or self.config.device > -1)): + (parallel_nn == 0 or self.config.device > -1)): self.layer_type = "cudnn_conv" else: self.layer_type = "exconv" @@ -1530,16 +1684,14 @@ class ConvLayerBase(LayerBase): for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_conv( - self.inputs[input_index].conv, - input_layer.name, - self.config.inputs[input_index].conv_conf) + parse_conv(self.inputs[input_index].conv, input_layer.name, + self.config.inputs[input_index].conv_conf, num_filters) conv_conf = self.config.inputs[input_index].conv_conf psize = self.calc_parameter_size(conv_conf) print("output size for %s is %d " % (name, conv_conf.output_x)) self.create_input_parameter(input_index, psize) self.set_layer_size( - (conv_conf.output_x ** 2) * self.config.num_filters) + (conv_conf.output_x**2) * self.config.num_filters) psize = self.config.size if shared_biases: @@ -1550,70 +1702,139 @@ class ConvLayerBase(LayerBase): return self.config.num_filters * conv_conf.filter_channels \ * (conv_conf.filter_size * conv_conf.filter_size_y) + @config_layer('exconv') class ConvLayer(ConvLayerBase): layer_type = 'exconv' + @config_layer('cudnn_conv') class ConvLayer(ConvLayerBase): layer_type = 'cudnn_conv' -@config_layer('norm') -class NormLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): - super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, device=device) + +@config_layer('convt') +class ConvTransLayerBase(LayerBase): + layer_type = 'convt' + + def __init__(self, + name, + inputs=[], + bias=True, + num_filters=None, + shared_biases=False, + **xargs): + super(ConvTransLayerBase, self).__init__( + name, self.layer_type, 0, inputs=inputs, **xargs) + + if num_filters is not None: + self.config.num_filters = num_filters + 
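+ # exconvt runs the convolution "backwards": parse_conv(trans=True) below + # treats this layer's input as the conv output, so the conv_conf.img_size + # it computes is the output image size, recovered via cnn_image_size().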
+ use_gpu = int(g_command_config_args.get("use_gpu", 0)) + parallel_nn = int(g_command_config_args.get("parallel_nn", 0)) + + # cudnn_convt has not been implemented so use exconvt only + self.layer_type = "exconvt" + # need to specify layer in config + self.config.type = self.layer_type + + if shared_biases is not None: + self.config.shared_biases = shared_biases + for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_norm( - self.inputs[input_index].norm, + parse_conv( + self.inputs[input_index].conv, input_layer.name, - self.config.inputs[input_index].norm_conf) - norm_conf = self.config.inputs[input_index].norm_conf - self.set_layer_size((norm_conf.output_x ** 2) * norm_conf.channels) + self.config.inputs[input_index].conv_conf, + num_filters, + trans=True) + conv_conf = self.config.inputs[input_index].conv_conf + psize = self.calc_parameter_size(conv_conf) + print("output size for %s is %d " % (name, conv_conf.output_x)) + self.create_input_parameter(input_index, psize) + self.set_layer_size( + (conv_conf.img_size**2) * self.config.num_filters) -@config_layer('pool') + psize = self.config.size + if shared_biases: + psize = self.config.num_filters + self.create_bias_parameter(bias, psize, [psize, 1]) + + def calc_parameter_size(self, conv_conf): + return conv_conf.channels * conv_conf.filter_channels \ + * (conv_conf.filter_size * conv_conf.filter_size_y) + + +@config_layer('exconvt') +class ConvTransLayer(ConvTransLayerBase): + layer_type = 'exconvt' + + +@config_layer('norm') +class NormLayer(LayerBase): + def __init__(self, name, inputs, device=None): + super(NormLayer, self).__init__( + name, 'norm', 0, inputs=inputs, device=device) + for input_index in xrange(len(self.inputs)): + input_layer = self.get_input_layer(input_index) + parse_norm(self.inputs[input_index].norm, input_layer.name, + self.config.inputs[input_index].norm_conf) + norm_conf = self.config.inputs[input_index].norm_conf + self.set_layer_size((norm_conf.output_x**2) * norm_conf.channels) + + +@config_layer('pool') class PoolLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): - super(PoolLayer, self).__init__(name, 'pool', 0, inputs=inputs, device=device) + def __init__(self, name, inputs, device=None): + super(PoolLayer, self).__init__( + name, 'pool', 0, inputs=inputs, device=device) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_pool( - self.inputs[input_index].pool, - input_layer.name, - self.config.inputs[input_index].pool_conf) + parse_pool(self.inputs[input_index].pool, input_layer.name, + self.config.inputs[input_index].pool_conf) pool_conf = self.config.inputs[input_index].pool_conf - print("output size for %s is %d*%d " % ( - name, pool_conf.output_y, pool_conf.output_x)) - self.set_layer_size((pool_conf.output_x * pool_conf.output_y) * pool_conf.channels) + print("output size for %s is %d*%d " % (name, pool_conf.output_y, + pool_conf.output_x)) + self.set_layer_size( + (pool_conf.output_x * pool_conf.output_y) * pool_conf.channels) + + +@config_layer('spp') +class SpatialPyramidPoolLayer(LayerBase): + def __init__(self, name, inputs, device=None): + super(SpatialPyramidPoolLayer, self).__init__( + name, 'spp', 0, inputs=inputs, device=device) + for input_index in xrange(len(self.inputs)): + input_layer = self.get_input_layer(input_index) + parse_spp(self.inputs[input_index].spp, input_layer.name, + self.config.inputs[input_index].spp_conf) + spp_conf = 
self.config.inputs[input_index].spp_conf + output_size = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1) + print("output size for %s is %d " % (name, output_size)) + self.set_layer_size(output_size * spp_conf.channels) + @config_layer('batch_norm') class BatchNormLayer(LayerBase): layer_type = 'batch_norm' - def __init__( - self, - name, - inputs, - active_type="linear", - bias=True, - device=None, - use_global_stats=True, - moving_average_fraction=0.9, - batch_norm_type=None, - **xargs): + + def __init__(self, + name, + inputs, + active_type="linear", + bias=True, + device=None, + use_global_stats=True, + moving_average_fraction=0.9, + batch_norm_type=None, + **xargs): if inputs is None: inputs = [] elif not isinstance(inputs, list): inputs = [inputs] - config_assert(len(inputs) == 1, - "BatchNormLayer must have one and only one input") + config_assert( + len(inputs) == 1, "BatchNormLayer must have one and only one input") # Create Input for moving mean and std, # in batch normalization layer. # These paras no need to update, so set is_static is true. @@ -1622,12 +1843,13 @@ class BatchNormLayer(LayerBase): use_gpu = bool(int(g_command_config_args.get("use_gpu", 0))) is_shared = True if not use_gpu else False for i in xrange(2): - inputs.append(Input(inputs[0].input_layer_name, - initial_std=0.0, - initial_mean=0.0, - is_static=True, - is_shared=is_shared, - )) + inputs.append( + Input( + inputs[0].input_layer_name, + initial_std=0.0, + initial_mean=0.0, + is_static=True, + is_shared=is_shared, )) parallel_nn = bool(int(g_command_config_args.get("parallel_nn", 0))) cudnn_version = int(g_command_config_args.get("cudnn_version", 0)) @@ -1637,21 +1859,25 @@ class BatchNormLayer(LayerBase): ((not parallel_nn) or self.config.device > -1) and \ cudnn_version >= 4007 self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm" - super(BatchNormLayer, self).__init__(name, self.layer_type, 0, - active_type=active_type, - inputs=inputs, device=device, **xargs) + super(BatchNormLayer, self).__init__( + name, + self.layer_type, + 0, + active_type=active_type, + inputs=inputs, + device=device, + **xargs) if use_global_stats is not None: self.config.use_global_stats = use_global_stats if moving_average_fraction is not None: self.config.moving_average_fraction = moving_average_fraction - input_layer= self.get_input_layer(0) - parse_image(self.inputs[0].image, - input_layer.name, + input_layer = self.get_input_layer(0) + parse_image(self.inputs[0].image, input_layer.name, self.config.inputs[0].image_conf) image_conf = self.config.inputs[0].image_conf - self.set_layer_size((image_conf.img_size ** 2) * image_conf.channels) + self.set_layer_size((image_conf.img_size**2) * image_conf.channels) psize = self.calc_parameter_size(image_conf) dims = [1, psize] @@ -1664,60 +1890,74 @@ class BatchNormLayer(LayerBase): def calc_parameter_size(self, image_conf): return image_conf.channels + @config_layer('trans') class TransLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): - super(TransLayer, self).__init__(name, 'trans', 0, inputs=inputs, device=device) - config_assert(len(self.inputs) == 1, - 'TransLayer must have one and only one input') + def __init__(self, name, inputs, device=None): + super(TransLayer, self).__init__( + name, 'trans', 0, inputs=inputs, device=device) + config_assert( + len(self.inputs) == 1, + 'TransLayer must have one and only one input') self.set_layer_size(self.get_input_layer(0).size) + @config_layer('resize') class ResizeLayer(LayerBase): - def __init__( - 
self, - name, - size, - inputs, - device=None): - super(ResizeLayer, self).__init__(name, 'resize', size=size, inputs=inputs, device=device) - config_assert(len(self.inputs) == 1, - 'ResizeLayer must have one and only one input') + def __init__(self, name, size, inputs, device=None): + super(ResizeLayer, self).__init__( + name, 'resize', size=size, inputs=inputs, device=device) + config_assert( + len(self.inputs) == 1, + 'ResizeLayer must have one and only one input') + @config_layer('blockexpand') class BlockExpandLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): - super(BlockExpandLayer, self).__init__(name, 'blockexpand', 0, inputs=inputs, device=device) + def __init__(self, name, inputs, device=None): + super(BlockExpandLayer, self).__init__( + name, 'blockexpand', 0, inputs=inputs, device=device) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) - parse_block_expand(self.inputs[input_index].block_expand, - input_layer.name, + parse_block_expand( + self.inputs[input_index].block_expand, input_layer.name, self.config.inputs[input_index].block_expand_conf) - block_expand_conf = self.config.inputs[input_index].block_expand_conf - self.set_layer_size(block_expand_conf.block_x * block_expand_conf.block_y - * block_expand_conf.channels) + block_expand_conf = self.config.inputs[ + input_index].block_expand_conf + self.set_layer_size(block_expand_conf.block_x * + block_expand_conf.block_y * + block_expand_conf.channels) + + +@config_layer('maxout') +class MaxOutLayer(LayerBase): + def __init__(self, name, inputs, **xargs): + super(MaxOutLayer, self).__init__( + name, 'maxout', 0, inputs=inputs, **xargs) + input_layer = self.get_input_layer(0) + parse_maxout(self.inputs[0].maxout, input_layer.name, + self.config.inputs[0].maxout_conf) + maxout_conf = self.config.inputs[0].maxout_conf + self.set_layer_size(g_layer_map[input_layer.name].size / + maxout_conf.groups) + # key: cost type # value: cost class g_cost_map = {} + # define a cost layer without any parameters def define_cost(class_name, cost_type): def init(cls, name, inputs, device=None, coeff=1.): - super(type(cls), cls).__init__(name, cost_type, 1, inputs, device=device, coeff=coeff) + super(type(cls), cls).__init__( + name, cost_type, 1, inputs, device=device, coeff=coeff) - cls = type(class_name, (LayerBase,), dict(__init__=init)) + cls = type(class_name, (LayerBase, ), dict(__init__=init)) global g_cost_map g_cost_map[cost_type] = cls + define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy') define_cost('RankingCost', 'rank-cost') define_cost('AucValidation', 'auc-validation') @@ -1726,20 +1966,17 @@ define_cost('SumOfSquaresCostLayer', 'square_error') define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy') define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy') define_cost('HuberTwoClass', 'huber') +define_cost('SumCost', 'sum_cost') + @config_layer('hsigmoid') class HierarchicalSigmoidLayer(LayerBase): - def __init__( - self, - name, - num_classes, - inputs, - device=None, - bias=True): + def __init__(self, name, num_classes, inputs, device=None, bias=True): super(HierarchicalSigmoidLayer, self).__init__( name, 'hsigmoid', 1, inputs=inputs, device=device) - config_assert(len(self.inputs) >= 2, - 'HierarchicalSigmoidLayer must have at least 2 inputs') + config_assert( + len(self.inputs) >= 2, + 'HierarchicalSigmoidLayer must have at least 2 inputs') self.config.num_classes = num_classes for input_index in 
xrange(len(self.inputs) - 1): input_layer = self.get_input_layer(input_index) @@ -1748,6 +1985,7 @@ class HierarchicalSigmoidLayer(LayerBase): self.create_input_parameter(input_index, psize, dims) self.create_bias_parameter(bias, num_classes - 1) + ''' lambdaCost for lambdaRank LTR approach @@ -1772,59 +2010,57 @@ Usage: max_sort_size can be greater than the size of a list, in which case the algorithm will sort the entire list to get gradient. ''' + + @config_layer('lambda_cost') class LambdaCost(LayerBase): - def __init__( - self, - name, - inputs, - NDCG_num = 5, - max_sort_size = -1, - device=None): + def __init__(self, name, inputs, NDCG_num=5, max_sort_size=-1, device=None): super(LambdaCost, self).__init__( name, 'lambda_cost', 1, inputs=inputs, device=device) - config_assert(len(self.inputs) == 2, - 'lambdaCost must have 2 inputs') + config_assert(len(self.inputs) == 2, 'lambdaCost must have 2 inputs') self.config.NDCG_num = NDCG_num if max_sort_size != -1: - config_assert(NDCG_num <= max_sort_size, - 'NDCG_num must be less than or equal to max_sort_size') + config_assert( + NDCG_num <= max_sort_size, + 'NDCG_num must be less than or equal to max_sort_size') self.config.max_sort_size = max_sort_size + @config_layer('nce') class NCELayer(LayerBase): - def __init__( - self, - name, - num_classes, - inputs, - num_neg_samples=10, - neg_sampling_dist=None, - bias=True, - **xargs): + def __init__(self, + name, + num_classes, + inputs, + num_neg_samples=10, + neg_sampling_dist=None, + bias=True, + **xargs): super(NCELayer, self).__init__(name, 'nce', 1, inputs=inputs, **xargs) - config_assert(len(self.inputs) >= 2, - 'NCELayer must have at least 2 inputs') + config_assert( + len(self.inputs) >= 2, 'NCELayer must have at least 2 inputs') self.config.num_classes = num_classes if neg_sampling_dist is not None: - config_assert(len(neg_sampling_dist) == num_classes, - 'len(neg_sampling_dist)(%s) is not same as num_classes (%s)' - % (len(neg_sampling_dist), num_classes)) + config_assert( + len(neg_sampling_dist) == num_classes, + 'len(neg_sampling_dist)(%s) is not same as num_classes (%s)' % + (len(neg_sampling_dist), num_classes)) s = sum(neg_sampling_dist) - config_assert(abs(s - 1) < 1e-5, - 'The sum of neg_sampling_dist (%s) is not 1' % s) + config_assert( + abs(s - 1) < 1e-5, + 'The sum of neg_sampling_dist (%s) is not 1' % s) self.config.neg_sampling_dist.extend(neg_sampling_dist) self.config.num_neg_samples = num_neg_samples num_real_inputs = len(self.inputs) - 1 - input_layer = self.get_input_layer(num_real_inputs) + input_layer = self.get_input_layer(num_real_inputs) config_assert(input_layer.type == 'data', 'Expecting the last input layer of an nce layer to be ' 'a data layer') - if (num_real_inputs > 1 and input_layer.size == 1 - and self.get_input_layer(num_real_inputs - 1).type == 'data'): + if (num_real_inputs > 1 and input_layer.size == 1 and + self.get_input_layer(num_real_inputs - 1).type == 'data'): # This input layer is assumed to be a sample weight layer num_real_inputs -= 1 @@ -1838,105 +2074,82 @@ class NCELayer(LayerBase): @config_layer('addto') class AddToLayer(LayerBase): - def __init__( - self, - name, - inputs, - bias=True, - **xargs): + def __init__(self, name, inputs, bias=True, **xargs): super(AddToLayer, self).__init__( name, 'addto', 0, inputs=inputs, **xargs) - config_assert(len(inputs) > 0, - 'inputs cannot be empty for AddToLayer') + config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer') for input_index in xrange(len(self.inputs)): input_layer 
= self.get_input_layer(input_index) self.set_layer_size(input_layer.size) self.create_bias_parameter(bias, self.config.size) + @config_layer('agent') class AgentLayer(LayerBase): - def __init__( - self, - name, - size, - device=None): - super(AgentLayer, self).__init__(name, 'agent', size, inputs=[], device=device) + def __init__(self, name, size, device=None): + super(AgentLayer, self).__init__( + name, 'agent', size, inputs=[], device=device) + @config_layer('sequence_agent') class SequenceAgentLayer(LayerBase): - def __init__( - self, - name, - size, - device=None): + def __init__(self, name, size, device=None): super(SequenceAgentLayer, self).__init__( name, 'sequence_agent', size, inputs=[], device=device) + @config_layer('gather_agent') class GatherAgentLayer(LayerBase): - def __init__( - self, - name, - size, - device=None): + def __init__(self, name, size, device=None): super(GatherAgentLayer, self).__init__( name, 'gather_agent', size, inputs=[], device=device) + @config_layer('scatter_agent') class ScatterAgentLayer(LayerBase): - def __init__( - self, - name, - size, - device=None): + def __init__(self, name, size, device=None): super(ScatterAgentLayer, self).__init__( name, 'scatter_agent', size, inputs=[], device=device) + @config_layer('sequence_gather_agent') class SequenceGatherAgentLayer(LayerBase): - def __init__( - self, - name, - size, - device=None): + def __init__(self, name, size, device=None): super(SequenceGatherAgentLayer, self).__init__( - name, 'sequence_gather_agent', size, inputs=[], device=device) + name, 'sequence_gather_agent', size, inputs=[], device=device) + @config_layer('sequence_scatter_agent') class SequenceScatterAgentLayer(LayerBase): - def __init__( - self, - name, - size, - device=None): + def __init__(self, name, size, device=None): super(SequenceScatterAgentLayer, self).__init__( - name, 'sequence_scatter_agent', size, inputs=[], device=device) + name, 'sequence_scatter_agent', size, inputs=[], device=device) + @config_layer('multiplex') class MultiplexLayer(LayerBase): - def __init__( - self, - name, - inputs, - size, - device=None): - super(MultiplexLayer, self).__init__(name, 'multiplex', size, inputs=inputs, device=device) - config_assert(len(inputs) > 2, - 'MultiplexLayer should have more than 2 inputs.') + def __init__(self, name, inputs, size, device=None): + super(MultiplexLayer, self).__init__( + name, 'multiplex', size, inputs=inputs, device=device) + config_assert( + len(inputs) > 2, 'MultiplexLayer should have more than 2 inputs.') for i in range(1, len(inputs)): - config_assert(self.get_input_layer(i).size == size, - "All the input layers except the first one should" - "have the same size as the MultiplexLayer.") + config_assert( + self.get_input_layer(i).size == size, + "All the input layers except the first one should " + "have the same size as the MultiplexLayer.") + @config_func -def Link(name, - has_subseq=False, - ): +def Link( + name, + has_subseq=False, ): link_config = LinkConfig() link_config.link_name = name link_config.has_subseq = has_subseq return link_config + # memory for recurrent layer group. # *name* and *size* are actual layer's name and size. # will return name of the memory, @@ -1951,43 +2164,46 @@ def Link(name, # can only be initialized by a *boot_layer* which is a sequence.
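# # A minimal usage sketch (hypothetical layer names): # state = Memory(name="rnn_state", size=128) # returns "rnn_state+delay1" # then feed "rnn_state+delay1" as an input wherever the previous step's # value of layer "rnn_state" is needed.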
# @config_func -def Memory(name, - size, - is_sequence=False, - boot_layer=None, - boot_bias=False, - boot_bias_active_type="", - boot_with_const_id=None, - ): +def Memory( + name, + size, + is_sequence=False, + boot_layer=None, + boot_bias=False, + boot_bias_active_type="", + boot_with_const_id=None, ): agent_name = name + "+delay1" if is_sequence: agent_layer = SequenceAgentLayer(agent_name, size) else: agent_layer = AgentLayer(agent_name, size) config_assert(g_current_submodel.is_recurrent_layer_group, - 'Memory should be used in recurrent layer group only') + 'Memory should be used in recurrent layer group only') memory = g_current_submodel.memories.add() memory.layer_name = MakeLayerNameInSubmodel(name) memory.link_name = MakeLayerNameInSubmodel(agent_name) memory.is_sequence = is_sequence - options = sum((boot_layer is not None, - bool(boot_bias), + options = sum((boot_layer is not None, bool(boot_bias), boot_with_const_id is not None)) - config_assert(options <= 1, - 'take one option at most from boot_layer, boot_bias, or boot_with_const_id') + config_assert( + options <= 1, + 'take one option at most from boot_layer, boot_bias, or boot_with_const_id' + ) if boot_layer is not None: boot_layer = MakeLayerNameInParentSubmodel(boot_layer) config_assert(boot_layer in g_layer_map, - 'boot_layer "%s" does not correspond to a layer name' % boot_layer) + 'boot_layer "%s" does not correspond to a layer name' % + boot_layer) memory.boot_layer_name = boot_layer elif boot_bias: memory.boot_bias_parameter_name = agent_layer.create_bias_parameter( - boot_bias, size, for_self = False) + boot_bias, size, for_self=False) memory.boot_bias_active_type = boot_bias_active_type elif boot_with_const_id is not None: memory.boot_with_const_id = boot_with_const_id return agent_name + # Generator for recurrent layer group, to use it: # 1. define an id layer as output of layer group # 2.
define a memory of this id layer, and assign a boot id (begin of sequence) @@ -1999,11 +2215,10 @@ def Memory(name, @config_func def Generator( max_num_frames, - eos_layer_name = "eos_check", - num_results_per_sample = 1, - beam_size = 1, - log_prob = None, - ): + eos_layer_name="eos_check", + num_results_per_sample=1, + beam_size=1, + log_prob=None, ): generator_config = GeneratorConfig() generator_config.max_num_frames = max_num_frames generator_config.eos_layer_name = eos_layer_name @@ -2013,60 +2228,55 @@ def Generator( generator_config.log_prob = log_prob return generator_config + @config_layer('expand') class ExpandLayer(LayerBase): - def __init__( - self, - name, - inputs, - trans_type='non-seq', - device=None, - bias=False): - super(ExpandLayer, self).__init__( - name, 'expand', 0, inputs=inputs, device=device) - config_assert(len(self.inputs) == 2, - 'ExpandLayer takes 2 and only 2 inputs') - self.config.trans_type = trans_type - for input_index in xrange(len(self.inputs)): - input_layer = self.get_input_layer(input_index) - self.set_layer_size(self.get_input_layer(0).size) - self.create_bias_parameter(bias, self.config.size) + def __init__(self, + name, + inputs, + trans_type='non-seq', + device=None, + bias=False): + super(ExpandLayer, self).__init__( + name, 'expand', 0, inputs=inputs, device=device) + config_assert( + len(self.inputs) == 2, 'ExpandLayer takes 2 and only 2 inputs') + self.config.trans_type = trans_type + for input_index in xrange(len(self.inputs)): + input_layer = self.get_input_layer(input_index) + self.set_layer_size(self.get_input_layer(0).size) + self.create_bias_parameter(bias, self.config.size) + @config_layer('featmap_expand') class FeatMapExpandLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None, - num_filters=None, - bias=False): - super(FeatMapExpandLayer, self).__init__( - name, 'featmap_expand', 0, inputs=inputs, device=device) - config_assert(len(self.inputs) == 1, - 'ExpandLayer takes 1 and only 1 inputs') - if num_filters is not None: + def __init__(self, name, inputs, device=None, num_filters=None, bias=False): + super(FeatMapExpandLayer, self).__init__( + name, 'featmap_expand', 0, inputs=inputs, device=device) + config_assert( + len(self.inputs) == 1, 'FeatMapExpandLayer takes 1 and only 1 input') + if num_filters is not None: self.config.num_filters = num_filters - else: + else: logger.fatal("FeatMapExpandLayer must specify num_filters.") - self.set_layer_size(self.get_input_layer(0).size * num_filters) + self.set_layer_size(self.get_input_layer(0).size * num_filters) @config_layer('max') class MaxLayer(LayerBase): - def __init__( - self, - name, - inputs, - trans_type='non-seq', - active_type='linear', - device=None, - bias=False, - output_max_index=None): - super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, device=device) + def __init__(self, + name, + inputs, + trans_type='non-seq', + active_type='linear', + device=None, + bias=False, + output_max_index=None): + super(MaxLayer, self).__init__( + name, 'max', 0, inputs=inputs, device=device) config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input') - self.config.trans_type = trans_type - self.config.active_type = active_type + self.config.trans_type = trans_type + self.config.active_type = active_type for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) self.set_layer_size(input_layer.size) @@ -2077,12 +2287,7 @@ class MaxLayer(LayerBase): @config_layer('maxid') class MaxIdLayer(LayerBase): - def __init__( - self, -
name, - inputs, - beam_size=None, - device=None): + def __init__(self, name, inputs, beam_size=None, device=None): super(MaxIdLayer, self).__init__( name, 'maxid', 0, inputs=inputs, device=device) config_assert(len(self.inputs) == 1, 'MaxIdLayer must have 1 input') @@ -2100,37 +2305,39 @@ class MaxIdLayer(LayerBase): @config_layer('eos_id') class EosIdLayer(LayerBase): - def __init__( - self, - name, - inputs, - eos_id, - device=None): + def __init__(self, name, inputs, eos_id, device=None): super(EosIdLayer, self).__init__( name, 'eos_id', 0, inputs=inputs, device=device) config_assert(len(self.inputs) == 1, 'EosIdLayer must have 1 input') - self.set_layer_size(2) # boolean output + self.set_layer_size(2) # boolean output self.config.eos_id = eos_id + @config_layer('seqlastins') class SequenceLastInstanceLayer(LayerBase): - def __init__( - self, + def __init__(self, + name, + inputs, + active_type='linear', + trans_type='non-seq', + device=None, + bias=False): + super(SequenceLastInstanceLayer, self).__init__( name, - inputs, - active_type='linear', - trans_type='non-seq', - device=None, - bias=False): - super(SequenceLastInstanceLayer, self).__init__(name, 'seqlastins', - 0, inputs=inputs, device=device, active_type=active_type) - config_assert(len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input') - self.config.trans_type = trans_type + 'seqlastins', + 0, + inputs=inputs, + device=device, + active_type=active_type) + config_assert( + len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input') + self.config.trans_type = trans_type for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) self.set_layer_size(input_layer.size) self.create_bias_parameter(bias, self.config.size) + @config_layer('seqfirstins') class SequenceFirstInstanceLayer(SequenceLastInstanceLayer): def __init__( @@ -2140,167 +2347,163 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer): active_type='linear', trans_type='non-seq', device=None, - bias=False, - ): - super(SequenceFirstInstanceLayer, self).__init__(name, - inputs=inputs, active_type=active_type, device=device, bias=bias) - self.config.trans_type = trans_type + bias=False, ): + super(SequenceFirstInstanceLayer, self).__init__( + name, + inputs=inputs, + active_type=active_type, + device=device, + bias=bias) + self.config.trans_type = trans_type self.config.select_first = True + @config_layer('seqconcat') class SequenceConcatLayer(LayerBase): - def __init__( - self, + def __init__(self, + name, + inputs, + active_type='linear', + device=None, + bias=False): + super(SequenceConcatLayer, self).__init__( name, - inputs, - active_type='linear', - device=None, - bias=False): - super(SequenceConcatLayer, self).__init__(name, 'seqconcat', - 0, inputs=inputs, device=device, active_type=active_type) - config_assert(len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs') + 'seqconcat', + 0, + inputs=inputs, + device=device, + active_type=active_type) + config_assert( + len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs') for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) self.set_layer_size(input_layer.size) self.create_bias_parameter(bias, self.config.size) + @config_layer('seqreshape') class SequenceReshapeLayer(LayerBase): - def __init__( - self, + def __init__(self, + name, + size, + inputs, + active_type='linear', + device=None, + bias=False): + super(SequenceReshapeLayer, self).__init__( name, + 'seqreshape', size, - inputs, - active_type='linear', - 
device=None, - bias=False): - super(SequenceReshapeLayer, self).__init__(name, 'seqreshape', - size, inputs=inputs, device=device, active_type=active_type) - config_assert(len(inputs) == 1, 'SequenceReshapeLayer must have 1 inputs') + inputs=inputs, + device=device, + active_type=active_type) + config_assert( + len(inputs) == 1, 'SequenceReshapeLayer must have 1 input') self.set_layer_size(size) self.create_bias_parameter(bias, size) + @config_layer('subseq') class SubSequenceLayer(LayerBase): - def __init__( - self, + def __init__(self, + name, + inputs, + active_type='linear', + device=None, + bias=False): + super(SubSequenceLayer, self).__init__( name, - inputs, - active_type='linear', - device=None, - bias=False): - super(SubSequenceLayer, self).__init__(name, 'subseq', - 0, inputs=inputs, device=device, active_type=active_type) + 'subseq', + 0, + inputs=inputs, + device=device, + active_type=active_type) config_assert(len(inputs) == 3, 'SubSequenceLayer must have 3 inputs') input_layer0 = self.get_input_layer(0) size = input_layer0.size self.set_layer_size(size) self.create_bias_parameter(bias, size) + @config_layer('out_prod') class OuterProdLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): - super(OuterProdLayer, self).__init__(name, 'out_prod', - 0, inputs=inputs, device=device) + def __init__(self, name, inputs, device=None): + super(OuterProdLayer, self).__init__( + name, 'out_prod', 0, inputs=inputs, device=device) config_assert(len(inputs) == 2, 'OuterProdLayer must have 2 inputs') input_layer0 = self.get_input_layer(0) input_layer1 = self.get_input_layer(1) self.set_layer_size(input_layer0.size * input_layer1.size) + @config_layer('power') class PowerLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): - super(PowerLayer, self).__init__(name, 'power', - 0, inputs=inputs, device=device) + def __init__(self, name, inputs, device=None): + super(PowerLayer, self).__init__( + name, 'power', 0, inputs=inputs, device=device) config_assert(len(inputs) == 2, 'PowerLayer must have 2 inputs') input_layer1 = self.get_input_layer(1) self.set_layer_size(input_layer1.size) input_layer0 = self.get_input_layer(0) - config_assert(1==input_layer0.size, - 'The left input is the exponent and should be of size 1') + config_assert(1 == input_layer0.size, + 'The left input is the exponent and should be of size 1') + @config_layer('slope_intercept') class SlopeInterceptLayer(LayerBase): - def __init__( - self, - name, - inputs, - slope=1.0, - intercept=0.0, - device=None): - super(SlopeInterceptLayer, self).__init__(name, 'slope_intercept', - 0, inputs=inputs, device=device) + def __init__(self, name, inputs, slope=1.0, intercept=0.0, device=None): + super(SlopeInterceptLayer, self).__init__( + name, 'slope_intercept', 0, inputs=inputs, device=device) self.config.slope = slope self.config.intercept = intercept config_assert(len(inputs) == 1, 'SlopeInterceptLayer must have 1 input') input_layer0 = self.get_input_layer(0) self.set_layer_size(input_layer0.size) + @config_layer('scaling') class ScalingLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): - super(ScalingLayer, self).__init__(name, 'scaling', - 0, inputs=inputs, device=device) + def __init__(self, name, inputs, device=None): + super(ScalingLayer, self).__init__( + name, 'scaling', 0, inputs=inputs, device=device) config_assert(len(inputs) == 2, 'ScalingLayer must have 2 inputs') input_layer1 = self.get_input_layer(1) self.set_layer_size(input_layer1.size) input_layer0 =
self.get_input_layer(0) - config_assert(1==input_layer0.size, - 'The left input should be of size 1') + config_assert(1 == input_layer0.size, + 'The left input should be of size 1') + @config_layer('conv_shift') class ConvShiftLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): - super(ConvShiftLayer, self).__init__(name, 'conv_shift', - 0, inputs=inputs, device=device) + def __init__(self, name, inputs, device=None): + super(ConvShiftLayer, self).__init__( + name, 'conv_shift', 0, inputs=inputs, device=device) config_assert(len(inputs) == 2, 'ConvShiftLayer must have 2 inputs') input_layer0 = self.get_input_layer(0) self.set_layer_size(input_layer0.size) + @config_layer('convex_comb') class ConvexCombinationLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - device=None): + def __init__(self, name, size, inputs, device=None): super(ConvexCombinationLayer, self).__init__( - name, 'convex_comb', size, inputs=inputs, device=device) - config_assert(len(self.inputs) == 2, - 'ConvexCombinationLayer must have 2 inputs') + name, 'convex_comb', size, inputs=inputs, device=device) + config_assert( + len(self.inputs) == 2, 'ConvexCombinationLayer must have 2 inputs') config_assert( size * self.get_input_layer(0).size == self.get_input_layer(1).size, 'Wrong input size for ConvexCombinationLayer') self.set_layer_size(size) + @config_layer('interpolation') class InterpolationLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): + def __init__(self, name, inputs, device=None): super(InterpolationLayer, self).__init__( name, 'interpolation', 0, inputs=inputs, device=device) - config_assert(len(self.inputs) == 3, - 'InterpolationLayer must have 3 inputs') + config_assert( + len(self.inputs) == 3, 'InterpolationLayer must have 3 inputs') input_layer0 = self.get_input_layer(0) input_layer1 = self.get_input_layer(1) input_layer2 = self.get_input_layer(2) @@ -2309,48 +2512,51 @@ class InterpolationLayer(LayerBase): config_assert(input_layer1.size == input_layer2.size, 'the two vector inputs should be of the same size') + +@config_layer('bilinear_interp') +class BilinearInterpLayer(LayerBase): + def __init__(self, name, inputs, **xargs): + super(BilinearInterpLayer, self).__init__( + name, 'bilinear_interp', 0, inputs=inputs, **xargs) + input_layer = self.get_input_layer(0) + parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, + self.config.inputs[0].bilinear_interp_conf) + conf = self.inputs[0].bilinear_interp + self.set_layer_size(conf.out_size_x * conf.out_size_y * + conf.num_channels) + + @config_layer('sum_to_one_norm') class SumToOneNormLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): + def __init__(self, name, inputs, device=None): super(SumToOneNormLayer, self).__init__( - name, 'sum_to_one_norm', 0, inputs=inputs, device=device) - config_assert(len(self.inputs) == 1, - 'SumToOneNormLayer must have 1 input') + name, 'sum_to_one_norm', 0, inputs=inputs, device=device) + config_assert( + len(self.inputs) == 1, 'SumToOneNormLayer must have 1 input') input_layer0 = self.get_input_layer(0) self.set_layer_size(input_layer0.size) + @config_layer('cos_vm') class CosSimVecMatLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - cos_scale=1.0, - device=None): + def __init__(self, name, size, inputs, cos_scale=1.0, device=None): super(CosSimVecMatLayer, self).__init__( - name, 'cos_vm', size, inputs=inputs, device=device) + name, 'cos_vm', size, inputs=inputs, device=device) 
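# input 0 is a 1 x k query vector and input 1 is a matrix viewed as size # rows of width k, hence the shape check below: # size * input0.size == input1.size.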
self.config.cos_scale = cos_scale - config_assert(len(self.inputs) == 2, - 'CosSimVecMatLayer must have 2 inputs') + config_assert( + len(self.inputs) == 2, 'CosSimVecMatLayer must have 2 inputs') config_assert( size * self.get_input_layer(0).size == self.get_input_layer(1).size, 'Wrong input size for CosSimVecMatLayer') + @config_layer('sampling_id') class SamplingIdLayer(LayerBase): - def __init__( - self, - name, - inputs, - device=None): + def __init__(self, name, inputs, device=None): super(SamplingIdLayer, self).__init__( name, 'sampling_id', 0, inputs=inputs, device=device) - config_assert(len(self.inputs) == 1, 'SamplingIdLayer must have 1 input') + config_assert( + len(self.inputs) == 1, 'SamplingIdLayer must have 1 input') for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) self.set_layer_size(input_layer.size) @@ -2363,33 +2569,33 @@ class SamplingIdLayer(LayerBase): # 'squarerootn': sum each sample, but divide by sqrt(sample_num). @config_layer('average') class AverageLayer(LayerBase): - def __init__( - self, + def __init__(self, + name, + inputs, + average_strategy='average', + trans_type='non-seq', + active_type='linear', + device=None, + bias=False): + super(AverageLayer, self).__init__( name, - inputs, - average_strategy='average', - trans_type='non-seq', - active_type='linear', - device=None, - bias=False): - super(AverageLayer, self).__init__(name, 'average', 0, inputs=inputs, - device=device, active_type=active_type) + 'average', + 0, + inputs=inputs, + device=device, + active_type=active_type) self.config.average_strategy = average_strategy - self.config.trans_type = trans_type + self.config.trans_type = trans_type config_assert(len(inputs) == 1, 'AverageLayer must have 1 input') for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) self.set_layer_size(input_layer.size) self.create_bias_parameter(bias, self.config.size) + @config_layer('cos') class CosSimLayer(LayerBase): - def __init__( - self, - name, - inputs, - cos_scale=5, - device=None): + def __init__(self, name, inputs, cos_scale=5, device=None): super(CosSimLayer, self).__init__( name, 'cos', 1, inputs=inputs, device=device) config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs') @@ -2401,18 +2607,13 @@ class CosSimLayer(LayerBase): @config_layer('tensor') class TensorLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - device=None, - bias=True, - **xargs): - super(TensorLayer, self).__init__(name, 'tensor', size, inputs=inputs, device=device, **xargs) + def __init__(self, name, size, inputs, device=None, bias=True, **xargs): + super(TensorLayer, self).__init__( + name, 'tensor', size, inputs=inputs, device=device, **xargs) config_assert(len(self.inputs) == 2, 'TensorLayer must have 2 inputs') config_assert(size > 0, 'size must be positive') - config_assert(inputs[1].parameter_name == None, 'second parameter should be None.') + config_assert(inputs[1].parameter_name == None, + 'second parameter should be None.') input_layer0 = self.get_input_layer(0) input_layer1 = self.get_input_layer(1) psize = size * input_layer0.size * input_layer1.size @@ -2423,14 +2624,13 @@ class TensorLayer(LayerBase): @config_layer('mixed') class MixedLayer(LayerBase): - def __init__( - self, - name, - inputs, - size=0, - bias=True, - error_clipping_threshold=None, - **xargs): + def __init__(self, + name, + inputs, + size=0, + bias=True, + error_clipping_threshold=None, + **xargs): config_assert(inputs, 'inputs cannot be 
empty') super(MixedLayer, self).__init__( name, 'mixed', size, inputs=inputs, **xargs) @@ -2455,24 +2655,28 @@ class MixedLayer(LayerBase): else: sz = operator.calc_output_size(operator_conf.input_sizes) if sz != 0: - config_assert(sz == self.config.size, - "different inputs have different size: %s vs. %s" % - (sz, self.config.size)) + config_assert( + sz == self.config.size, + "different inputs have different size: %s vs. %s" % + (sz, self.config.size)) for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) input = self.inputs[input_index] if input_index not in operator_input_index: - config_assert(isinstance(input, Projection), "input should be projection or operation") + config_assert( + isinstance(input, Projection), + "input should be projection or operation") if self.config.size == 0 and isinstance(input, Projection): size = input.calc_output_size(input_layer) if size != 0: self.set_layer_size(size) elif isinstance(input, Projection): - sz = input.calc_output_size(input_layer) - if sz != 0: - config_assert(sz == self.config.size, - "different inputs have different size: %s vs. %s" % - (sz, self.config.size)) + sz = input.calc_output_size(input_layer) + if sz != 0: + config_assert( + sz == self.config.size, + "different inputs have different size: %s vs. %s" % + (sz, self.config.size)) config_assert(size != 0, "size is not set") for input_index in xrange(len(self.inputs)): @@ -2484,7 +2688,8 @@ class MixedLayer(LayerBase): input_config = self.config.inputs[input_index] input_config.proj_conf.CopyFrom(input.proj_conf) - input_config.proj_conf.name = gen_parameter_name(name, input_index) + input_config.proj_conf.name = gen_parameter_name(name, + input_index) psize = input.calc_parameter_size(input_layer.size, size) dims = input.calc_parameter_dims(input_layer.size, size) self.create_input_parameter(input_index, psize, dims) @@ -2496,49 +2701,60 @@ class MixedLayer(LayerBase): record_operator_conf = self.config.operator_confs.add() record_operator_conf.CopyFrom(operator_conf) + psize = self.config.size + if isinstance(self.inputs[0], ConvProjection): + self.config.shared_biases = True + psize = 0 + for input in self.inputs: + psize += input.calc_bias_size() - self.create_bias_parameter(bias, self.config.size) + if bias: + self.config.bias_size = psize + self.create_bias_parameter(bias, psize) if error_clipping_threshold is not None: self.config.error_clipping_threshold = error_clipping_threshold + # like MixedLayer, but no bias parameter @config_func -def ExpressionLayer(name, - inputs, - **xargs): +def ExpressionLayer(name, inputs, **xargs): MixedLayer(name, inputs, bias=False, **xargs) + @config_layer('concat') class ConcatenateLayer(LayerBase): - def __init__( - self, - name, - inputs, - **xargs): + def __init__(self, name, inputs, bias=False, **xargs): config_assert(inputs, 'inputs cannot be empty') + config_assert(not bias, 'ConcatenateLayer cannot support bias.') super(ConcatenateLayer, self).__init__( name, 'concat', 0, inputs=inputs, **xargs) size = 0 for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) input = self.inputs[input_index] - if self.config.size == 0: + if self.config.size == 0: size += input_layer.size self.set_layer_size(size) + # like concat layer, but each input layer was processed by a Projection. 
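# A small sketch (assumed numbers, not real layer code) of the bias sizing
# added above for MixedLayer and below for ConcatenateLayer2: when the first
# input is a ConvProjection, biases are shared per channel, so the bias width
# becomes the sum of each projection's own bias size rather than the layer's
# full output width. mixed_bias_size is a hypothetical helper for illustration.
def mixed_bias_size(output_size, proj_bias_sizes, first_is_conv_projection):
    if first_is_conv_projection:
        return sum(proj_bias_sizes)   # e.g. one bias per output channel
    return output_size                # default: one bias per output unit

assert mixed_bias_size(8 * 32 * 32, [8, 8], True) == 16
assert mixed_bias_size(100, [8, 8], False) == 100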
@config_layer('concat2') class ConcatenateLayer2(LayerBase): - def __init__( - self, - name, - inputs, - **xargs): + def __init__(self, name, inputs, bias=False, **xargs): config_assert(inputs, 'inputs cannot be empty') super(ConcatenateLayer2, self).__init__( name, 'concat2', 0, inputs=inputs, **xargs) + + if isinstance(self.inputs[0], ConvProjection): + for input_index in xrange(len(self.inputs) - 1): + input = self.inputs[input_index + 1] + config_assert( + isinstance(input, ConvProjection), + "The first input of ConcatenateLayer2 is ConvProjection, " + "the other inputs should also be ConvProjection.") + size = 0 for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) @@ -2559,21 +2775,28 @@ class ConcatenateLayer2(LayerBase): input_config.proj_conf.CopyFrom(input.proj_conf) input_config.proj_conf.name = gen_parameter_name(name, input_index) psize = input.calc_parameter_size(input.proj_conf.input_size, - input.proj_conf.output_size) + input.proj_conf.output_size) dims = input.calc_parameter_dims(input.proj_conf.input_size, - input.proj_conf.output_size) + input.proj_conf.output_size) self.create_input_parameter(input_index, psize, dims) + psize = self.config.size + if isinstance(self.inputs[0], ConvProjection): + self.config.shared_biases = True + psize = 0 + for input in self.inputs: + psize += input.calc_bias_size() + + if bias: + self.config.bias_size = psize + self.create_bias_parameter(bias, psize) + + @config_layer('recurrent') class RecurrentLayer(LayerBase): - def __init__( - self, - name, - inputs, - reversed=False, - bias=True, - **xargs): - super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs, **xargs) + def __init__(self, name, inputs, reversed=False, bias=True, **xargs): + super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs, + **xargs) config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input') input_layer = self.get_input_layer(0) size = input_layer.size @@ -2583,17 +2806,17 @@ class RecurrentLayer(LayerBase): self.create_input_parameter(0, size * size, dims) self.create_bias_parameter(bias, self.config.size) + @config_layer('lstmemory') class LstmLayer(LayerBase): - def __init__( - self, - name, - inputs, - reversed=False, - active_gate_type="sigmoid", - active_state_type="sigmoid", - bias=True, - **xargs): + def __init__(self, + name, + inputs, + reversed=False, + active_gate_type="sigmoid", + active_state_type="sigmoid", + bias=True, + **xargs): super(LstmLayer, self).__init__(name, 'lstmemory', 0, inputs, **xargs) config_assert(len(self.inputs) == 1, 'LstmLayer must have 1 input') input_layer = self.get_input_layer(0) @@ -2602,117 +2825,126 @@ class LstmLayer(LayerBase): size = input_layer.size / 4 self.set_layer_size(size) self.config.reversed = reversed - self.config.active_gate_type = active_gate_type + self.config.active_gate_type = active_gate_type self.config.active_state_type = active_state_type self.create_input_parameter(0, size * size * 4, [size, size, 4]) #bias includes 3 kinds of peephole, 4 + 3 = 7 self.create_bias_parameter(bias, size * 7) + @config_layer('lstm_step') class LstmStepLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - active_gate_type="sigmoid", - active_state_type="sigmoid", - bias=True, - **xargs): - super(LstmStepLayer, self).__init__(name, 'lstm_step', - size, inputs, **xargs) + def __init__(self, + name, + size, + inputs, + active_gate_type="sigmoid", + active_state_type="sigmoid", + bias=True, + **xargs): + super(LstmStepLayer, 
self).__init__(name, 'lstm_step', size, inputs, + **xargs) config_assert(len(inputs) == 2, 'LstmStepLayer must have 2 inputs') input_layer0 = self.get_input_layer(0) input_layer1 = self.get_input_layer(1) - config_assert(input_layer0.size == 4 * size, 'input_layer0.size != 4 * layer.size') - config_assert(input_layer1.size == size, 'input_layer1.size != layer.size') - self.config.active_gate_type = active_gate_type + config_assert(input_layer0.size == 4 * size, + 'input_layer0.size != 4 * layer.size') + config_assert(input_layer1.size == size, + 'input_layer1.size != layer.size') + self.config.active_gate_type = active_gate_type self.config.active_state_type = active_state_type self.create_bias_parameter(bias, size * 3) + # get the specific output from the input layer. @config_layer('get_output') class GetOutputLayer(LayerBase): - def __init__( - self, - name, - size, - inputs): - super(GetOutputLayer, self).__init__(name, 'get_output' , size, inputs) - config_assert(len(self.inputs) == 1, 'GetOutputLayer must have 1 inputs') + def __init__(self, name, size, inputs): + super(GetOutputLayer, self).__init__(name, 'get_output', size, inputs) + config_assert( + len(self.inputs) == 1, 'GetOutputLayer must have 1 inputs') inputs = self.inputs[0] config_assert(inputs.input_layer_argument, 'input_layer_argument cannot be empty') + @config_layer('mdlstmemory') class MDLstmLayer(LayerBase): - def __init__( - self, - name, - inputs, - directions=True, - active_gate_type="sigmoid", - active_state_type="sigmoid", - bias=True, - **xargs): - super(MDLstmLayer, self).__init__(name, 'mdlstmemory', 0, inputs, **xargs) + def __init__(self, + name, + inputs, + directions=True, + active_gate_type="sigmoid", + active_state_type="sigmoid", + bias=True, + **xargs): + super(MDLstmLayer, self).__init__(name, 'mdlstmemory', 0, inputs, + **xargs) config_assert(len(self.inputs) == 1, 'MDLstmLayer must have 1 input') input_layer = self.get_input_layer(0) dim_num = len(directions) #check input_layer.size is divided by (3+dim_num) - config_assert(input_layer.size % (3+dim_num) == 0, "size % (dim_num) should be 0!") - size = input_layer.size / (3+dim_num) + config_assert(input_layer.size % (3 + dim_num) == 0, + "size % (dim_num) should be 0!") + size = input_layer.size / (3 + dim_num) self.set_layer_size(size) - self.config.active_gate_type = active_gate_type + self.config.active_gate_type = active_gate_type self.config.active_state_type = active_state_type for i in xrange(len(directions)): self.config.directions.append(int(directions[i])) - self.create_input_parameter(0, size * size * (3+dim_num), [size, size, 3+dim_num]) + self.create_input_parameter(0, size * size * (3 + dim_num), + [size, size, 3 + dim_num]) #bias includes 3 kinds of peephole, 3+dim_num+2+dim_num - self.create_bias_parameter(bias, size * (5+2*dim_num)) + self.create_bias_parameter(bias, size * (5 + 2 * dim_num)) + @config_layer('gated_recurrent') class GatedRecurrentLayer(LayerBase): - def __init__( - self, - name, - inputs, - reversed=False, - active_gate_type="sigmoid", - bias=True, - **xargs): - super(GatedRecurrentLayer, self).__init__(name, 'gated_recurrent', 0, inputs, **xargs) - config_assert(len(self.inputs) == 1, 'GatedRecurrentLayer must have 1 input') + def __init__(self, + name, + inputs, + reversed=False, + active_gate_type="sigmoid", + bias=True, + **xargs): + super(GatedRecurrentLayer, self).__init__(name, 'gated_recurrent', 0, + inputs, **xargs) + config_assert( + len(self.inputs) == 1, 'GatedRecurrentLayer must have 1 input') 
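# Plain-Python arithmetic (illustration only; helper names are hypothetical)
# behind the width checks in these step layers: LstmStepLayer packs four
# blocks into its first input, MDLstmLayer packs 3 + one gate per direction,
# and GatedRecurrentLayer just below requires a width divisible by three.
def lstm_step_input_width(size):
    return 4 * size                    # input_s + input/forget/output gates

def mdlstm_hidden_size(input_width, dim_num):
    assert input_width % (3 + dim_num) == 0, "size % (dim_num) should be 0!"
    return input_width // (3 + dim_num)

def gru_hidden_size(input_width):
    assert input_width % 3 == 0, "size % 3 should be 0!"
    return input_width // 3

assert lstm_step_input_width(64) == 256
assert mdlstm_hidden_size(250, 2) == 50
assert gru_hidden_size(192) == 64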
input_layer = self.get_input_layer(0) #check input_layer.size is divided by 3 config_assert(input_layer.size % 3 == 0, "size % 3 should be 0!") size = input_layer.size / 3 self.set_layer_size(size) self.config.reversed = reversed - self.config.active_gate_type = active_gate_type + self.config.active_gate_type = active_gate_type self.create_input_parameter(0, size * size * 3, [size, size * 3]) self.create_bias_parameter(bias, size * 3) + @config_layer('gru_step') class GruStepLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - active_gate_type="sigmoid", - bias=True, - **xargs): - super(GruStepLayer, self).__init__(name, 'gru_step', size, inputs, **xargs) + def __init__(self, + name, + size, + inputs, + active_gate_type="sigmoid", + bias=True, + **xargs): + super(GruStepLayer, self).__init__(name, 'gru_step', size, inputs, + **xargs) config_assert(len(self.inputs) == 2, 'GruStepLayer must have 2 input') input_layer0 = self.get_input_layer(0) input_layer1 = self.get_input_layer(1) - config_assert(input_layer0.size == 3 * size, 'input_layer0.size != 3 * layer.size') - config_assert(input_layer1.size == size, 'input_layer1.size != layer.size') - self.config.active_gate_type = active_gate_type + config_assert(input_layer0.size == 3 * size, + 'input_layer0.size != 3 * layer.size') + config_assert(input_layer1.size == size, + 'input_layer1.size != layer.size') + self.config.active_gate_type = active_gate_type self.create_input_parameter(0, size * size * 3, [size, size * 3]) self.create_bias_parameter(bias, size * 3) + ''' A layer for calculating the cost of sequential conditional random field model. Example: CRFLayer(name="crf_cost", size=label_num, @@ -2720,20 +2952,18 @@ class GruStepLayer(LayerBase): where "weight" is optional, one weight for each sequence @param coeff: weight of the layer ''' + + @config_layer('crf') class CRFLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - coeff=1.0, - device=None): + def __init__(self, name, size, inputs, coeff=1.0, device=None): super(CRFLayer, self).__init__(name, 'crf', size, inputs, device=device) - config_assert(2 <= len(self.inputs) <= 3, 'CRFLayer must have 2 or 3 inputs') + config_assert(2 <= len(self.inputs) <= 3, + 'CRFLayer must have 2 or 3 inputs') self.create_input_parameter(0, size * (size + 2), [size, size + 2]) self.config.coeff = coeff + ''' A layer for calculating the decoding sequence of sequential conditional random field model. 
@@ -2742,14 +2972,11 @@ class CRFLayer(LayerBase): this layer will also calculate error, output_.value[i] is 1 for incorrect decoding or 0 for correct decoding ''' + + @config_layer('crf_decoding') class CRFDecodingLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - device=None): + def __init__(self, name, size, inputs, device=None): super(CRFDecodingLayer, self).__init__( name, 'crf_decoding', size, inputs, device=device) config_assert( @@ -2757,47 +2984,35 @@ class CRFDecodingLayer(LayerBase): 'CRFDecodingLayer cannot have more than 2 inputs') self.create_input_parameter(0, size * (size + 2), [size, size + 2]) + @config_layer('ctc') class CTCLayer(LayerBase): - def __init__( - self, - name, - size, - inputs, - norm_by_times = False, - device=None): + def __init__(self, name, size, inputs, norm_by_times=False, device=None): super(CTCLayer, self).__init__(name, 'ctc', size, inputs, device=device) self.config.norm_by_times = norm_by_times config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs') + @config_layer('recurrent_layer_group') class RecurrentLayerGroup(LayerBase): - def __init__( - self, - name, - device=None): + def __init__(self, name, device=None): super(RecurrentLayerGroup, self).__init__( name, 'recurrent_layer_group', 0, inputs=[], device=device) # Deprecated, use a new layer specific class instead @config_func -def Layer( - name, - type, - **xargs): +def Layer(name, type, **xargs): layers = {} layers.update(g_cost_map) layers.update(g_layer_type_map) layer_func = layers.get(type) - config_assert(layer_func, - "layer type '%s' not supported." % type) - layer_func(name, **xargs) + config_assert(layer_func, "layer type '%s' not supported." % type) + return layer_func(name, **xargs) + @config_func -def ParameterHook( - type, - **kwargs): +def ParameterHook(type, **kwargs): if type == 'pruning': mask_filename = kwargs.get('mask_filename', None) assert mask_filename is not None @@ -2810,30 +3025,28 @@ def ParameterHook( @config_func -def Parameter( - name, - size, - device, - dims, - learning_rate=None, - momentum=None, - decay_rate=None, - decay_rate_l1=None, - initial_mean=None, - initial_std=None, - initial_strategy=None, - initial_smart=None, - num_batches_regularization=None, - sparse_remote_update=None, - sparse_update=None, - gradient_clipping_threshold=None, - sparse=None, - format=None, - need_compact=None, - is_static=None, - is_shared=None, - update_hooks=None - ): +def Parameter(name, + size, + device, + dims, + learning_rate=None, + momentum=None, + decay_rate=None, + decay_rate_l1=None, + initial_mean=None, + initial_std=None, + initial_strategy=None, + initial_smart=None, + num_batches_regularization=None, + sparse_remote_update=None, + sparse_update=None, + gradient_clipping_threshold=None, + sparse=None, + format=None, + need_compact=None, + is_static=None, + is_shared=None, + update_hooks=None): config_assert(name not in g_parameter_map, 'Duplicated parameter name: ' + name) @@ -2864,8 +3077,8 @@ def Parameter( para.initial_std = default(initial_std, g_default_initial_std) para.initial_mean = default(initial_mean, g_default_initial_mean) - num_batches_regularization = default( - num_batches_regularization, g_default_num_batches_regularization) + num_batches_regularization = default(num_batches_regularization, + g_default_num_batches_regularization) if num_batches_regularization is not None: para.num_batches_regularization = int(num_batches_regularization) @@ -2875,18 +3088,21 @@ def Parameter( 
g_config.opt_config.use_sparse_remote_updater = True if sparse_update is not None: para.sparse_update = sparse_update - gradient_clipping_threshold = default( - gradient_clipping_threshold, g_default_gradient_clipping_threshold) + gradient_clipping_threshold = default(gradient_clipping_threshold, + g_default_gradient_clipping_threshold) if gradient_clipping_threshold is not None: para.gradient_clipping_threshold = gradient_clipping_threshold - para.initial_strategy = default(initial_strategy, g_default_initial_strategy) + para.initial_strategy = default(initial_strategy, + g_default_initial_strategy) para.initial_smart = default(initial_smart, g_default_initial_smart) if para.initial_smart: para.initial_mean = 0. if len(para.dims) != 0: para.initial_std = 1. / math.sqrt(para.dims[0]) else: - print("Use initial_smart, but dims not set. Initial_smart may not be used in this layer") + print( + "Use initial_smart, but dims not set. Initial_smart may not be used in this layer" + ) traceback.print_exc() para.initial_std = 1. / math.sqrt(para.size) if g_default_compact_func is not None: @@ -2925,64 +3141,78 @@ def default_initial_std(val): global g_default_initial_std g_default_initial_std = val + @config_func def default_initial_mean(val): global g_default_initial_mean g_default_initial_mean = val + @config_func def default_initial_strategy(val): global g_default_initial_strategy g_default_initial_strategy = val + @config_func def default_initial_smart(val): global g_default_initial_smart g_default_initial_smart = val + @config_func def default_momentum(val): global g_default_momentum g_default_momentum = val + @config_func def default_decay_rate(val): global g_default_decay_rate g_default_decay_rate = val + @config_func def default_num_batches_regularization(val): global g_default_num_batches_regularization g_default_num_batches_regularization = val + @config_func def default_gradient_clipping_threshold(val): global g_default_gradient_clipping_threshold g_default_gradient_clipping_threshold = val + @config_func def default_device(val): global g_default_device g_default_device = val + @config_func def default_update_hooks(val): global g_default_update_hooks g_default_update_hooks = val + @config_func def default_compact_func(val): global g_default_compact_func g_default_compact_func = val + def make_importer(config_dir, config_args): def Import(config_file, local_args={}): if not config_file.startswith('/'): config_file = config_dir + '/' + config_file g_config.config_files.append(config_file) - execfile(config_file, make_config_environment(config_file, config_args), local_args) + execfile(config_file, + make_config_environment(config_file, config_args), local_args) + return Import + settings = dict( batch_size=None, mini_batch_size=None, @@ -3011,26 +3241,24 @@ settings = dict( ada_rou=0.95, delta_add_rate=1.0, shrink_parameter_value=0, - adam_beta1 = 0.9, - adam_beta2 = 0.999, - adam_epsilon = 1e-8, -) + adam_beta1=0.9, + adam_beta2=0.999, + adam_epsilon=1e-8, ) -settings_deprecated = dict( - usage_ratio=1., -) +settings_deprecated = dict(usage_ratio=1., ) trainer_settings = dict( save_dir="./output/model", init_model_path=None, - start_pass=0, -) + start_pass=0, ) + @config_func def Settings(**args): for k, v in args.iteritems(): if k == "usage_ratio": - logger.warning("Deprecated: define usage_ratio in DataConfig instead") + logger.warning( + "Deprecated: define usage_ratio in DataConfig instead") if g_config.HasField("data_config"): g_config.data_config.__setattr__(k, v) 
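# A short sketch of the initial_smart fallback reformatted in Parameter
# above: with dims available, the standard deviation scales with fan-in;
# otherwise the code warns and falls back to the parameter's total size.
# smart_initial_std is a hypothetical stand-in for the inline logic.
import math

def smart_initial_std(dims, size):
    if len(dims) != 0:
        return 1.0 / math.sqrt(dims[0])   # fan-in based scaling
    return 1.0 / math.sqrt(size)          # fallback, with the printed warning

assert smart_initial_std([256, 512], 256 * 512) == 1.0 / math.sqrt(256)
assert smart_initial_std([], 1024) == 1.0 / math.sqrt(1024)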
settings_deprecated[k] = v @@ -3042,10 +3270,12 @@ def Settings(**args): else: logger.fatal('Unkown setting: %s' % k) + @config_func def cluster_config(**args): pass + @config_func def EnableSubmodelSuffix(flag=True): """ @@ -3055,10 +3285,12 @@ def EnableSubmodelSuffix(flag=True): global g_add_submodel_suffix g_add_submodel_suffix = flag + def make_config_environment(config_file, config_args): def make_setter(k): def setter(v): logger.fatal("Obsolete: use Settings(%s=%s, ...) instead" % (k, v)) + return setter funcs = {} @@ -3074,13 +3306,13 @@ def make_config_environment(config_file, config_args): funcs.update( Import=make_importer(config_dir, config_args), - get_config_arg=make_get_config_arg(config_args), - ) + get_config_arg=make_get_config_arg(config_args), ) funcs.update(g_extended_config_funcs) return funcs + def make_get_config_arg(config_args): def get_config_arg(name, type, default=None): if type == bool: @@ -3097,6 +3329,7 @@ def make_get_config_arg(config_args): return get_config_arg + def importlib(name): __import__(name) return sys.modules[name] @@ -3109,10 +3342,12 @@ def find_caller(): return s[0], s[1], s[2] return "(unknown file)", 0, "(unknown function)" + def my_fatal(s): logger.critical(s) raise Exception() + def parse_config(config_file, config_arg_str): ''' @param config_arg_str: a string of the form var1=val1,var2=val2. It will be @@ -3150,7 +3385,7 @@ def parse_config(config_file, config_arg_str): for k, v in settings.iteritems(): if v is None: continue - g_config.opt_config.__setattr__(k, v); + g_config.opt_config.__setattr__(k, v) for k, v in trainer_settings.iteritems(): if v is None: @@ -3177,6 +3412,7 @@ def parse_config_and_serialize(config_file, config_arg_str): traceback.print_exc() raise + if __name__ == '__main__': try: config = parse_config(sys.argv[1], '') diff --git a/python/paddle/trainer/config_parser_extension.py b/python/paddle/trainer/config_parser_extension.py index 3445076274b0a87254c6af6c3417fe51022c7891..ba4c79efdc10ec6cc895e76ddb87bc3fbd19ddc1 100644 --- a/python/paddle/trainer/config_parser_extension.py +++ b/python/paddle/trainer/config_parser_extension.py @@ -17,11 +17,10 @@ from paddle.proto.DataConfig_pb2 import DataConfig g_config = None -def SimpleData( - files=None, - feat_dim=None, - context_len=None, - buffer_capacity=None): +def SimpleData(files=None, + feat_dim=None, + context_len=None, + buffer_capacity=None): data_config = DataConfig() data_config.type = 'simple' @@ -33,6 +32,7 @@ def SimpleData( data_config.buffer_capacity = buffer_capacity return data_config + def get_config_funcs(trainer_config): global g_config g_config = trainer_config diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py index 7d51de78b0d79ce59bd08507660ebe99b273a4a0..a80ad13d1ed52d84c3b5882939271b91ecc07bb3 100644 --- a/python/paddle/trainer/recurrent_units.py +++ b/python/paddle/trainer/recurrent_units.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
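# Before the unit definitions below, a numpy sketch of the cell that
# LstmRecurrentUnit assembles from projections (tanh activations and shapes
# are assumed for illustration; this is not the layer implementation):
# "input_recurrent" packs [input_s, input_gate, forget_gate, output_gate]
# into one width-4*size vector, and the step applies the gates with the
# peephole "check" weights.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(packed, state, check_input, check_forget, check_output):
    size = state.size
    a, gi, gf, go = [packed[i * size:(i + 1) * size] for i in range(4)]
    input_gate = sigmoid(gi + check_input * state)     # peeps at old state
    forget_gate = sigmoid(gf + check_forget * state)   # peeps at old state
    new_state = np.tanh(a) * input_gate + state * forget_gate
    output_gate = sigmoid(go + check_output * new_state)
    return np.tanh(new_state) * output_gate, new_state

size = 4
out, new_state = lstm_step(
    np.random.randn(4 * size), np.zeros(size),
    *[np.random.randn(size) for _ in range(3)])
assert out.shape == (size,) and new_state.shape == (size,)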
- + # recurrent_units.py # Version 2.0 # @@ -22,161 +22,175 @@ from paddle.trainer.config_parser import * + # long short term memory, can be used in recurrent machine # *inputs* must be a list of Projections, for example: # inputs = [FullMatrixProjection("input_layer_name")], # *para_prefix* defines parameter names, if the *para_prefix* of # two LstmRecurrentUnit is same, they share same parameters # *out_memory* can be defined outside if it's used outside -def LstmRecurrentUnit(name, size, - active_type, state_active_type, gate_active_type, - inputs, para_prefix = None, - error_clipping_threshold = 0, - out_memory = None): +def LstmRecurrentUnit(name, + size, + active_type, + state_active_type, + gate_active_type, + inputs, + para_prefix=None, + error_clipping_threshold=0, + out_memory=None): - if para_prefix is None: + if para_prefix is None: para_prefix = name if out_memory is None: - out_memory = Memory(name = name, size = size) + out_memory = Memory(name=name, size=size) + + state_memory = Memory(name=name + "_" + "state", size=size) - state_memory = Memory(name = name + "_" + "state", size = size) - Layer( - name = name + "_" + "input_recurrent", - type = "mixed", - size = size * 4, #(input_s, input_gate, forget_gate, output_gate) - error_clipping_threshold = error_clipping_threshold, - bias = Bias(initial_std = 0, - parameter_name = para_prefix + "_input_recurrent.b"), - inputs = inputs + [ - FullMatrixProjection(out_memory, - parameter_name = para_prefix + "_input_recurrent.w"), - ], - ) + name=name + "_" + "input_recurrent", + type="mixed", + size=size * 4, #(input_s, input_gate, forget_gate, output_gate) + error_clipping_threshold=error_clipping_threshold, + bias=Bias( + initial_std=0, parameter_name=para_prefix + "_input_recurrent.b"), + inputs=inputs + [ + FullMatrixProjection( + out_memory, parameter_name=para_prefix + "_input_recurrent.w"), + ], ) LstmStepLayer( - name = name, - size = size, - bias = Bias(parameter_name = para_prefix + "_check.b"), - inputs = [name + "_" + "input_recurrent", state_memory], - active_type = active_type, - active_gate_type = gate_active_type, - active_state_type = state_active_type, - ) + name=name, + size=size, + bias=Bias(parameter_name=para_prefix + "_check.b"), + inputs=[name + "_" + "input_recurrent", state_memory], + active_type=active_type, + active_gate_type=gate_active_type, + active_state_type=state_active_type, ) GetOutputLayer( - name = name + "_" + "state", - size = size, - inputs = Input(name, input_layer_argument = "state"), - ) - -def LstmRecurrentUnitNaive(name, size, - active_type, state_active_type, gate_active_type, - inputs, para_prefix = None, - error_clipping_threshold = 0, - out_memory = None): - - if para_prefix is None: + name=name + "_" + "state", + size=size, + inputs=Input( + name, input_layer_argument="state"), ) + + +def LstmRecurrentUnitNaive(name, + size, + active_type, + state_active_type, + gate_active_type, + inputs, + para_prefix=None, + error_clipping_threshold=0, + out_memory=None): + + if para_prefix is None: para_prefix = name if out_memory is None: - out_memory = Memory(name = name, size = size) + out_memory = Memory(name=name, size=size) + + state_memory = Memory(name=name + "_" + "state", size=size) - state_memory = Memory(name = name + "_" + "state", size = size) - Layer( - name = name + "_" + "input_recurrent", - type = "mixed", - size = size * 4, #(input_s, input_gate, forget_gate, output_gate) - error_clipping_threshold = error_clipping_threshold, - bias = Bias(initial_std = 0, - parameter_name = 
para_prefix + "_input_recurrent.b"), - inputs = inputs + [ - FullMatrixProjection(out_memory, - parameter_name = para_prefix + "_input_recurrent.w"), - ], - ) + name=name + "_" + "input_recurrent", + type="mixed", + size=size * 4, #(input_s, input_gate, forget_gate, output_gate) + error_clipping_threshold=error_clipping_threshold, + bias=Bias( + initial_std=0, parameter_name=para_prefix + "_input_recurrent.b"), + inputs=inputs + [ + FullMatrixProjection( + out_memory, parameter_name=para_prefix + "_input_recurrent.w"), + ], ) ExpressionLayer( - name = name + "_" + "input_s", - size = size, - active_type = active_type, - inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=0)], - ) + name=name + "_" + "input_s", + size=size, + active_type=active_type, + inputs=[ + IdentityOffsetProjection( + name + "_" + "input_recurrent", offset=0) + ], ) ExpressionLayer( - name = name + "_" + "input_gate", - active_type = gate_active_type, - inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=size), - DotMulProjection(state_memory, - parameter_name = para_prefix + "_input_check.w")], - ) + name=name + "_" + "input_gate", + active_type=gate_active_type, + inputs=[ + IdentityOffsetProjection( + name + "_" + "input_recurrent", offset=size), DotMulProjection( + state_memory, parameter_name=para_prefix + "_input_check.w") + ], ) ExpressionLayer( - name = name + "_" + "forget_gate", - active_type = gate_active_type, - inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=size*2), - DotMulProjection(state_memory, - parameter_name = para_prefix + "_forget_check.w")], - ) + name=name + "_" + "forget_gate", + active_type=gate_active_type, + inputs=[ + IdentityOffsetProjection( + name + "_" + "input_recurrent", offset=size * 2), + DotMulProjection( + state_memory, parameter_name=para_prefix + "_forget_check.w") + ], ) ExpressionLayer( - name = name + "_" + "state", - inputs = [DotMulOperator([name + "_" + "input_s", - name + "_" + "input_gate"]), - DotMulOperator([state_memory, - name + "_" + "forget_gate"]), - ], - ) + name=name + "_" + "state", + inputs=[ + DotMulOperator([name + "_" + "input_s", name + "_" + "input_gate"]), + DotMulOperator([state_memory, name + "_" + "forget_gate"]), + ], ) ExpressionLayer( - name = name + "_" + "output_gate", - active_type = gate_active_type, - inputs = [IdentityOffsetProjection(name + "_" + "input_recurrent", offset=size*3), - DotMulProjection(name + "_" + "state", - parameter_name = para_prefix + "_output_check.w")], - ) + name=name + "_" + "output_gate", + active_type=gate_active_type, + inputs=[ + IdentityOffsetProjection( + name + "_" + "input_recurrent", offset=size * 3), + DotMulProjection( + name + "_" + "state", + parameter_name=para_prefix + "_output_check.w") + ], ) ExpressionLayer( - name = name + "_" + "state_atv", - active_type = state_active_type, - inputs = IdentityProjection(name + "_" + "state"), - ) + name=name + "_" + "state_atv", + active_type=state_active_type, + inputs=IdentityProjection(name + "_" + "state"), ) ExpressionLayer( - name = name, - inputs = DotMulOperator([name + "_" + "state_atv", - name + "_" + "output_gate"]), - ) + name=name, + inputs=DotMulOperator( + [name + "_" + "state_atv", name + "_" + "output_gate"]), ) + # like LstmRecurrentUnit, but it's a layer group. 
# it is equivalent to LstmLayer -def LstmRecurrentLayerGroup(name, size, - active_type, state_active_type, gate_active_type, - inputs, para_prefix = None, - error_clipping_threshold = 0, - seq_reversed = False): +def LstmRecurrentLayerGroup(name, + size, + active_type, + state_active_type, + gate_active_type, + inputs, + para_prefix=None, + error_clipping_threshold=0, + seq_reversed=False): input_layer_name = name + "_" + "transform_input" Layer( - name = input_layer_name, - type = "mixed", - size = size * 4, - active_type = "", - bias = False, - inputs = inputs, - ) - - RecurrentLayerGroupBegin(name + "_layer_group", - in_links = [input_layer_name], - out_links = [name], - seq_reversed = seq_reversed) + name=input_layer_name, + type="mixed", + size=size * 4, + active_type="", + bias=False, + inputs=inputs, ) + + RecurrentLayerGroupBegin( + name + "_layer_group", + in_links=[input_layer_name], + out_links=[name], + seq_reversed=seq_reversed) LstmRecurrentUnit( - name = name, - size = size, - active_type = active_type, - state_active_type = state_active_type, - gate_active_type = gate_active_type, - inputs = [IdentityProjection(input_layer_name)], - para_prefix = para_prefix, - error_clipping_threshold = error_clipping_threshold, - ) + name=name, + size=size, + active_type=active_type, + state_active_type=state_active_type, + gate_active_type=gate_active_type, + inputs=[IdentityProjection(input_layer_name)], + para_prefix=para_prefix, + error_clipping_threshold=error_clipping_threshold, ) RecurrentLayerGroupEnd(name + "_layer_group") - # gated recurrent unit, can be used in recurrent machine # *inputs* should be a list of Projections, for example: # inputs = [FullMatrixProjection("input_layer_name")], @@ -184,142 +198,157 @@ def LstmRecurrentLayerGroup(name, size, # two GatedRecurrentUnit is same, they share same parameters # *out_memory* can be defined outside if it's used outside -def GatedRecurrentUnit(name, size, - active_type, gate_active_type, - inputs, para_prefix = None, - error_clipping_threshold = 0, - out_memory = None): - if type_of(inputs) == str: #only used by GatedRecurrentLayerGroup + +def GatedRecurrentUnit(name, + size, + active_type, + gate_active_type, + inputs, + para_prefix=None, + error_clipping_threshold=0, + out_memory=None): + if type_of(inputs) == str: #only used by GatedRecurrentLayerGroup input_layer_name = inputs else: input_layer_name = name + "_" + "transform_input" Layer( - name = input_layer_name, - type = "mixed", - size = size * 3, - active_type = "", - bias = False, - inputs = inputs, - ) - - if para_prefix is None: + name=input_layer_name, + type="mixed", + size=size * 3, + active_type="", + bias=False, + inputs=inputs, ) + + if para_prefix is None: para_prefix = name if out_memory is None: - out_memory = Memory(name = name, size = size) + out_memory = Memory(name=name, size=size) GruStepLayer( - name = name, - size = size, - bias = Bias(parameter_name = para_prefix + "_gate.b"), - inputs = [input_layer_name, - Input(out_memory, parameter_name = para_prefix + "_gate.w")], - active_type = active_type, - active_gate_type = gate_active_type, - ) - -def GatedRecurrentUnitNaive(name, size, - active_type, gate_active_type, - inputs, para_prefix = None, - error_clipping_threshold = 0, - out_memory = None): - - if type_of(inputs) == str: #only used by GatedRecurrentLayerGroup + name=name, + size=size, + bias=Bias(parameter_name=para_prefix + "_gate.b"), + inputs=[ + input_layer_name, Input( + out_memory, parameter_name=para_prefix + "_gate.w") + ], + 
active_type=active_type, + active_gate_type=gate_active_type, ) + + +def GatedRecurrentUnitNaive(name, + size, + active_type, + gate_active_type, + inputs, + para_prefix=None, + error_clipping_threshold=0, + out_memory=None): + + if type_of(inputs) == str: #only used by GatedRecurrentLayerGroup input_layer_name = inputs else: input_layer_name = name + "_" + "transform_input" Layer( - name = input_layer_name, - type = "mixed", - size = size * 3, - active_type = "", - bias = False, - inputs = inputs, - ) - - if para_prefix is None: + name=input_layer_name, + type="mixed", + size=size * 3, + active_type="", + bias=False, + inputs=inputs, ) + + if para_prefix is None: para_prefix = name if out_memory is None: - out_memory = Memory(name = name, size = size) + out_memory = Memory(name=name, size=size) Layer( - name = name + "_" + "update_gate", - type = "mixed", - size = size, - active_type = gate_active_type, - error_clipping_threshold = error_clipping_threshold, - bias = Bias(initial_std = 0, parameter_name = para_prefix + "_update_gate.b"), - inputs = [IdentityOffsetProjection(input_layer_name, offset=0), - FullMatrixProjection(out_memory, - parameter_name = para_prefix + "_update_gate.w")], - ) + name=name + "_" + "update_gate", + type="mixed", + size=size, + active_type=gate_active_type, + error_clipping_threshold=error_clipping_threshold, + bias=Bias( + initial_std=0, parameter_name=para_prefix + "_update_gate.b"), + inputs=[ + IdentityOffsetProjection( + input_layer_name, offset=0), FullMatrixProjection( + out_memory, parameter_name=para_prefix + "_update_gate.w") + ], ) Layer( - name = name + "_" + "reset_gate", - type = "mixed", - size = size, - active_type = gate_active_type, - error_clipping_threshold = error_clipping_threshold, - bias = Bias(initial_std = 0, parameter_name = para_prefix + "_reset_gate.b"), - inputs = [IdentityOffsetProjection(input_layer_name, offset=size), - FullMatrixProjection(out_memory, - parameter_name = para_prefix + "_reset_gate.w")], - ) + name=name + "_" + "reset_gate", + type="mixed", + size=size, + active_type=gate_active_type, + error_clipping_threshold=error_clipping_threshold, + bias=Bias( + initial_std=0, parameter_name=para_prefix + "_reset_gate.b"), + inputs=[ + IdentityOffsetProjection( + input_layer_name, offset=size), FullMatrixProjection( + out_memory, parameter_name=para_prefix + "_reset_gate.w") + ], ) ExpressionLayer( - name = name + "_" + "reset_output", - inputs = DotMulOperator([out_memory, name + "_" + "reset_gate"]), - ) + name=name + "_" + "reset_output", + inputs=DotMulOperator([out_memory, name + "_" + "reset_gate"]), ) Layer( - name = name + "_" + "output_candidate", - type = "mixed", - size = size, - active_type = active_type, - error_clipping_threshold = error_clipping_threshold, - bias = Bias(initial_std = 0, parameter_name = para_prefix + "_output_candidate.b"), - inputs = [IdentityOffsetProjection(input_layer_name, offset=size*2), - FullMatrixProjection(name + "_" + "reset_output", - parameter_name = para_prefix + "_output_candidate.w")], - ) - ExpressionLayer( #element-wise interpolation - name = name, - inputs = [IdentityProjection(out_memory), - DotMulOperator([out_memory, - name + "_" + "update_gate"], scale=-1.0), - DotMulOperator([name + "_" + "output_candidate", - name + "_" + "update_gate"]), - ], - ) + name=name + "_" + "output_candidate", + type="mixed", + size=size, + active_type=active_type, + error_clipping_threshold=error_clipping_threshold, + bias=Bias( + initial_std=0, parameter_name=para_prefix + 
"_output_candidate.b"), + inputs=[ + IdentityOffsetProjection( + input_layer_name, offset=size * 2), FullMatrixProjection( + name + "_" + "reset_output", + parameter_name=para_prefix + "_output_candidate.w") + ], ) + ExpressionLayer( #element-wise interpolation + name=name, + inputs=[ + IdentityProjection(out_memory), + DotMulOperator( + [out_memory, name + "_" + "update_gate"], scale=-1.0), + DotMulOperator( + [name + "_" + "output_candidate", name + "_" + "update_gate"]), + ], ) + # like GatedRecurrentUnit, but it's a layer group. # it is equivalent to GatedRecurrentLayer. -def GatedRecurrentLayerGroup(name, size, - active_type, gate_active_type, - inputs, para_prefix = None, - error_clipping_threshold = 0, - seq_reversed = False): +def GatedRecurrentLayerGroup(name, + size, + active_type, + gate_active_type, + inputs, + para_prefix=None, + error_clipping_threshold=0, + seq_reversed=False): input_layer_name = name + "_" + "transform_input" Layer( - name = input_layer_name, - type = "mixed", - size = size * 3, - active_type = "", - bias = False, - inputs = inputs, - ) - - RecurrentLayerGroupBegin(name + "_layer_group", - in_links = [input_layer_name], - out_links = [name], - seq_reversed = seq_reversed) + name=input_layer_name, + type="mixed", + size=size * 3, + active_type="", + bias=False, + inputs=inputs, ) + + RecurrentLayerGroupBegin( + name + "_layer_group", + in_links=[input_layer_name], + out_links=[name], + seq_reversed=seq_reversed) GatedRecurrentUnit( - name = name, - size = size, - active_type = active_type, - gate_active_type = gate_active_type, - inputs = input_layer_name, #transform outside - para_prefix = para_prefix, - error_clipping_threshold = error_clipping_threshold, - ) + name=name, + size=size, + active_type=active_type, + gate_active_type=gate_active_type, + inputs=input_layer_name, #transform outside + para_prefix=para_prefix, + error_clipping_threshold=error_clipping_threshold, ) RecurrentLayerGroupEnd(name + "_layer_group") - diff --git a/python/paddle/trainer_config_helpers/__init__.py b/python/paddle/trainer_config_helpers/__init__.py index 451b9ac3396eadf9fab2b5fd940a6f924e042976..adebebba2523f851507c4a0525eeaae9cfeb9dcc 100644 --- a/python/paddle/trainer_config_helpers/__init__.py +++ b/python/paddle/trainer_config_helpers/__init__.py @@ -20,3 +20,6 @@ from layers import * from networks import * from optimizers import * from attrs import * + +# This will enable operator overload for LayerOutput +import math diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py index 292014519374eabbe55c61daa73692814a52aac2..6261934e1bc8e8df62aeaa0757f4a237f91ef748 100644 --- a/python/paddle/trainer_config_helpers/activations.py +++ b/python/paddle/trainer_config_helpers/activations.py @@ -12,20 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__all__ = ["TanhActivation", "SigmoidActivation", - "SoftmaxActivation", "IdentityActivation", "LinearActivation", - 'SequenceSoftmaxActivation', 'ExpActivation', - "ReluActivation", "BReluActivation", "SoftReluActivation", - "STanhActivation", - "AbsActivation", "SquareActivation", - "BaseActivation"] +__all__ = [ + "TanhActivation", "SigmoidActivation", "SoftmaxActivation", + "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation', + 'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation", + "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation" +] class BaseActivation(object): """ - A mark for activation class. + A mark for activation class. Each activation inherit BaseActivation, which has two parameters. - + :param name: activation name in paddle config. :type name: basestring :param support_hppl: True if supported by hppl. HPPL is a library used by paddle @@ -51,7 +50,8 @@ class TanhActivation(BaseActivation): f(z)=tanh(z)=\\frac{e^z-e^{-z}}{e^z+e^{-z}} """ - def __init__(self): BaseActivation.__init__(self, 'tanh', True) + def __init__(self): + BaseActivation.__init__(self, 'tanh', True) class SigmoidActivation(BaseActivation): @@ -63,7 +63,8 @@ class SigmoidActivation(BaseActivation): f(z) = \\frac{1}{1+exp(-z)} """ - def __init__(self): BaseActivation.__init__(self, 'sigmoid', True) + def __init__(self): + BaseActivation.__init__(self, 'sigmoid', True) class SoftmaxActivation(BaseActivation): @@ -104,7 +105,8 @@ class IdentityActivation(BaseActivation): Just do nothing for output both forward/backward. """ - def __init__(self): BaseActivation.__init__(self, '', False) + def __init__(self): + BaseActivation.__init__(self, '', False) LinearActivation = IdentityActivation @@ -124,7 +126,8 @@ class ReluActivation(BaseActivation): 0 &\\quad\\mathrm{otherwize} """ - def __init__(self): BaseActivation.__init__(self, 'relu', True) + def __init__(self): + BaseActivation.__init__(self, 'relu', True) class BReluActivation(BaseActivation): @@ -141,7 +144,8 @@ class BReluActivation(BaseActivation): 0 &\\quad \\mathrm{otherwise} """ - def __init__(self): BaseActivation.__init__(self, 'brelu', False) + def __init__(self): + BaseActivation.__init__(self, 'brelu', False) class SoftReluActivation(BaseActivation): @@ -149,7 +153,9 @@ class SoftReluActivation(BaseActivation): SoftRelu Activation. """ - def __init__(self): BaseActivation.__init__(self, 'softrelu', False) + def __init__(self): + BaseActivation.__init__(self, 'softrelu', False) + class STanhActivation(BaseActivation): """ @@ -160,7 +166,8 @@ class STanhActivation(BaseActivation): f(z) = 1.7159 * tanh(2/3*z) """ - def __init__(self): BaseActivation.__init__(self, 'stanh', False) + def __init__(self): + BaseActivation.__init__(self, 'stanh', False) class AbsActivation(BaseActivation): @@ -178,7 +185,8 @@ class AbsActivation(BaseActivation): 0 &\\quad if \\quad z = 0 """ - def __init__(self): BaseActivation.__init__(self, 'abs', False) + def __init__(self): + BaseActivation.__init__(self, 'abs', False) class SquareActivation(BaseActivation): @@ -189,13 +197,29 @@ class SquareActivation(BaseActivation): f(z) = z^2. """ - def __init__(self): BaseActivation.__init__(self, 'square', False) + def __init__(self): + BaseActivation.__init__(self, 'square', False) + class ExpActivation(BaseActivation): """ Exponential Activation. - + .. math:: f(z) = e^z. 
""" - def __init__(self): BaseActivation.__init__(self, 'exponential', False) + + def __init__(self): + BaseActivation.__init__(self, 'exponential', False) + + +class LogActivation(BaseActivation): + """ + Logarithm Activation. + + .. math:: + f(z) = log(z) + """ + + def __init__(self): + BaseActivation.__init__(self, 'log', False) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index d26344124733246c67790025fb186c6b350c3947..54169f382f164e7b9cf061baeb21d4109a8ae5b6 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -13,8 +13,9 @@ # limitations under the License. from paddle.trainer.config_parser import * -__all__ = ['ParamAttr', 'ExtraAttr', 'ParameterAttribute', - 'ExtraLayerAttribute'] +__all__ = [ + 'ParamAttr', 'ExtraAttr', 'ParameterAttribute', 'ExtraLayerAttribute' +] def convert_and_compare(x, Type): @@ -25,7 +26,8 @@ def convert_and_compare(x, Type): :param Type: target type to check x over """ - return type(x)(Type(x))==x + return type(x)(Type(x)) == x + def is_compatible_with(x, Type): """ @@ -38,9 +40,9 @@ def is_compatible_with(x, Type): return True try: if float == Type or int == Type: - # avoid those types that can be converted to float/int but not very - # meaningful and could potentially lead to error - # i.e., str and bool typed value should not be used for initializing float/int variable + # avoid those types that can be converted to float/int but not very + # meaningful and could potentially lead to error + # i.e., str and bool typed value should not be used for initializing float/int variable if not isinstance(x, str) and not isinstance(x, bool): return convert_and_compare(x, Type) elif bool == Type: @@ -91,9 +93,17 @@ class ParameterAttribute(object): :type sparse_update: bool """ - def __init__(self, name=None, is_static=False, initial_std=None, - initial_mean=None, initial_max=None, initial_min=None, - l1_rate=None, l2_rate=None, learning_rate=None, momentum=None, + def __init__(self, + name=None, + is_static=False, + initial_std=None, + initial_mean=None, + initial_max=None, + initial_min=None, + l1_rate=None, + l2_rate=None, + learning_rate=None, + momentum=None, sparse_update=False): # initialize strategy. if is_static: @@ -183,7 +193,10 @@ class ExtraLayerAttribute(object): :type device: int """ - def __init__(self, error_clipping_threshold=None, drop_rate=None, device=None): + def __init__(self, + error_clipping_threshold=None, + drop_rate=None, + device=None): self.attr = dict() if isinstance(error_clipping_threshold, float): assert error_clipping_threshold > 0 @@ -200,8 +213,8 @@ class ExtraLayerAttribute(object): for key in self.attr: if not hasattr(self, 'can_%s' % key) or \ not getattr(self, 'can_%s' % key): - raise NotImplementedError( - "Layer %s cannot support %s" % (layer_name, key)) + raise NotImplementedError("Layer %s cannot support %s" % + (layer_name, key)) @staticmethod def to_kwargs(attr): diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py index 3b5c17a271f02b4a7506c4b2ffc10d3759dd97c7..b41097953dad8aa9c8755c25860b177cdbff5b93 100644 --- a/python/paddle/trainer_config_helpers/data_sources.py +++ b/python/paddle/trainer_config_helpers/data_sources.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- """ Data Sources are helpers to define paddle training data or testing data. """ @@ -26,8 +25,12 @@ except ImportError: __all__ = ['define_py_data_sources2'] -def define_py_data_source(file_list, cls, module, - obj, args=None, async=False, +def define_py_data_source(file_list, + cls, + module, + obj, + args=None, + async=False, data_cls=PyData): """ Define a python data source. @@ -68,7 +71,7 @@ def define_py_data_source(file_list, cls, module, file_list_name = 'train.list' if isinstance(cls, TestData): file_list_name = 'test.list' - with open(file_list_name, 'r') as f: + with open(file_list_name, 'w') as f: f.writelines(file_list) file_list = file_list_name @@ -76,8 +79,9 @@ def define_py_data_source(file_list, cls, module, args = pickle.dumps(args, 0) if data_cls is None: + def py_data2(files, load_data_module, load_data_object, load_data_args, - **kwargs): + **kwargs): data = DataBase() data.type = 'py2' data.files = files @@ -86,17 +90,25 @@ def define_py_data_source(file_list, cls, module, data.load_data_args = load_data_args data.async_load_data = True return data - data_cls = py_data2 - - cls(data_cls(files=file_list, - load_data_module=module, - load_data_object=obj, - load_data_args=args, - async_load_data=async)) + data_cls = py_data2 -def define_py_data_sources(train_list, test_list, module, obj, args=None, - train_async=False, data_cls=PyData): + cls( + data_cls( + files=file_list, + load_data_module=module, + load_data_object=obj, + load_data_args=args, + async_load_data=async)) + + +def define_py_data_sources(train_list, + test_list, + module, + obj, + args=None, + train_async=False, + data_cls=PyData): """ The annotation is almost the same as define_py_data_sources2, except that it can specific train_async and data_cls. @@ -125,8 +137,8 @@ def define_py_data_sources(train_list, test_list, module, obj, args=None, """ def __is_splitable__(o): - return (isinstance(o, list) or isinstance(o, tuple) - ) and hasattr(o, '__len__') and len(o) == 2 + return (isinstance(o, list) or + isinstance(o, tuple)) and hasattr(o, '__len__') and len(o) == 2 assert train_list is not None or test_list is not None assert module is not None and obj is not None @@ -139,7 +151,7 @@ def define_py_data_sources(train_list, test_list, module, obj, args=None, test_obj = obj train_obj = obj if __is_splitable__(obj): - train_module, test_module = module + train_obj, test_obj = obj if args is None: args = "" @@ -196,9 +208,10 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None): :return: None :rtype: None """ - define_py_data_sources(train_list=train_list, - test_list=test_list, - module=module, - obj=obj, - args=args, - data_cls=None) + define_py_data_sources( + train_list=train_list, + test_list=test_list, + module=module, + obj=obj, + args=args, + data_cls=None) diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py index b20aebc685fe5a36b69c4e9f09b610631b233ecf..c01050e338d5933f49f0504f2e9ef5f15c7743ba 100644 --- a/python/paddle/trainer_config_helpers/default_decorators.py +++ b/python/paddle/trainer_config_helpers/default_decorators.py @@ -13,20 +13,23 @@ # limitations under the License. 
import functools +import inspect from .attrs import ParamAttr from .activations import TanhActivation from paddle.trainer.config_parser import * -__all__ = ['wrap_name_default', 'wrap_param_attr_default', - 'wrap_bias_attr_default', 'wrap_act_default', - 'wrap_param_default'] +__all__ = [ + 'wrap_name_default', 'wrap_param_attr_default', 'wrap_bias_attr_default', + 'wrap_act_default', 'wrap_param_default' +] def __default_not_set_callback__(kwargs, name): return name not in kwargs or kwargs[name] is None -def wrap_param_default(param_names=None, default_factory=None, +def wrap_param_default(param_names=None, + default_factory=None, not_set_callback=__default_not_set_callback__): assert param_names is not None assert isinstance(param_names, list) or isinstance(param_names, tuple) @@ -37,8 +40,13 @@ def wrap_param_default(param_names=None, default_factory=None, @functools.wraps(func) def __wrapper__(*args, **kwargs): if len(args) != 0: - logger.warning("please use keyword arguments in paddle config.") - + argspec = inspect.getargspec(func) + num_positional = len(argspec.args) + if argspec.defaults: + num_positional -= len(argspec.defaults) + if not argspec.varargs and len(args) > num_positional: + logger.fatal( + "Must use keyword arguments for non-positional args") for name in param_names: if not_set_callback(kwargs, name): # Not set kwargs[name] = default_factory(func) @@ -107,13 +115,13 @@ def wrap_param_attr_default(param_names=None, default_factory=None): return wrap_param_default(param_names, default_factory) -def wrap_bias_attr_default(param_names=None, default_factory=None, +def wrap_bias_attr_default(param_names=None, + default_factory=None, has_bias=True): if param_names is None: param_names = ['bias_attr'] if default_factory is None: - default_factory = lambda _: ParamAttr(initial_std=0., - initial_mean=0.) + default_factory = lambda _: ParamAttr(initial_std=0., initial_mean=0.) 
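# A self-contained sketch of the stricter check wrap_param_default gains
# above: count the declared parameters without defaults; any positional
# argument beyond those must now be passed by keyword. require_keywords is a
# hypothetical decorator; the real helper calls logger.fatal, not raise.
import functools
import inspect

def require_keywords(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        spec = inspect.getargspec(func)
        num_positional = len(spec.args) - len(spec.defaults or ())
        if not spec.varargs and len(args) > num_positional:
            raise TypeError(
                "Must use keyword arguments for non-positional args")
        return func(*args, **kwargs)
    return wrapper

@require_keywords
def layer(input, size=32, act=None):
    return input, size, act

assert layer('prev') == ('prev', 32, None)  # required arg may stay positional
try:
    layer('prev', 64)                       # defaulted args must be keywords
    assert False
except TypeError:
    pass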
def __bias_attr_not_set__(kwargs, name): if has_bias: diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index ded124a5c8ca44af02cd7df81ee2bff87af98337..dc6a36392f9c6bff42d3a37f963ed18a849414f5 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -15,13 +15,14 @@ from paddle.trainer.config_parser import * from default_decorators import * -__all__ = ["evaluator_base","classification_error_evaluator", "auc_evaluator", - "pnpair_evaluator", "precision_recall_evaluator", - "ctc_error_evaluator", "chunk_evaluator", "sum_evaluator", - "column_sum_evaluator", "value_printer_evaluator", - "gradient_printer_evaluator", "maxid_printer_evaluator", - "maxframe_printer_evaluator", "seqtext_printer_evaluator", - "classification_error_printer_evaluator"] +__all__ = [ + "evaluator_base", "classification_error_evaluator", "auc_evaluator", + "pnpair_evaluator", "precision_recall_evaluator", "ctc_error_evaluator", + "chunk_evaluator", "sum_evaluator", "column_sum_evaluator", + "value_printer_evaluator", "gradient_printer_evaluator", + "maxid_printer_evaluator", "maxframe_printer_evaluator", + "seqtext_printer_evaluator", "classification_error_printer_evaluator" +] class EvaluatorAttribute(object): @@ -32,10 +33,7 @@ class EvaluatorAttribute(object): FOR_UTILS = 1 << 4 KEYS = [ - "for_classification", - "for_regression", - "for_rank", - "for_print", + "for_classification", "for_regression", "for_rank", "for_print", "for_utils" ] @@ -55,22 +53,23 @@ def evaluator(*attrs): setattr(method, EvaluatorAttribute.to_key(attr), True) method.is_evaluator = True return method + return impl -def evaluator_base( - input, - type, - label=None, - weight=None, - name=None, - chunk_scheme=None, - num_chunk_types=None, - classification_threshold=None, - positive_label=None, - dict_file=None, - result_file=None, - num_results=None, - delimited=None): + +def evaluator_base(input, + type, + label=None, + weight=None, + name=None, + chunk_scheme=None, + num_chunk_types=None, + classification_threshold=None, + positive_label=None, + dict_file=None, + result_file=None, + num_results=None, + delimited=None): """ Evaluator will evaluate the network status while training/testing. @@ -130,14 +129,14 @@ def evaluator_base( result_file=result_file, delimited=delimited) + @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION) @wrap_name_default() -def classification_error_evaluator( - input, - label, - name=None, - weight=None, - threshold=None): +def classification_error_evaluator(input, + label, + name=None, + weight=None, + threshold=None): """ Classification Error Evaluator. It will print error rate for classification. @@ -170,13 +169,14 @@ def classification_error_evaluator( :return: None. """ - evaluator_base(name=name, - type="classification_error", - input=input, - label=label, - weight=weight, - classification_threshold=threshold, - ) + evaluator_base( + name=name, + type="classification_error", + input=input, + label=label, + weight=weight, + classification_threshold=threshold, ) + @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION) @wrap_name_default() @@ -184,8 +184,7 @@ def auc_evaluator( input, label, name=None, - weight=None, - ): + weight=None, ): """ Auc Evaluator which adapts to binary classification. @@ -205,11 +204,13 @@ def auc_evaluator( [sample_num, 1]. 
:type weight: LayerOutput """ - evaluator_base(name=name, - type="last-column-auc", - input=input, - label=label, - weight=weight) + evaluator_base( + name=name, + type="last-column-auc", + input=input, + label=label, + weight=weight) + @evaluator(EvaluatorAttribute.FOR_RANK) @wrap_name_default() @@ -218,8 +219,7 @@ def pnpair_evaluator( label, info, name=None, - weight=None, - ): + weight=None, ): """ Positive-negative pair rate Evaluator which adapts to rank task like learning to rank. This evaluator must contain at least three layers. @@ -242,12 +242,14 @@ def pnpair_evaluator( [sample_num, 1]. (TODO, explaination) :type weight: LayerOutput """ - evaluator_base(name=name, - type="pnpair", - input=input, - label=label, - info=info, - weight=weight) + evaluator_base( + name=name, + type="pnpair", + input=input, + label=label, + info=info, + weight=weight) + @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION) @wrap_name_default() @@ -256,8 +258,7 @@ def precision_recall_evaluator( label, positive_label=None, weight=None, - name=None, - ): + name=None, ): """ An Evaluator to calculate precision and recall, F1-score. It is adapt to the task with multiple labels. @@ -286,20 +287,21 @@ def precision_recall_evaluator( [sample_num, 1]. (TODO, explaination) :type weight: LayerOutput """ - evaluator_base(name=name, - type="precision_recall", - input=input, - label=label, - positive_label=positive_label, - weight=weight) + evaluator_base( + name=name, + type="precision_recall", + input=input, + label=label, + positive_label=positive_label, + weight=weight) + @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION) @wrap_name_default() def ctc_error_evaluator( input, label, - name=None, - ): + name=None, ): """ This evaluator is to calculate sequence-to-sequence edit distance. @@ -317,10 +319,9 @@ def ctc_error_evaluator( label for ctc_layer :type label: LayerOutput """ - evaluator_base(name=name, - type="ctc_edit_distance", - input=input, - label=label) + evaluator_base( + name=name, type="ctc_edit_distance", input=input, label=label) + @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION) @wrap_name_default() @@ -328,8 +329,7 @@ def chunk_evaluator( input, name=None, chunk_scheme=None, - num_chunk_types=None, - ): + num_chunk_types=None, ): """ Chunk evaluator is used to evaluate segment labelling accuracy for a sequence. It calculates the chunk detection F1 score. @@ -375,19 +375,20 @@ def chunk_evaluator( :type chunk_scheme: basestring :param num_chunk_types: number of chunk types other than "other" """ - evaluator_base(name=name, - type="chunk", - input=input, - chunk_scheme=chunk_scheme, - num_chunk_types=num_chunk_types) + evaluator_base( + name=name, + type="chunk", + input=input, + chunk_scheme=chunk_scheme, + num_chunk_types=num_chunk_types) + @evaluator(EvaluatorAttribute.FOR_UTILS) @wrap_name_default() def sum_evaluator( input, name=None, - weight=None, - ): + weight=None, ): """ An Evaluator to sum the result of input. @@ -405,18 +406,15 @@ def sum_evaluator( [sample_num, 1]. (TODO, explaination) :type weight: LayerOutput """ - evaluator_base(name=name, - type="sum", - input=input, - weight=weight) + evaluator_base(name=name, type="sum", input=input, weight=weight) + @evaluator(EvaluatorAttribute.FOR_UTILS) @wrap_name_default() def column_sum_evaluator( input, name=None, - weight=None, - ): + weight=None, ): """ This Evaluator is used to sum the last column of input. @@ -431,22 +429,22 @@ def column_sum_evaluator( :param input: Input Layer name. 
:type input: LayerOutput """ - evaluator_base(name=name, - type="last-column-sum", - input=input, - weight=weight) + evaluator_base( + name=name, type="last-column-sum", input=input, weight=weight) + """ The following are printer Evaluators which are usually used to print the result, like value or gradient of input layers, the results generated in machine translation, the classification error etc. """ + + @evaluator(EvaluatorAttribute.FOR_PRINT) @wrap_name_default() def value_printer_evaluator( input, - name=None, - ): + name=None, ): """ This Evaluator is used to print the values of input layers. It contains one or more input layers. @@ -462,16 +460,14 @@ def value_printer_evaluator( :param name: Evaluator name. :type name: None|basestring """ - evaluator_base(name=name, - type="value_printer", - input=input) + evaluator_base(name=name, type="value_printer", input=input) + @evaluator(EvaluatorAttribute.FOR_PRINT) @wrap_name_default() def gradient_printer_evaluator( input, - name=None, - ): + name=None, ): """ This Evaluator is used to print the gradient of input layers. It contains one or more input layers. @@ -487,17 +483,15 @@ def gradient_printer_evaluator( :param name: Evaluator name. :type name: None|basestring """ - evaluator_base(name=name, - type="gradient_printer", - input=input) + evaluator_base(name=name, type="gradient_printer", input=input) + @evaluator(EvaluatorAttribute.FOR_PRINT) @wrap_name_default() def maxid_printer_evaluator( input, num_results=None, - name=None, - ): + name=None, ): """ This Evaluator is used to print maximum top k values and their indexes of each row of input layers. It contains one or more input layers. @@ -517,18 +511,16 @@ def maxid_printer_evaluator( :param name: Evaluator name. :type name: None|basestring """ - evaluator_base(name=name, - type="max_id_printer", - input=input, - num_results=num_results) + evaluator_base( + name=name, type="max_id_printer", input=input, num_results=num_results) + @evaluator(EvaluatorAttribute.FOR_PRINT) @wrap_name_default() def maxframe_printer_evaluator( input, num_results=None, - name=None, - ): + name=None, ): """ This Evaluator is used to print the top k frames of each input layers. The input layers should contain sequences info or sequences type. @@ -549,10 +541,12 @@ def maxframe_printer_evaluator( :param name: Evaluator name. :type name: None|basestring """ - evaluator_base(name=name, - type="max_frame_printer", - input=input, - num_results=num_results) + evaluator_base( + name=name, + type="max_frame_printer", + input=input, + num_results=num_results) + @evaluator(EvaluatorAttribute.FOR_PRINT) @wrap_name_default() @@ -562,8 +556,7 @@ def seqtext_printer_evaluator( id_input=None, dict_file=None, delimited=None, - name=None, - ): + name=None, ): """ Sequence text printer will print text according to index matrix and a dictionary. 
There can be multiple input to this layer: @@ -636,12 +629,14 @@ def seqtext_printer_evaluator( inputs = [id_input, input] input.parents.append(id_input) - evaluator_base(name=name, - type="seq_text_printer", - input=inputs, - dict_file=dict_file, - result_file=result_file, - delimited=delimited) + evaluator_base( + name=name, + type="seq_text_printer", + input=inputs, + dict_file=dict_file, + result_file=result_file, + delimited=delimited) + @evaluator(EvaluatorAttribute.FOR_PRINT) @wrap_name_default() @@ -649,8 +644,7 @@ def classification_error_printer_evaluator( input, label, threshold=0.5, - name=None, - ): + name=None, ): """ This Evaluator is used to print the classification error of each sample. @@ -667,8 +661,9 @@ def classification_error_printer_evaluator( :param name: Evaluator name. :type name: None|basestring """ - evaluator_base(name=name, - type="classification_error_printer", - input=input, - label=label, - classification_threshold=threshold) + evaluator_base( + name=name, + type="classification_error_printer", + input=input, + label=label, + classification_threshold=threshold) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 686704cb7c9b0bda20101f65b66f1657c9f63770..d984e843204c1cd99ee5b8941dc056c091504869 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -29,34 +29,84 @@ except ImportError: import pickle import copy -__all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", - "identity_projection", "dotmul_projection", "dotmul_operator", - "table_projection", "mixed_layer", "data_layer", - "embedding_layer", "fc_layer", "grumemory", - "pooling_layer", "lstmemory", "last_seq", "first_seq", - "cos_sim", "hsigmoid", - "regression_cost", 'classification_cost', "LayerOutput", - 'img_conv_layer', 'img_pool_layer', 'batch_norm_layer', - 'img_cmrnorm_layer', 'addto_layer', - 'concat_layer', 'lstm_step_layer', 'recurrent_group', - 'memory', 'StaticInput', 'expand_layer', 'scaling_layer', - 'power_layer', 'interpolation_layer', 'trans_layer', - 'sum_to_one_norm_layer', - 'get_output_layer', 'LayerType', 'context_projection', - 'beam_search', 'maxid_layer', 'GeneratedInput', 'SubsequenceInput', - 'gru_step_layer', 'recurrent_layer', - 'BaseGeneratedInput', 'conv_operator', 'conv_shift_layer', - 'tensor_layer', 'selective_fc_layer', 'sampling_id_layer', - 'slope_intercept_layer', 'trans_full_matrix_projection', - 'linear_comb_layer', - 'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer', - 'nce_layer', - 'cross_entropy_with_selfnorm', 'cross_entropy', - 'multi_binary_label_cross_entropy', - 'rank_cost', 'lambda_cost', 'huber_cost', - # 'block_expand_layer', # TODO(yuyang18): this layer is not correct - 'out_prod_layer', 'print_layer' - ] +__all__ = [ + "full_matrix_projection", + "AggregateLevel", + "ExpandLevel", + "identity_projection", + "dotmul_projection", + "dotmul_operator", + "repeat_layer", + "table_projection", + "mixed_layer", + "data_layer", + "embedding_layer", + "fc_layer", + "grumemory", + "pooling_layer", + "lstmemory", + "last_seq", + "first_seq", + "cos_sim", + "hsigmoid", + "conv_projection", + "regression_cost", + 'classification_cost', + "LayerOutput", + 'img_conv_layer', + 'img_pool_layer', + 'batch_norm_layer', + 'img_cmrnorm_layer', + 'addto_layer', + 'concat_layer', + 'lstm_step_layer', + 'recurrent_group', + 'memory', + 'StaticInput', + 'expand_layer', + 'scaling_layer', + 'scaling_projection', + 
'power_layer', + 'interpolation_layer', + 'bilinear_interp_layer', + 'trans_layer', + 'sum_to_one_norm_layer', + 'get_output_layer', + 'LayerType', + 'context_projection', + 'beam_search', + 'maxid_layer', + 'GeneratedInput', + 'SubsequenceInput', + 'gru_step_layer', + 'recurrent_layer', + 'BaseGeneratedInput', + 'conv_operator', + 'conv_shift_layer', + 'tensor_layer', + 'selective_fc_layer', + 'sampling_id_layer', + 'slope_intercept_layer', + 'trans_full_matrix_projection', + 'linear_comb_layer', + 'convex_comb_layer', + 'ctc_layer', + 'crf_layer', + 'crf_decoding_layer', + 'nce_layer', + 'cross_entropy_with_selfnorm', + 'cross_entropy', + 'multi_binary_label_cross_entropy', + 'sum_cost', + 'rank_cost', + 'lambda_cost', + 'huber_cost', + 'block_expand_layer', + 'maxout_layer', + 'out_prod_layer', + 'print_layer', + 'spp_layer', +] class LayerType(object): @@ -78,6 +128,7 @@ class LayerType(object): COSINE_SIM = 'cos' HSIGMOID = 'hsigmoid' CONV_LAYER = "conv" + CONVTRANS_LAYER = "convt" POOL_LAYER = "pool" BATCH_NORM_LAYER = 'batch_norm' NORM_LAYER = 'norm' @@ -93,10 +144,12 @@ class LayerType(object): EXPAND_LAYER = 'expand' INTERPOLATION_LAYER = 'interpolation' + BILINEAR_INTERP_LAYER = 'bilinear_interp' POWER_LAYER = 'power' SCALING_LAYER = 'scaling' TRANS_LAYER = 'trans' OUT_PROD_LAYER = 'out_prod' + FEATURE_MAP_EXPAND_LAYER = 'featmap_expand' MEMORY = 'memory' MAXID_LAYER = 'maxid' @@ -110,6 +163,8 @@ class LayerType(object): SLOPE_INTERCEPT_LAYER = "slope_intercept" LINEAR_COMBINATION_LAYER = "convex_comb" BLOCK_EXPAND = "blockexpand" + MAXOUT = "maxout" + SPP_LAYER = "spp" PRINT_LAYER = "print" @@ -125,6 +180,7 @@ class LayerType(object): CROSS_ENTROPY_WITH_SELFNORM = "multi_class_cross_entropy_with_selfnorm" SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy" MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy" + SUM_COST = "sum_cost" @staticmethod def is_layer_type(type_name): @@ -173,11 +229,19 @@ class LayerOutput(object): :type parents: list|tuple|collections.Sequence """ - def __init__(self, name, layer_type, parents=None, activation=None, - num_filters=None, img_norm_type=None, size=None, outputs=None, + def __init__(self, + name, + layer_type, + parents=None, + activation=None, + num_filters=None, + img_norm_type=None, + size=None, + outputs=None, reverse=None): assert isinstance(name, basestring) assert isinstance(layer_type, basestring) + assert size is not None assert LayerType.is_layer_type(layer_type) self.name = name self.layer_type = layer_type @@ -214,6 +278,7 @@ DEVICE = 'device' def layer_support(*attrs): attrs_list = list(attrs) attrs_list.append(DEVICE) + def decorator(method): @functools.wraps(method) def wrapper(*args, **kwargs): @@ -273,9 +338,8 @@ def full_matrix_projection(input, size=0, param_attr=None): :return: A FullMatrixProjection Object. :rtype: FullMatrixProjection """ - proj = FullMatrixProjection(input_layer_name=input.name, - size=size, - **param_attr.attr) + proj = FullMatrixProjection( + input_layer_name=input.name, size=size, **param_attr.attr) proj.origin = input return proj @@ -310,9 +374,8 @@ def trans_full_matrix_projection(input, size=0, param_attr=None): :return: A TransposedFullMatrixProjection Object. 
:rtype: TransposedFullMatrixProjection """ - proj = TransposedFullMatrixProjection(input_layer_name=input.name, - size=size, - **param_attr.attr) + proj = TransposedFullMatrixProjection( + input_layer_name=input.name, size=size, **param_attr.attr) proj.origin = input return proj @@ -356,9 +419,8 @@ def table_projection(input, size=0, param_attr=None): :return: A TableProjection Object. :rtype: TableProjection """ - proj = TableProjection(input_layer_name=input.name, - size=size, - **param_attr.attr) + proj = TableProjection( + input_layer_name=input.name, size=size, **param_attr.attr) proj.origin = input return proj @@ -397,19 +459,46 @@ def identity_projection(input, offset=None): :type input: LayerOutput :param offset: Offset, None if use default. :type offset: int - :return: A IdentityProjection or IdentityOffsetProjection Object + :return: A IdentityProjection or IdentityOffsetProjection object :rtype: IdentityProjection or IdentityOffsetProjection """ if offset is None: proj = IdentityProjection(input_layer_name=input.name) proj.origin = input else: - proj = IdentityOffsetProjection(input_layer_name=input.name, - offset=offset) + proj = IdentityOffsetProjection( + input_layer_name=input.name, offset=offset) proj.origin = input return proj +@wrap_param_attr_default() +def scaling_projection(input, param_attr=None): + """ + scaling_projection multiplies the input with a scalar parameter and add to + the output. + + .. math:: + out += w * in + + The example usage is: + + .. code-block:: python + + proj = scaling_projection(input=layer) + + :param input: Input Layer. + :type input: LayerOutput + :param param_attr: Parameter config, None if use default. + :type param_attr: ParameterAttribute + :return: A ScalingProjection object + :rtype: ScalingProjection + """ + proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr) + proj.origin = input + return proj + + @wrap_param_attr_default() def dotmul_projection(input, param_attr=None): """ @@ -434,9 +523,8 @@ def dotmul_projection(input, param_attr=None): :return: A DotMulProjection Object. :rtype: DotMulProjection """ - proj = DotMulProjection(input_layer_name=input.name, - size=input.size, - **param_attr.attr) + proj = DotMulProjection( + input_layer_name=input.name, size=input.size, **param_attr.attr) proj.origin = input return proj @@ -469,21 +557,22 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs): if 'x' in kwargs or 'y' in kwargs: logger.warning('x and y arguments for dotmul_operator is deprecated. ' 'Please use a and b as parameter.') - a = kwargs.get('x', a) # For Backward capacity. + a = kwargs.get('x', a) # For Backward capacity. b = kwargs.get('y', b) assert isinstance(a, LayerOutput) assert isinstance(b, LayerOutput) if a.size is not None and b.size is not None: assert a.size == b.size - op = DotMulOperator(input_layer_names=[a.name, b.name], - scale=scale) + op = DotMulOperator(input_layer_names=[a.name, b.name], scale=scale) op.origin = [a, b] return op @wrap_bias_attr_default(['padding_attr']) -def context_projection(input, context_len, context_start=None, +def context_projection(input, + context_len, + context_start=None, padding_attr=False): """ Context Projection. 
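The projection helpers above only take effect inside a mixed_layer. A minimal sketch (hypothetical input layers din and y, assuming y has size 256) that combines two of them, using the in-place `+=` introduced below:

.. code-block:: python

    with mixed_layer(size=256) as m:
        m += full_matrix_projection(input=din)
        m += scaling_projection(input=y)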
@@ -520,11 +609,12 @@ def context_projection(input, context_len, context_start=None, if trainable: extra_dict = padding_attr.attr - proj = ContextProjection(input_layer_name=input.name, - context_length=context_len, - context_start=context_start, - trainable_padding=trainable, - **extra_dict) + proj = ContextProjection( + input_layer_name=input.name, + context_length=context_len, + context_start=context_start, + trainable_padding=trainable, + **extra_dict) proj.origin = input return proj @@ -538,8 +628,7 @@ class MixedLayerType(LayerOutput): def __init__(self): Exception.__init__(self) - def __init__(self, name, size, act, bias_attr, layer_attr, - parents=None): + def __init__(self, name, size, act, bias_attr, layer_attr, parents=None): """ Ctor. :param name: layer name. @@ -556,14 +645,19 @@ class MixedLayerType(LayerOutput): :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute or None """ - LayerOutput.__init__(self, name, LayerType.MIXED_LAYER, parents, - size=size, activation=act) + LayerOutput.__init__( + self, + name, + LayerType.MIXED_LAYER, + parents, + size=size, + activation=act) self.bias_attr = bias_attr self.layer_attr = layer_attr self.inputs = [] self.finalized = False - def __add__(self, other): + def __iadd__(self, other): """ + += operator :param other: Other projection. @@ -589,21 +683,27 @@ class MixedLayerType(LayerOutput): def __exit__(self, *args, **kwargs): del args, kwargs # unused parameter to suppress warning assert len(self.inputs) != 0 - MixedLayer( + ml = MixedLayer( name=self.name, size=self.size, active_type=self.activation.name, bias=ParamAttr.to_bias(self.bias_attr), inputs=self.inputs, - **ExtraLayerAttribute.to_kwargs(self.layer_attr) - ) + **ExtraLayerAttribute.to_kwargs(self.layer_attr)) + # update the size which might be computed inside MixedLayer + # according to the operator's output size + self.size = ml.config.size @wrap_name_default("mixed") @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) @layer_support(ERROR_CLIPPING, DROPOUT) -def mixed_layer(size=0, input=None, name=None, act=None, bias_attr=False, +def mixed_layer(size=0, + input=None, + name=None, + act=None, + bias_attr=False, layer_attr=None): """ Mixed Layer. A mixed layer will add all inputs together, then activate. @@ -648,8 +748,12 @@ def mixed_layer(size=0, input=None, name=None, act=None, bias_attr=False, if input is None: return MixedLayerType(name, size, act, bias_attr, layer_attr) else: - with mixed_layer(name=name, size=size, act=act, bias_attr=bias_attr, - layer_attr=layer_attr) as m: + with mixed_layer( + name=name, + size=size, + act=act, + bias_attr=bias_attr, + layer_attr=layer_attr) as m: if isinstance(input, collections.Sequence): for each in input: m += each @@ -679,8 +783,11 @@ def data_layer(name, size, layer_attr=None): :return: LayerOutput object. :rtype: LayerOutput """ - Layer(type=LayerType.DATA, name=name, size=size, - **ExtraLayerAttribute.to_kwargs(layer_attr)) + Layer( + type=LayerType.DATA, + name=name, + size=size, + **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput(name, LayerType.DATA, size=size) @@ -706,9 +813,12 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): :return: LayerOutput object. 
:rtype: LayerOutput """ - with mixed_layer(name=name, size=size, act=LinearActivation(), - bias_attr=False, - layer_attr=layer_attr) as mix: + with mixed_layer( + name=name, + size=size, + act=LinearActivation(), + bias_attr=False, + layer_attr=layer_attr) as mix: mix += table_projection(input=input, size=size, param_attr=param_attr) return mix @@ -718,8 +828,13 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): @wrap_bias_attr_default() @wrap_act_default() @layer_support(ERROR_CLIPPING, DROPOUT) -def fc_layer(input, size, act=None, name=None, - param_attr=None, bias_attr=None, layer_attr=None): +def fc_layer(input, + size, + act=None, + name=None, + param_attr=None, + bias_attr=None, + layer_attr=None): """ Helper for declare fully connected layer. @@ -771,17 +886,17 @@ def fc_layer(input, size, act=None, name=None, assert isinstance(input, collections.Sequence) Layer( - inputs=[Input(ipt.name, **attr.attr) for ipt, attr in zip( - input, param_attr)], + inputs=[ + Input(ipt.name, **attr.attr) for ipt, attr in zip(input, param_attr) + ], name=name, type=LayerType.FC_LAYER, size=size, bias=ParamAttr.to_bias(bias_attr), active_type=act.name, - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.FC_LAYER, input, activation=act, - size=size) + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.FC_LAYER, input, activation=act, size=size) @wrap_name_default("print") @@ -804,8 +919,7 @@ def print_layer(input, name=None): Layer( name=name, type=LayerType.PRINT_LAYER, - inputs=[l.name for l in input], - ) + inputs=[l.name for l in input], ) # this layer don't return anything, can not be input of other layer. @@ -813,7 +927,10 @@ def print_layer(input, name=None): @wrap_bias_attr_default(has_bias=False) @wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling()) @layer_support() -def pooling_layer(input, pooling_type=None, name=None, bias_attr=None, +def pooling_layer(input, + pooling_type=None, + name=None, + bias_attr=None, agg_level=AggregateLevel.EACH_TIMESTEP, layer_attr=None): """ @@ -860,23 +977,27 @@ def pooling_layer(input, pooling_type=None, name=None, bias_attr=None, inputs=[Input(input.name)], bias=ParamAttr.to_bias(bias_attr), trans_type=agg_level, - **extra_dict - ) + **extra_dict) - return LayerOutput(name, pooling_type.name, parents=[input], - size=input.size) + return LayerOutput( + name, pooling_type.name, parents=[input], size=input.size) @wrap_bias_attr_default() @wrap_param_attr_default() -@wrap_act_default(param_names=['gate_act'], - act=SigmoidActivation()) +@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation()) @wrap_name_default("lstmemory") @layer_support(DROPOUT) -def lstmemory(input, name=None, reverse=False, act=None, - gate_act=None, size=None, - state_act=None, bias_attr=None, param_attr=None, +def lstmemory(input, + name=None, + reverse=False, + act=None, + gate_act=None, + size=None, + state_act=None, + bias_attr=None, + param_attr=None, layer_attr=None): """ Long Short-term Memory Cell. @@ -951,30 +1072,38 @@ def lstmemory(input, name=None, reverse=False, act=None, "layer. The lstm size should be equal with input layer size/4. The" " size which is set explicitly will be ignored." 
% name) - Layer(name=name, - type=LayerType.LSTMEMORY, - active_type=act.name, - active_state_type=state_act.name, - active_gate_type=gate_act.name, - reversed=reverse, - bias=ParamAttr.to_bias(bias_attr), - inputs=[Input(input.name, **param_attr.attr)], - **ExtraLayerAttribute.to_kwargs(layer_attr)) + Layer( + name=name, + type=LayerType.LSTMEMORY, + active_type=act.name, + active_state_type=state_act.name, + active_gate_type=gate_act.name, + reversed=reverse, + bias=ParamAttr.to_bias(bias_attr), + inputs=[Input(input.name, **param_attr.attr)], + **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name, LayerType.LSTMEMORY, [input], size=input.size / 4, - reverse=reverse) + return LayerOutput( + name, + LayerType.LSTMEMORY, [input], + size=input.size / 4, + reverse=reverse) @wrap_bias_attr_default() @wrap_param_attr_default() -@wrap_act_default(param_names=['gate_act'], - act=SigmoidActivation()) +@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act"], act=TanhActivation()) @wrap_name_default("gru") @layer_support(DROPOUT) -def grumemory(input, name=None, reverse=False, act=None, - gate_act=None, size=None, - bias_attr=None, param_attr=None, +def grumemory(input, + name=None, + reverse=False, + act=None, + gate_act=None, + size=None, + bias_attr=None, + param_attr=None, layer_attr=None): """ Gate Recurrent Unit Layer. @@ -1065,23 +1194,28 @@ def grumemory(input, name=None, reverse=False, act=None, " and should be input size / 3. Set size explicitly will be " "ignored.") - Layer(name=name, - type=LayerType.GRUMEMORY, - active_type=act.name, - active_gate_type=gate_act.name, - reversed=reverse, - bias=ParamAttr.to_bias(bias_attr), - inputs=[Input(input.name, **param_attr.attr)], - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) + Layer( + name=name, + type=LayerType.GRUMEMORY, + active_type=act.name, + active_gate_type=gate_act.name, + reversed=reverse, + bias=ParamAttr.to_bias(bias_attr), + inputs=[Input(input.name, **param_attr.attr)], + **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name, LayerType.GRUMEMORY, [input], size=input.size / 3, - reverse=reverse) + return LayerOutput( + name, + LayerType.GRUMEMORY, [input], + size=input.size / 3, + reverse=reverse) @wrap_name_default() @layer_support() -def last_seq(input, name=None, agg_level=AggregateLevel.EACH_TIMESTEP, +def last_seq(input, + name=None, + agg_level=AggregateLevel.EACH_TIMESTEP, layer_attr=None): """ Get Last Timestamp Activation of a sequence. @@ -1107,15 +1241,19 @@ def last_seq(input, name=None, agg_level=AggregateLevel.EACH_TIMESTEP, type=LayerType.SEQUENCE_LAST_INSTANCE, inputs=[input.name], trans_type=agg_level, - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.SEQUENCE_LAST_INSTANCE, parents=[input], - size=input.size) + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, + LayerType.SEQUENCE_LAST_INSTANCE, + parents=[input], + size=input.size) @wrap_name_default() @layer_support() -def first_seq(input, name=None, agg_level=AggregateLevel.EACH_TIMESTEP, +def first_seq(input, + name=None, + agg_level=AggregateLevel.EACH_TIMESTEP, layer_attr=None): """ Get First Timestamp Activation of a sequence. 
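Both recurrent memories reduce the width of their input, since the input is expected to carry the pre-computed gate projections: lstmemory returns input.size / 4 and grumemory returns input.size / 3. A hedged sketch with hypothetical layers:

.. code-block:: python

    din = data_layer(name='data', size=256)
    proj = fc_layer(input=din, size=512, act=LinearActivation())
    lstm = lstmemory(input=proj)  # output size is 512 / 4 = 128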
@@ -1142,10 +1280,12 @@ def first_seq(input, name=None, agg_level=AggregateLevel.EACH_TIMESTEP, type=LayerType.SEQUENCE_FIRST_INSTANCE, inputs=[input.name], trans_type=agg_level, - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.SEQUENCE_FIRST_INSTANCE, - parents=[input], size=input.size) + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, + LayerType.SEQUENCE_FIRST_INSTANCE, + parents=[input], + size=input.size) class ExpandLevel(object): @@ -1155,7 +1295,8 @@ class ExpandLevel(object): @wrap_name_default() @layer_support() -def expand_layer(input, expand_as, +def expand_layer(input, + expand_as, name=None, bias_attr=False, expand_level=ExpandLevel.FROM_TIMESTEP, @@ -1195,12 +1336,53 @@ def expand_layer(input, expand_as, bias=ParamAttr.to_bias(bias_attr=bias_attr), type=LayerType.EXPAND_LAYER, trans_type=expand_level, - **ExtraAttr.to_kwargs(layer_attr) - ) - return LayerOutput(name=name, - size=input.size, - layer_type=LayerType.EXPAND_LAYER, - parents=[input, expand_as]) + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + size=input.size, + layer_type=LayerType.EXPAND_LAYER, + parents=[input, expand_as]) + + +@wrap_name_default() +@layer_support() +def repeat_layer(input, num_repeats, name=None, layer_attr=None): + """ + A layer that repeats the input num_repeats times. This is equivalent + to applying concat_layer() to num_repeats copies of the same input. + + .. math:: + y = [x, x, \cdots, x] + + The example usage is: + + .. code-block:: python + + expand = repeat_layer(layer, 4) + + :param input: Input layer + :type input: LayerOutput + :param num_repeats: number of times to repeat the input + :type num_repeats: int + :param name: Layer name. + :type name: basestring + :param layer_attr: extra layer attributes. + :type layer_attr: ExtraLayerAttribute. + :return: LayerOutput object. + :rtype: LayerOutput + """ + + l = Layer( + inputs=[input.name], + name=name, + num_filters=num_repeats, + type=LayerType.FEATURE_MAP_EXPAND_LAYER, + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + size=l.config.size, + layer_type=LayerType.FEATURE_MAP_EXPAND_LAYER, + parents=[input]) @wrap_name_default() @@ -1247,11 +1429,66 @@ def interpolation_layer(input, weight, name=None, layer_attr=None): name=name, type=LayerType.INTERPOLATION_LAYER, inputs=[weight.name, input[0].name, input[1].name], - **ExtraAttr.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.INTERPOLATION_LAYER, - parents=[weight, input[0], input[1]], - size=input[0].size) + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name, + LayerType.INTERPOLATION_LAYER, + parents=[weight, input[0], input[1]], + size=input[0].size) + + +@wrap_name_default() +@layer_support() +def bilinear_interp_layer(input, + out_size_x=None, + out_size_y=None, + name=None, + layer_attr=None): + """ + This layer implements bilinear interpolation on the output of a conv layer. + + Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation + + The simple usage is: + + .. code-block:: python + + bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64) + + :param input: An input layer. + :type input: LayerOutput. + :param out_size_x: bilinear interpolation output width. + :type out_size_x: int|None + :param out_size_y: bilinear interpolation output height. + :type out_size_y: int|None + :param name: The layer's name, which can be left unspecified. + :type name: None|basestring + :param layer_attr: Extra Layer attribute.
+ :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert input.layer_type == LayerType.CONV_LAYER + assert isinstance(input.activation, LinearActivation) + assert out_size_x > 0 and out_size_y > 0 + assert input.num_filters is not None + num_channels = input.num_filters + l = Layer( + name=name, + inputs=Input( + input.name, + bilinear_interp=BilinearInterp( + out_size_x=out_size_x, + out_size_y=out_size_y, + num_channels=num_channels)), + type=LayerType.BILINEAR_INTERP_LAYER, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, + LayerType.BILINEAR_INTERP_LAYER, + parents=[input], + num_filters=num_channels, + size=l.config.size) @wrap_name_default() @@ -1291,10 +1528,9 @@ def power_layer(input, weight, name=None, layer_attr=None): name=name, type=LayerType.POWER_LAYER, inputs=[weight.name, input.name], - **ExtraAttr.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.POWER_LAYER, - parents=[input, weight], size=input.size) + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.POWER_LAYER, parents=[input, weight], size=input.size) @wrap_name_default() @@ -1336,10 +1572,9 @@ def scaling_layer(input, weight, name=None, layer_attr=None): name=name, type=LayerType.SCALING_LAYER, inputs=[weight.name, input.name], - **ExtraAttr.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.SCALING_LAYER, parents=[weight, input], - size=input.size) + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.SCALING_LAYER, parents=[weight, input], size=input.size) @wrap_name_default() @@ -1372,10 +1607,9 @@ def trans_layer(input, name=None, layer_attr=None): name=name, type=LayerType.TRANS_LAYER, inputs=[input.name], - **ExtraAttr.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.TRANS_LAYER, parents=[input], - size=input.size) + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.TRANS_LAYER, parents=[input], size=input.size) @wrap_name_default() @@ -1417,8 +1651,7 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None): type=LayerType.COSINE_SIM, cos_scale=scale, inputs=[a.name, b.name], - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) + **ExtraLayerAttribute.to_kwargs(layer_attr)) else: if a.size is not None and b.size is not None: assert size == b.size / a.size @@ -1428,17 +1661,21 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None): size=size, cos_scale=scale, inputs=[a.name, b.name], - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b]) + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b], size=size) @wrap_name_default() @wrap_bias_attr_default(has_bias=True) @wrap_param_attr_default() @layer_support() -def hsigmoid(input, label, num_classes, name=None, bias_attr=None, - param_attr=None, layer_attr=None): +def hsigmoid(input, + label, + num_classes, + name=None, + bias_attr=None, + param_attr=None, + layer_attr=None): """ Organize the classes into a binary tree. At each node, a sigmoid function is used to calculate the probability of belonging to the right branch. 
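A hedged usage sketch for hsigmoid (hypothetical feature and label layers); note that with this change the returned LayerOutput now carries a size taken from the generated config:

.. code-block:: python

    feature = fc_layer(input=din, size=128)
    cost = hsigmoid(input=feature, label=lbl, num_classes=10)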
@@ -1493,15 +1730,15 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, ipts_for_layer.append(label.name) parents.append(label) - Layer( + l = Layer( name=name, type=LayerType.HSIGMOID, num_classes=num_classes, bias=ParamAttr.to_bias(bias_attr), inputs=ipts_for_layer, - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.HSIGMOID, parents=parents) + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.HSIGMOID, parents=parents, size=l.config.size) @wrap_name_default("conv") @@ -1509,11 +1746,23 @@ def hsigmoid(input, label, num_classes, name=None, bias_attr=None, @wrap_bias_attr_default() @wrap_act_default(act=ReluActivation()) @layer_support(DROPOUT) -def img_conv_layer(input, filter_size, num_filters, - name=None, num_channels=None, - act=None, groups=1, stride=1, padding=0, bias_attr=None, - param_attr=None, shared_biases=True, layer_attr=None, - filter_size_y=None, stride_y=None, padding_y=None): +def img_conv_layer(input, + filter_size, + num_filters, + name=None, + num_channels=None, + act=None, + groups=1, + stride=1, + padding=0, + bias_attr=None, + param_attr=None, + shared_biases=True, + layer_attr=None, + filter_size_y=None, + stride_y=None, + padding_y=None, + trans=False): """ Convolution layer for image. Paddle only support square input currently and thus input image's width equals height. @@ -1522,6 +1771,13 @@ def img_conv_layer(input, filter_size, num_filters, `_ . + Convolution Transpose (deconv) layer for image. Paddle only support square + input currently and thus input image's width equals height. + + The details of convolution transpose layer, + please refer to the following explanation and references therein + `_ . The num_channel means input image's channel number. It may be 1 or 3 when input is raw pixels of image(mono or RGB), or it may be the previous layer's num_filters * num_group. @@ -1571,6 +1827,8 @@ def img_conv_layer(input, filter_size, num_filters, :type shared_biases: bool :param layer_attr: Layer Extra Attribute. :type layer_attr: ExtraLayerAttribute + :param trans: true if it is a convTransLayer, false if it is a convLayer + :type trans: bool :return: LayerOutput object. :rtype: LayerOutput """ @@ -1601,36 +1859,56 @@ def img_conv_layer(input, filter_size, num_filters, if param_attr.attr.get('initial_smart'): # special initial for conv layers. 
- init_w = (2.0 / (filter_size ** 2 * num_channels)) ** 0.5 + init_w = (2.0 / (filter_size**2 * num_channels))**0.5 param_attr.attr["initial_mean"] = 0.0 param_attr.attr["initial_std"] = init_w param_attr.attr["initial_strategy"] = 0 param_attr.attr["initial_smart"] = False - Layer( + + lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER + + l = Layer( name=name, - inputs=Input(input.name, conv=Conv( - filter_size=filter_size, padding=padding, stride=stride, - channels=num_channels, groups=groups, - filter_size_y=filter_size_y, padding_y=padding_y, - stride_y=stride_y), - **param_attr.attr), + inputs=Input( + input.name, + conv=Conv( + filter_size=filter_size, + padding=padding, + stride=stride, + channels=num_channels, + groups=groups, + filter_size_y=filter_size_y, + padding_y=padding_y, + stride_y=stride_y), + **param_attr.attr), active_type=act.name, num_filters=num_filters, bias=ParamAttr.to_bias(bias_attr), shared_biases=shared_biases, - type=LayerType.CONV_LAYER, - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.CONV_LAYER, parents=[input], - activation=act, num_filters=num_filters) + type=lt, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, + lt, + parents=[input], + activation=act, + num_filters=num_filters, + size=l.config.size) @wrap_name_default("pool") @layer_support() -def img_pool_layer(input, pool_size, name=None, - num_channels=None, pool_type=None, - stride=1, start=None, padding=0, layer_attr=None, - pool_size_y=None, stride_y=None, padding_y=None, +def img_pool_layer(input, + pool_size, + name=None, + num_channels=None, + pool_type=None, + stride=1, + padding=0, + layer_attr=None, + pool_size_y=None, + stride_y=None, + padding_y=None, img_width=None): """ Image pooling Layer. @@ -1653,15 +1931,13 @@ def img_pool_layer(input, pool_size, name=None, :type pool_size_y: int|None :param num_channels: number of input channel. :type num_channels: int - :param pool_type: pooling type. MaxPooling or AveragePooling. Default is + :param pool_type: pooling type. MaxPooling or AvgPooling. Default is MaxPooling. :type pool_type: BasePoolingType :param stride: stride width of pooling. :type stride: int :param stride_y: stride height of pooling. It is equal to stride by default. :type stride_y: int|None - :param start: start position of pooling operation. Note it is deprecated now. - :type start: int|None :param layer_attr: Extra Layer attribute. :type layer_attr: ExtraLayerAttribute :param img_width: the width of input feature map. 
If it is None, the input feature @@ -1687,51 +1963,133 @@ def img_pool_layer(input, pool_size, name=None, stride_y = stride if stride_y is None else stride_y padding_y = padding if padding_y is None else padding_y - Layer( + l = Layer( name=name, type=LayerType.POOL_LAYER, - inputs=[Input(input.name, - pool=Pool( - pool_type=type_name, - channels=num_channels, - size_x=pool_size, - start=start, - stride=stride, - padding=padding, - size_y=pool_size_y, - stride_y=stride_y, - padding_y=padding_y, - img_width=img_width - ))], - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.POOL_LAYER, parents=[input], - num_filters=num_channels) - - -def __img_norm_layer__(name, input, size, norm_type, scale, power, - num_channels, blocked, layer_attr): + inputs=[ + Input( + input.name, + pool=Pool( + pool_type=type_name, + channels=num_channels, + size_x=pool_size, + start=None, + stride=stride, + padding=padding, + size_y=pool_size_y, + stride_y=stride_y, + padding_y=padding_y, + img_width=img_width)) + ], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, + LayerType.POOL_LAYER, + parents=[input], + num_filters=num_channels, + size=l.config.size) + + +@wrap_name_default("spp") +@layer_support() +def spp_layer(input, + name=None, + num_channels=None, + pool_type=None, + pyramid_height=None, + img_width=None, + layer_attr=None): + """ + Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition. + The details please refer to + `Kaiming He's paper `_. + + :param name: layer name. + :type name: basestring + :param input: layer's input. + :type input: LayerOutput + :param num_channels: number of input channel. + :type num_channels: int + :param pool_type: Pooling type. MaxPooling or AveragePooling. Default is MaxPooling. + :type scale: BasePoolingType + :param pyramid_height: pyramid height. + :type pyramid_height: int + :param img_width: the width of input feature map. If it is None, the input feature + map should be square. + :type img_width: int|None + :param layer_attr: Extra Layer Attribute. + :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. 
+ :rtype: LayerOutput + """ if num_channels is None: assert input.num_filters is not None num_channels = input.num_filters - Layer( - name=name, type=LayerType.NORM_LAYER, inputs=Input( - input.name, norm=Norm(norm_type=norm_type, - channels=num_channels, size=size, - scale=scale, - pow=power, blocked=blocked) - ), - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) - return LayerOutput(name, layer_type=LayerType.NORM_LAYER, parents=[input], - num_filters=num_channels, img_norm_type=norm_type) + if pool_type is None: + pool_type = MaxPooling() + elif isinstance(pool_type, AvgPooling): + pool_type.name = 'avg' + + type_name = pool_type.name + if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)): + type_name += '-projection' + + l = Layer( + name=name, + type=LayerType.SPP_LAYER, + inputs=Input( + input.name, + spp=SpatialPyramidPool( + pool_type=type_name, + channels=num_channels, + pyramid_height=pyramid_height, + img_width=img_width)), + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, + layer_type=LayerType.SPP_LAYER, + parents=[input], + num_filters=num_channels, + size=l.config.size) + + +def __img_norm_layer__(name, input, size, norm_type, scale, power, num_channels, + blocked, layer_attr): + if num_channels is None: + assert input.num_filters is not None + num_channels = input.num_filters + + l = Layer( + name=name, + type=LayerType.NORM_LAYER, + inputs=Input( + input.name, + norm=Norm( + norm_type=norm_type, + channels=num_channels, + size=size, + scale=scale, + pow=power, + blocked=blocked)), + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, + layer_type=LayerType.NORM_LAYER, + parents=[input], + num_filters=num_channels, + img_norm_type=norm_type, + size=l.config.size) @wrap_name_default("crmnorm") @layer_support() -def img_cmrnorm_layer(input, size, scale=0.0128, power=0.75, - name=None, num_channels=None, +def img_cmrnorm_layer(input, + size, + scale=0.0128, + power=0.75, + name=None, + num_channels=None, layer_attr=None): """ Response normalization across feature maps. 
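The new spp_layer pools each feature map at several pyramid levels and concatenates the results, so the output size is independent of the input image size. A minimal sketch, assuming a hypothetical preceding conv layer:

.. code-block:: python

    conv = img_conv_layer(input=img, filter_size=3, num_filters=64,
                          num_channels=3, padding=1)
    spp = spp_layer(input=conv, pyramid_height=3, pool_type=MaxPooling())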
@@ -1765,8 +2123,13 @@ def img_cmrnorm_layer(input, size, scale=0.0128, power=0.75, @wrap_act_default(act=ReluActivation()) @wrap_name_default("batch_norm") @layer_support(DROPOUT) -def batch_norm_layer(input, act=None, name=None, num_channels=None, - bias_attr=None, param_attr=None, layer_attr=None, +def batch_norm_layer(input, + act=None, + name=None, + num_channels=None, + bias_attr=None, + param_attr=None, + layer_attr=None, batch_norm_type=None, moving_average_fraction=0.9, use_global_stats=None): @@ -1850,23 +2213,25 @@ def batch_norm_layer(input, act=None, name=None, num_channels=None, num_channels = input.size assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \ (batch_norm_type == "cudnn_batch_norm") - Layer( + l = Layer( name=name, - inputs=Input(input.name, - image=Image(channels=num_channels), - **param_attr.attr), + inputs=Input( + input.name, image=Image(channels=num_channels), **param_attr.attr), active_type=act.name, type=LayerType.BATCH_NORM_LAYER, batch_norm_type=batch_norm_type, bias=ParamAttr.to_bias(bias_attr), moving_average_fraction=moving_average_fraction, use_global_stats=use_global_stats, - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) + **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name=name, layer_type=LayerType.BATCH_NORM_LAYER, - parents=[input], activation=act, - num_filters=num_channels) + return LayerOutput( + name=name, + layer_type=LayerType.BATCH_NORM_LAYER, + parents=[input], + activation=act, + num_filters=num_channels, + size=l.config.size) @wrap_name_default() @@ -1901,18 +2266,16 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): name=name, type=LayerType.SUM_TO_ONE_NORM_LAYER, inputs=[input.name], - **ExtraAttr.to_kwargs(layer_attr) - ) - return LayerOutput(name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], - size=input.size) + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size) @wrap_name_default("addto") @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) @layer_support(DROPOUT) -def addto_layer(input, act=None, name=None, bias_attr=None, - layer_attr=None): +def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): """ AddtoLayer. @@ -1971,21 +2334,27 @@ def addto_layer(input, act=None, name=None, bias_attr=None, if each_input.num_filters is not None: num_filters = each_input.num_filters - Layer( - name=name, type=LayerType.ADDTO_LAYER, inputs=ipts_for_layer, + l = Layer( + name=name, + type=LayerType.ADDTO_LAYER, + inputs=ipts_for_layer, bias=ParamAttr.to_bias(bias_attr), active_type=act.name, - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) + **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name, LayerType.ADDTO_LAYER, parents=input, - activation=act, num_filters=num_filters) + return LayerOutput( + name, + LayerType.ADDTO_LAYER, + parents=input, + activation=act, + num_filters=num_filters, + size=l.config.size) @wrap_act_default(act=IdentityActivation()) @wrap_name_default("concat") @layer_support() -def concat_layer(input, act=None, name=None, layer_attr=None): +def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): """ Concat all input vector into one huge vector. Inputs can be list of LayerOutput or list of projection. 
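addto_layer sums inputs of identical size element-wise, while concat_layer joins them along the feature dimension. A hedged sketch with two hypothetical branches of equal size:

.. code-block:: python

    summed = addto_layer(input=[branch1, branch2], act=ReluActivation())
    joined = concat_layer(input=[branch1, branch2])  # size is the sum of the input sizes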
@@ -2038,18 +2407,22 @@ def concat_layer(input, act=None, name=None, layer_attr=None): LayerOutput) return a - is_concat_layer = __is_type__(reduce(__reduce_concat_type__, - map(type, input)), LayerOutput) + is_concat_layer = __is_type__( + reduce(__reduce_concat_type__, map(type, input)), LayerOutput) - layer_type = (LayerType.CONCAT_LAYER if is_concat_layer - else LayerType.CONCAT_PROJ_LAYER) + layer_type = (LayerType.CONCAT_LAYER + if is_concat_layer else LayerType.CONCAT_PROJ_LAYER) + + if layer_type == LayerType.CONCAT_LAYER: + assert not bias_attr Layer( - name=name, type=layer_type, + name=name, + type=layer_type, inputs=[x.name for x in input] if is_concat_layer else input, active_type=act.name, - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) + bias=ParamAttr.to_bias(bias_attr), + **ExtraLayerAttribute.to_kwargs(layer_attr)) sz = 0 for each_input in input: @@ -2059,14 +2432,20 @@ def concat_layer(input, act=None, name=None, layer_attr=None): sz = None break - return LayerOutput(name, layer_type=layer_type, - parents=input if is_concat_layer else [ - x.origin for x in input], - activation=act, size=sz) - - -def memory(name, size, is_seq=False, boot_layer=None, - boot_bias=None, boot_bias_active_type=None, + return LayerOutput( + name, + layer_type=layer_type, + parents=input if is_concat_layer else [x.origin for x in input], + activation=act, + size=sz) + + +def memory(name, + size, + is_seq=False, + boot_layer=None, + boot_bias=None, + boot_bias_active_type=None, boot_with_const_id=None): """ The memory layers is a layer cross each time step. Reference this output @@ -2114,30 +2493,33 @@ def memory(name, size, is_seq=False, boot_layer=None, assert boot_layer is None or isinstance(boot_layer, LayerOutput) - agent_name = Memory(name, size, - is_seq, - boot_layer.name if boot_layer is not None else None, - boot_bias, - boot_bias_active_type.name, - boot_with_const_id) - - lout = LayerOutput(name=agent_name, size=size, - layer_type=LayerType.MEMORY, - parents=[boot_layer] if boot_layer is not None - else None) + agent_name = Memory(name, size, is_seq, boot_layer.name + if boot_layer is not None else None, boot_bias, + boot_bias_active_type.name, boot_with_const_id) + + lout = LayerOutput( + name=agent_name, + size=size, + layer_type=LayerType.MEMORY, + parents=[boot_layer] if boot_layer is not None else None) return lout @wrap_bias_attr_default() -@wrap_act_default(param_names=['gate_act', - 'state_act'], - act=SigmoidActivation()) +@wrap_act_default( + param_names=['gate_act', 'state_act'], act=SigmoidActivation()) @wrap_act_default(act=TanhActivation()) @wrap_name_default('lstm_step') @layer_support() -def lstm_step_layer(input, state, size, act=None, - name=None, gate_act=None, state_act=None, - bias_attr=None, layer_attr=None): +def lstm_step_layer(input, + state, + size, + act=None, + name=None, + gate_act=None, + state_act=None, + bias_attr=None, + layer_attr=None): """ LSTM Step Layer. It used in recurrent_group. The lstm equations are shown as follow. 
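memory() reads the previous time step's value of the layer that shares its name, which is the usual way to close a recurrence inside a step function. A hedged sketch of a plain RNN step (hypothetical sequence input and sizes):

.. code-block:: python

    def step(x):
        prev = memory(name='rnn_state', size=128)
        return fc_layer(input=[x, prev], size=128, name='rnn_state')

    out = recurrent_group(step=step, input=din)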
@@ -2204,24 +2586,32 @@ def lstm_step_layer(input, state, size, act=None, active_gate_type=gate_act.name, active_state_type=state_act.name, bias=ParamAttr.to_bias(bias_attr), - size=size, inputs=[input.name, state.name], - **ExtraLayerAttribute.to_kwargs(layer_attr) - ) + size=size, + inputs=[input.name, state.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name=name, layer_type=LayerType.LSTM_STEP_LAYER, - parents=[input, state], activation=act, - size=size, outputs=['default', 'state']) + return LayerOutput( + name=name, + layer_type=LayerType.LSTM_STEP_LAYER, + parents=[input, state], + activation=act, + size=size, + outputs=['default', 'state']) @wrap_bias_attr_default() -@wrap_act_default(param_names=['gate_act'], - act=SigmoidActivation()) +@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(act=TanhActivation()) @wrap_name_default('gru_step') @layer_support() -def gru_step_layer(input, output_mem, size=None, act=None, - name=None, gate_act=None, - bias_attr=None, layer_attr=None): +def gru_step_layer(input, + output_mem, + size=None, + act=None, + name=None, + gate_act=None, + bias_attr=None, + layer_attr=None): """ :param input: @@ -2242,20 +2632,18 @@ def gru_step_layer(input, output_mem, size=None, act=None, Layer( name=name, type=LayerType.GRU_STEP_LAYER, - inputs=[ - input.name, - output_mem.name - ], + inputs=[input.name, output_mem.name], bias=ParamAttr.to_bias(bias_attr), size=size, active_type=act.name, active_gate_type=gate_act.name, - **ExtraAttr.to_kwargs(layer_attr) - ) + **ExtraAttr.to_kwargs(layer_attr)) return LayerOutput( - name=name, layer_type=LayerType.GRU_STEP_LAYER, + name=name, + layer_type=LayerType.GRU_STEP_LAYER, parents=[input, output_mem], - size=size, activation=act) + size=size, + activation=act) @wrap_name_default() @@ -2283,13 +2671,19 @@ def get_output_layer(input, arg_name, name=None, layer_attr=None): ' The get output name is %s, which not' \ ' in %s' % ( arg_name, ",".join(input.outputs)) - Layer(name=name, type=LayerType.GET_OUTPUT_LAYER, - inputs=[Input(input.name, input_layer_argument=arg_name)], - size=input.size, - **ExtraLayerAttribute.to_kwargs(layer_attr)) + Layer( + name=name, + type=LayerType.GET_OUTPUT_LAYER, + inputs=[Input( + input.name, input_layer_argument=arg_name)], + size=input.size, + **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name=name, layer_type=LayerType.GET_OUTPUT_LAYER, - parents=[input], size=input.size) + return LayerOutput( + name=name, + layer_type=LayerType.GET_OUTPUT_LAYER, + parents=[input], + size=input.size) @wrap_name_default() @@ -2297,8 +2691,13 @@ def get_output_layer(input, arg_name, name=None, layer_attr=None): @wrap_bias_attr_default() @wrap_param_attr_default() @layer_support() -def recurrent_layer(input, act=None, bias_attr=None, - param_attr=None, name=None, reverse=False, layer_attr=None): +def recurrent_layer(input, + act=None, + bias_attr=None, + param_attr=None, + name=None, + reverse=False, + layer_attr=None): """ Simple recurrent unit layer. It is just a fully connect layer through both time and neural network. @@ -2333,16 +2732,21 @@ def recurrent_layer(input, act=None, bias_attr=None, :return: LayerOutput object. 
:rtype: LayerOutput """ - Layer(name=name, - type=LayerType.RECURRENT_LAYER, - inputs=Input(input.name, **param_attr.attr), - active_type=act.name, - bias=ParamAttr.to_bias(bias_attr), - reversed=reverse, - **ExtraAttr.to_kwargs(layer_attr)) - return LayerOutput(name=name, layer_type=LayerType.RECURRENT_LAYER, - parents=[input], size=input.size, activation=act, - reverse=reverse) + Layer( + name=name, + type=LayerType.RECURRENT_LAYER, + inputs=Input(input.name, **param_attr.attr), + active_type=act.name, + bias=ParamAttr.to_bias(bias_attr), + reversed=reverse, + **ExtraAttr.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + layer_type=LayerType.RECURRENT_LAYER, + parents=[input], + size=input.size, + activation=act, + reverse=reverse) class StaticInput(object): @@ -2378,7 +2782,12 @@ class SubsequenceInput(object): @wrap_name_default("recurrent_group") -def recurrent_group(step, input, reverse=False, name=None, targetInlink=None): +def recurrent_group(step, + input, + reverse=False, + name=None, + targetInlink=None, + is_generating=False): """ Recurrent layer group is an extremely flexible recurrent unit in PaddlePaddle. As long as the user defines the calculation done within a @@ -2443,6 +2852,12 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None): :type targetInlink: LayerOutput|SubsequenceInput + :param is_generating: If true, the group is used for generation: none of the inputs should be a LayerOutput; otherwise (for training or testing), at least one of the inputs must + be a LayerOutput. + + :type is_generating: bool + :return: LayerOutput object. :rtype: LayerOutput """ @@ -2470,7 +2885,7 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None): return True return False - assert(targetInlink == None or targetInlink_in_inlinks()) + assert (targetInlink == None or targetInlink_in_inlinks()) targetInlinkName = None if targetInlink == None \ else targetInlink.name if isinstance(targetInlink, LayerOutput) \ else targetInlink.input.name @@ -2485,10 +2900,12 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None): return x.name RecurrentLayerGroupWithoutOutLinksBegin( - name=name, in_links=map(map_in_links, in_links), + name=name, + in_links=map(map_in_links, in_links), seq_reversed=reverse, target_inlinkname=targetInlinkName) in_args = [] + has_LayerOutput = True for each_input in input: assert is_single_input(each_input) if isinstance(each_input, LayerOutput): @@ -2496,16 +2913,22 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None): elif isinstance(each_input, SubsequenceInput): in_args.append(each_input.input) else: + has_LayerOutput = False mem_name = "__%s_memory__" % each_input.input.name - mem = memory(name=mem_name, - is_seq=each_input.is_seq, - size=each_input.input.size, - boot_layer=each_input.input) - with mixed_layer(name=mem_name, size=each_input.input.size, - act=IdentityActivation()) as mix: + mem = memory( + name=mem_name, + is_seq=each_input.is_seq, + size=each_input.input.size, + boot_layer=each_input.input) + with mixed_layer( + name=mem_name, + size=each_input.input.size, + act=IdentityActivation()) as mix: mix += identity_projection(mem) in_args.append(mem) + assert (is_generating != has_LayerOutput) + layer_outs = step(*in_args) if isinstance(layer_outs, LayerOutput): @@ -2544,14 +2967,15 @@ class GeneratedInput(BaseGeneratedInput): return maxid_layer(input=input, name='__beam_search_predict__') def before_real_step(self): - predict_id = memory(name='__beam_search_predict__',
size=self.size, - boot_with_const_id=self.bos_id) - - trg_emb = embedding_layer(input=predict_id, - size=self.embedding_size, - param_attr=ParamAttr( - name=self.embedding_name)) + predict_id = memory( + name='__beam_search_predict__', + size=self.size, + boot_with_const_id=self.bos_id) + + trg_emb = embedding_layer( + input=predict_id, + size=self.embedding_size, + param_attr=ParamAttr(name=self.embedding_name)) return trg_emb def __init__(self, size, embedding_name, embedding_size): @@ -2584,13 +3008,16 @@ def maxid_layer(input, name=None, layer_attr=None): """ assert isinstance(input, LayerOutput) - Layer(name=name, - type='maxid', - inputs=[input.name], - **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name=name, - layer_type=LayerType.MAXID_LAYER, - parents=[input]) + l = Layer( + name=name, + type='maxid', + inputs=[input.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + layer_type=LayerType.MAXID_LAYER, + parents=[input], + size=l.config.size) @wrap_name_default() @@ -2619,13 +3046,16 @@ def out_prod_layer(input1, input2, name=None, layer_attr=None): assert isinstance(input1, LayerOutput) assert isinstance(input2, LayerOutput) - Layer(name=name, - type="out_prod", - inputs=[input1.name, input2.name], - **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name=name, - layer_type=LayerType.OUT_PROD_LAYER, - parents=[input1, input2]) + l = Layer( + name=name, + type=LayerType.OUT_PROD_LAYER, + inputs=[input1.name, input2.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + layer_type=LayerType.OUT_PROD_LAYER, + parents=[input1, input2], + size=l.config.size) @wrap_name_default() @@ -2654,18 +3084,27 @@ def eos_layer(input, eos_id, name=None, layer_attr=None): :return: LayerOutput object. :rtype: LayerOutput """ - Layer(name=name, - type=LayerType.EOSID_LAYER, - eos_id=eos_id, - inputs=[input.name], - **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput(name=name, layer_type=LayerType.EOSID_LAYER, - parents=[input]) + l = Layer( + name=name, + type=LayerType.EOSID_LAYER, + eos_id=eos_id, + inputs=[input.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + layer_type=LayerType.EOSID_LAYER, + parents=[input], + size=l.config.size) @wrap_name_default() -def beam_search(step, input, bos_id, eos_id, beam_size, - max_length=500, name=None, +def beam_search(step, + input, + bos_id, + eos_id, + beam_size, + max_length=500, + name=None, num_results_per_sample=None): """ Beam search is a heuristic search algorithm used in sequence generation. 
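A hedged decoding sketch (hypothetical encoder output enc, decoder step function, and vocabulary size); with this change, beam_search now enters recurrent_group with is_generating=True:

.. code-block:: python

    trg = GeneratedInput(size=vocab_size, embedding_name='trg_emb',
                         embedding_size=512)
    preds = beam_search(step=decoder_step,
                        input=[StaticInput(enc), trg],
                        bos_id=0, eos_id=1, beam_size=5)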
 
 
 @wrap_name_default()
-def beam_search(step, input, bos_id, eos_id, beam_size,
-                max_length=500, name=None,
+def beam_search(step,
+                input,
+                bos_id,
+                eos_id,
+                beam_size,
+                max_length=500,
+                name=None,
                 num_results_per_sample=None):
     """
     Beam search is a heuristic search algorithm used in sequence generation.
@@ -2739,8 +3178,7 @@ def beam_search(step, input, bos_id, eos_id, beam_size,
     if num_results_per_sample > beam_size:
         logger.warning("num_results_per_sample should be less than beam_size")
 
-    if isinstance(input, StaticInput) or isinstance(input,
-                                                    BaseGeneratedInput):
+    if isinstance(input, StaticInput) or isinstance(input, BaseGeneratedInput):
         input = [input]
 
     generated_input_index = -1
@@ -2765,11 +3203,12 @@
     def __real_step__(*args):
         eos_name = "__%s_eos_layer__" % name
-        RecurrentLayerGroupSetGenerator(Generator(
-            eos_layer_name=eos_name,
-            max_num_frames=max_length,
-            beam_size=beam_size,
-            num_results_per_sample=num_results_per_sample))
+        RecurrentLayerGroupSetGenerator(
+            Generator(
+                eos_layer_name=eos_name,
+                max_num_frames=max_length,
+                beam_size=beam_size,
+                num_results_per_sample=num_results_per_sample))
 
         args = list(args)
         args.insert(generated_input_index, gipt.before_real_step())
@@ -2780,14 +3219,19 @@
 
         return predict
 
-    tmp = recurrent_group(step=__real_step__, input=real_input, reverse=False,
-                          name=name)
+    tmp = recurrent_group(
+        step=__real_step__,
+        input=real_input,
+        reverse=False,
+        name=name,
+        is_generating=True)
 
     return tmp
 
+
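A sketch of generation with the new `is_generating` wiring, adapted from the layer's documented usage; `encoder_last` and `vocab_size` are assumed to be defined elsewhere:

.. code-block:: python

   def rnn_step(input):
       last_time_step_output = memory(name='rnn', size=512)
       with mixed_layer(size=512, name='rnn') as simple_rnn:
           simple_rnn += full_matrix_projection(input)
           simple_rnn += last_time_step_output
       return simple_rnn

   beam_gen = beam_search(name="decoder",
                          step=rnn_step,
                          input=[StaticInput(encoder_last),
                                 GeneratedInput(size=vocab_size,
                                                embedding_name="trg_emb",
                                                embedding_size=512)],
                          bos_id=0,
                          eos_id=1,
                          beam_size=5)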
 
 def __cost_input__(input, label, weight=None):
     """
-    inputs and parents for cost layers. 
+    Inputs and parents for cost layers.
     """
     ipts = [Input(input.name), Input(label.name)]
     parents = [input, label]
@@ -2796,10 +3240,11 @@
         ipts.append(Input(weight.name))
         parents.append(weight)
     return ipts, parents
-    
+
 
 @wrap_name_default()
-def regression_cost(input, label, weight=None, name=None):
+@layer_support()
+def regression_cost(input, label, weight=None, name=None, layer_attr=None):
     """
     Regression Layer.
 
@@ -2814,18 +3259,27 @@
     :param weight: The weight affects the cost, namely the scale of cost.
                    It is an optional argument.
     :type weight: LayerOutput
+    :param layer_attr: The extra layer attribute.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
     ipts, parents = __cost_input__(input, label, weight)
 
-    Layer(inputs=ipts, type="square_error", name=name)
-    return LayerOutput(name, LayerType.COST, parents=parents)
+    Layer(
+        inputs=ipts,
+        type="square_error",
+        name=name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(name, LayerType.COST, parents=parents, size=1)
 
 
 @wrap_name_default("cost")
 @layer_support()
-def classification_cost(input, label, weight=None, name=None,
+def classification_cost(input,
+                        label,
+                        weight=None,
+                        name=None,
                         evaluator=classification_error_evaluator,
                         layer_attr=None):
     """
@@ -2852,8 +3306,11 @@
 
     ipts, parents = __cost_input__(input, label, weight)
 
-    Layer(name=name, type="multi-class-cross-entropy", inputs=ipts,
-          **ExtraLayerAttribute.to_kwargs(layer_attr))
+    Layer(
+        name=name,
+        type="multi-class-cross-entropy",
+        inputs=ipts,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
 
     def __add_evaluator__(e):
         assert callable(e)
@@ -2872,12 +3329,19 @@
     for each_evaluator in evaluator:
         __add_evaluator__(each_evaluator)
 
-    return LayerOutput(name, LayerType.COST, parents=parents)
+    return LayerOutput(name, LayerType.COST, parents=parents, size=1)
 
 
-def conv_operator(img, filter, filter_size, num_filters,
-                  num_channel=None, stride=1, padding=0,
-                  filter_size_y=None, stride_y=None, padding_y=None):
+def conv_operator(img,
+                  filter,
+                  filter_size,
+                  num_filters,
+                  num_channels=None,
+                  stride=1,
+                  padding=0,
+                  filter_size_y=None,
+                  stride_y=None,
+                  padding_y=None):
     """
     Different from img_conv_layer, conv_op is an Operator, which can be used
     in mixed_layer. conv_op takes two inputs to perform convolution.
@@ -2906,8 +3370,8 @@
     :type filter_size_y: int
     :param num_filters: channel of output data.
     :type num_filters: int
-    :param num_channel: channel of input data.
-    :type num_channel: int
+    :param num_channels: channel of input data.
+    :type num_channels: int
     :param stride: The x dimension of the stride.
     :type stride: int
     :param stride_y: The y dimension of the stride.
@@ -2926,29 +3390,139 @@
     if padding_y is None:
         padding_y = padding
 
-    if num_channel is None:
-        num_channel = img.num_filters
+    if num_channels is None:
+        num_channels = img.num_filters
 
     assert isinstance(filter, LayerOutput)
     if filter.size is not None:
-        filter.size = filter_size * filter_size_y * num_filters * num_channel
-
-    op = ConvOperator(input_layer_names=[img.name, filter.name],
-                      num_filters=num_filters,
-                      conv_conf=Conv(filter_size=filter_size,
-                                     padding=padding,
-                                     stride=stride,
-                                     channels=num_channel,
-                                     filter_size_y=filter_size_y,
-                                     padding_y=padding_y,
-                                     stride_y=stride_y,
-                                     groups=1))
+        filter.size = filter_size * filter_size_y * num_filters * num_channels
+
+    op = ConvOperator(
+        input_layer_names=[img.name, filter.name],
+        num_filters=num_filters,
+        conv_conf=Conv(
+            filter_size=filter_size,
+            padding=padding,
+            stride=stride,
+            channels=num_channels,
+            filter_size_y=filter_size_y,
+            padding_y=padding_y,
+            stride_y=stride_y,
+            groups=1))
 
     op.origin = [img, filter]
     return op
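A sketch of embedding the operator in a `mixed_layer`, with hypothetical input layers; for an 8-channel 32x32 image and a 3x3 kernel with stride 1 and no padding, the output map is 64 x 30 x 30:

.. code-block:: python

   img = data_layer(name='image', size=8 * 32 * 32)           # 8 channels, 32x32
   filters = data_layer(name='filters', size=3 * 3 * 8 * 64)  # kernel values
   op = conv_operator(img=img,
                      filter=filters,
                      filter_size=3,
                      num_filters=64,
                      num_channels=8)
   # (32 - 3) / 1 + 1 = 30, so the operator emits a 64 x 30 x 30 feature map
   conv = mixed_layer(input=[op], size=64 * 30 * 30)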
+    It performs convolution on the input with the given filter parameters.
+
+    Different from img_conv_layer and conv_op, conv_projection is a Projection,
+    which can be used in mixed_layer and concat_layer. It uses cudnn to
+    implement convolution and only supports GPU mode.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       proj = conv_projection(input=input1,
+                              filter_size=3,
+                              num_filters=64,
+                              num_channels=64)
+
+    :param input: input layer
+    :type input: LayerOutput
+    :param filter_size: The x dimension of a filter kernel.
+    :type filter_size: int
+    :param filter_size_y: The y dimension of a filter kernel. Since
+                          PaddlePaddle now supports rectangular filters,
+                          the filter's shape can be (filter_size, filter_size_y).
+    :type filter_size_y: int
+    :param num_filters: channel of output data.
+    :type num_filters: int
+    :param num_channels: channel of input data.
+    :type num_channels: int
+    :param stride: The x dimension of the stride.
+    :type stride: int
+    :param stride_y: The y dimension of the stride.
+    :type stride_y: int
+    :param padding: The x dimension of padding.
+    :type padding: int
+    :param padding_y: The y dimension of padding.
+    :type padding_y: int
+    :param groups: The group number.
+    :type groups: int
+    :param param_attr: Convolution param attribute. None means default attribute
+    :type param_attr: ParameterAttribute
+    :return: A ConvProjection Object.
+    :rtype: ConvProjection
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+
+    if filter_size_y is None:
+        if isinstance(filter_size, collections.Sequence):
+            assert len(filter_size) == 2
+            filter_size, filter_size_y = filter_size
+        else:
+            filter_size_y = filter_size
+
+    if stride_y is None:
+        if isinstance(stride, collections.Sequence):
+            assert len(stride) == 2
+            stride, stride_y = stride
+        else:
+            stride_y = stride
+
+    if padding_y is None:
+        if isinstance(padding, collections.Sequence):
+            assert len(padding) == 2
+            padding, padding_y = padding
+        else:
+            padding_y = padding
+
+    if param_attr.attr.get('initial_smart'):
+        # special initial for conv layers.
+        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
+        param_attr.attr["initial_mean"] = 0.0
+        param_attr.attr["initial_std"] = init_w
+        param_attr.attr["initial_strategy"] = 0
+        param_attr.attr["initial_smart"] = False
+
+    proj = ConvProjection(
+        input_layer_name=input.name,
+        num_filters=num_filters,
+        conv_conf=Conv(
+            filter_size=filter_size,
+            padding=padding,
+            stride=stride,
+            channels=num_channels,
+            filter_size_y=filter_size_y,
+            padding_y=padding_y,
+            stride_y=stride_y,
+            groups=groups),
+        **param_attr.attr)
+
+    proj.origin = input
+    return proj
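Complementing the docstring example, a sketch of dropping the projection into a `mixed_layer`; `conv_out` is an assumed 64-channel 28x28 layer, and with padding=1, stride 1 the spatial size is preserved:

.. code-block:: python

   proj = conv_projection(input=conv_out,
                          filter_size=3,
                          num_filters=64,
                          num_channels=64,
                          padding=1)
   # (28 + 2*1 - 3) / 1 + 1 = 28: spatial size is preserved
   fusion = mixed_layer(input=[proj], size=64 * 28 * 28, act=ReluActivation())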
 
 
 @wrap_name_default()
-def conv_shift_layer(a, b, name=None):
+@layer_support()
+def conv_shift_layer(a, b, name=None, layer_attr=None):
     """
     This layer performs cyclic convolution for two inputs. For example:
       - a[in]: contains M elements.
@@ -2977,6 +3551,8 @@
     :type a: LayerOutput
     :param b: input layer b
     :type b: LayerOutput
+    :param layer_attr: The extra layer attribute.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2986,10 +3562,10 @@
         name=name,
         type=LayerType.CONV_SHIFT_LAYER,
         inputs=[a.name, b.name],
-    )
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
 
-    return LayerOutput(name, LayerType.CONV_SHIFT_LAYER, parents=[a, b],
-                       size=a.size)
+    return LayerOutput(
+        name, LayerType.CONV_SHIFT_LAYER, parents=[a, b], size=a.size)
 
 
 @wrap_name_default()
@@ -2997,8 +3573,14 @@
 @wrap_bias_attr_default()
 @wrap_act_default(act=LinearActivation())
 @layer_support(ERROR_CLIPPING, DROPOUT)
-def tensor_layer(a, b, size, act=None, name=None,
-                 param_attr=None, bias_attr=None, layer_attr=None):
+def tensor_layer(a,
+                 b,
+                 size,
+                 act=None,
+                 name=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 layer_attr=None):
     """
     This layer performs tensor operation on two inputs. For example, each
     sample:
@@ -3047,23 +3629,28 @@
         type=LayerType.TENSOR_LAYER,
         active_type=act.name,
         bias=ParamAttr.to_bias(bias_attr),
-        inputs=[Input(a.name, **param_attr.attr),
-                Input(b.name)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr)
-    )
-    return LayerOutput(name, LayerType.TENSOR_LAYER, parents=[a, b],
-                       activation=act, size=size)
+        inputs=[Input(a.name, **param_attr.attr), Input(b.name)],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.TENSOR_LAYER, parents=[a, b], activation=act, size=size)
 
 
 @wrap_name_default()
 @wrap_param_attr_default()
 @wrap_bias_attr_default()
 @wrap_act_default()
-def selective_fc_layer(input, select, size, act=None, name=None,
+@layer_support()
+def selective_fc_layer(input,
+                       select,
+                       size,
+                       act=None,
+                       name=None,
                        pass_generation=False,
                        has_selected_colums=True,
                        mul_ratio=0.02,
-                       param_attr=None, bias_attr=None, layer_attr=None):
+                       param_attr=None,
+                       bias_attr=None,
+                       layer_attr=None):
     """
     Selective fully connected layer. Different from fc_layer, the output of
     this layer may be sparse. It requires an additional input to indicate
@@ -3113,8 +3700,9 @@
     if select.size is not None:
         assert select.size == size
     Layer(
-        inputs=[Input(ipt.name, **attr.attr) for ipt, attr in zip(
-            input, param_attr)] + [select.name],
+        inputs=[
+            Input(ipt.name, **attr.attr) for ipt, attr in zip(input, param_attr)
+        ] + [select.name],
         name=name,
         type=LayerType.SEL_FC_LAYER,
         size=size,
@@ -3123,15 +3711,18 @@
         selective_fc_pass_generation=pass_generation,
         has_selected_colums=has_selected_colums,
         selective_fc_full_mul_ratio=mul_ratio,
-        **ExtraLayerAttribute.to_kwargs(layer_attr)
-    )
-    return LayerOutput(name, LayerType.SEL_FC_LAYER, list(input) + [select],
-                       activation=act,
-                       size=size)
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        LayerType.SEL_FC_LAYER,
+        list(input) + [select],
+        activation=act,
+        size=size)
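A sketch of the selective variant, assuming `hidden` is a preceding layer and `candidates` is a sparse 0/1 input marking which output columns to evaluate:

.. code-block:: python

   out = selective_fc_layer(input=hidden,
                            select=candidates,   # sparse 0/1 column mask
                            size=10000,
                            act=SoftmaxActivation())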
 
 
 @wrap_name_default()
-def sampling_id_layer(input, name=None):
+@layer_support()
+def sampling_id_layer(input, name=None, layer_attr=None):
     """
     A layer for sampling an id from a multinomial distribution given by the
     input layer. It samples one id per sample.
@@ -3146,19 +3737,27 @@
     :type input: LayerOutput
     :param name: The Layer Name.
     :type name: basestring
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
-    Layer(
+    l = Layer(
         name=name,
         type=LayerType.SAMPLING_ID_LAYER,
         inputs=[Input(input.name)],
-    )
-    return LayerOutput(name, LayerType.SAMPLING_ID_LAYER, input)
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.SAMPLING_ID_LAYER, input, size=l.config.size)
 
 
 @wrap_name_default()
-def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
+@layer_support()
+def slope_intercept_layer(input,
+                          name=None,
+                          slope=1.0,
+                          intercept=0.0,
+                          layer_attr=None):
     """
     This layer applies a slope and an intercept to the input element-wise.
     There is no activation and weight.
@@ -3180,6 +3779,8 @@
     :type slope: float.
     :param intercept: the offset.
     :type intercept: float.
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3189,12 +3790,14 @@
         slope=slope,
         intercept=intercept,
         inputs=[Input(input.name)],
-    )
-    return LayerOutput(name, LayerType.SLOPE_INTERCEPT_LAYER, input)
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.SLOPE_INTERCEPT_LAYER, input, size=input.size)
 
 
 @wrap_name_default()
-def linear_comb_layer(weights, vectors, size=None, name=None):
+@layer_support()
+def linear_comb_layer(weights, vectors, size=None, name=None, layer_attr=None):
     """
     A layer for the weighted sum of vectors; it takes two inputs.
       - Input: size of weights is M
               size of vectors is M*N
      - Output: a vector of size=N
 
     .. math::
 
        z(i) = \sum_{j=0}^{M-1} x(j) y(i+Nj)
+
     where :math:`0 \le i \le N-1`
 
     Or in the matrix notation:
@@ -3235,6 +3839,8 @@
     :type size: int
     :param name: The Layer Name.
     :type name: basestring
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3242,7 +3848,7 @@
     if vectors.size is not None and weights.size is not None:
         assert vectors.size % weights.size == 0
         if size is None:
-            size = vectors.size / weights.size 
+            size = vectors.size / weights.size
         else:
             assert size == vectors.size / weights.size
     Layer(
@@ -3250,27 +3856,29 @@
         type=LayerType.LINEAR_COMBINATION_LAYER,
         size=size,
         inputs=[Input(weights.name), Input(vectors.name)],
-    )
-    return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER,
-                       [weights, vectors], size=size)
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.LINEAR_COMBINATION_LAYER, [weights, vectors], size=size)
 
 
 convex_comb_layer = linear_comb_layer
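A sketch of the size bookkeeping above: with M weights and M*N vector entries the result has size N (here M = 10, N = 64); `ctx` is an assumed input layer:

.. code-block:: python

   weights = fc_layer(input=ctx, size=10, act=SoftmaxActivation())  # M = 10
   vectors = fc_layer(input=ctx, size=10 * 64)                      # M * N
   comb = linear_comb_layer(weights=weights, vectors=vectors)       # N = 64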
 
 
 @wrap_name_default()
+@layer_support()
 def block_expand_layer(input,
-                       channel=0,
                        block_x=0,
                        block_y=0,
                        stride_x=0,
                        stride_y=0,
                        padding_x=0,
                        padding_y=0,
-                       name=None):
+                       num_channels=None,
+                       name=None,
+                       layer_attr=None):
     """
     Expand feature map to minibatch matrix.
-       - matrix width is: block_y * block_x * channel
+       - matrix width is: block_y * block_x * num_channels
        - matrix height is: outputH * outputW
 
     .. math::
@@ -3282,7 +3890,7 @@
 
     The expand method is the same as that of ExpandConvLayer, but saves the
     transposed value. After expanding, output.sequenceStartPositions will store
     the timeline. The number of time steps is outputH * outputW and the
-    dimension of each time step is block_y * block_x * channel. This layer can
+    dimension of each time step is block_y * block_x * num_channels. This layer can
     be used after a convolutional neural network, and before a recurrent neural
     network.
 
     The simple usage is:
@@ -3290,7 +3898,7 @@
     .. code-block:: python
 
        block_expand = block_expand_layer(input,
-                                         channel=128,
+                                         num_channels=128,
                                          stride_x=1,
                                          stride_y=1,
                                          block_x=1,
@@ -3298,8 +3906,8 @@
 
     :param input: The input layer.
     :type input: LayerOutput
-    :param channel: The channel number of input layer.
-    :type channel: int
+    :param num_channels: The channel number of input layer.
+    :type num_channels: int|None
     :param block_x: The width of sub block.
     :type block_x: int
     :param block_y: The width of sub block.
@@ -3314,27 +3922,110 @@
     :type padding_y: int
     :param name: The name of this layer. It need not be specified.
     :type name: None|basestring.
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
-    Layer(name=name,
-          input=Input(input.name,
-                      block_expand=BlockExpand(channels=channel,
-                                               block_x=block_x,
-                                               block_y=block_y,
-                                               stride_x=stride_x,
-                                               stride_y=stride_y,
-                                               padding_x=padding_x,
-                                               padding_y=padding_y)
-                      ),
-          type=LayerType.BLOCK_EXPAND,
-          )
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+    l = Layer(
+        name=name,
+        inputs=Input(
+            input.name,
+            block_expand=BlockExpand(
+                channels=num_channels,
+                block_x=block_x,
+                block_y=block_y,
+                stride_x=stride_x,
+                stride_y=stride_y,
+                padding_x=padding_x,
+                padding_y=padding_y)),
+        type=LayerType.BLOCK_EXPAND,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
 
-    return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input])
+    return LayerOutput(
+        name, LayerType.BLOCK_EXPAND, parents=[input], size=l.config.size)
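The time-step accounting works out as in this small sketch, assuming the usual im2col arithmetic and a hypothetical 128-channel 48x48 feature map with 3x3 blocks, stride 1 and no padding:

.. code-block:: python

   output_h = (48 + 2 * 0 - 3) / 1 + 1   # 46
   output_w = (48 + 2 * 0 - 3) / 1 + 1   # 46
   num_steps = output_h * output_w       # 2116 time steps in the sequence
   step_dim = 3 * 3 * 128                # 1152, width of each time step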
 
 
 @wrap_name_default()
-def ctc_layer(input, label, size=None, name=None, norm_by_times=False):
+@layer_support()
+def maxout_layer(input,
+                 groups,
+                 num_channels=None,
+                 size_x=None,
+                 size_y=None,
+                 name=None,
+                 layer_attr=None):
+    """
+    A layer to do max out on conv layer output.
+      - Input: output of a conv layer.
+      - Output: feature map size same as input. Channel is (input channel) / groups.
+
+    So groups should be larger than 1, and the number of channels should be
+    divisible by groups.
+
+    Please refer to the papers:
+      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+      - Multi-digit Number Recognition from Street View \
+        Imagery using Deep Convolutional Neural Networks: \
+        https://arxiv.org/pdf/1312.6082v4.pdf
+
+    The simple usage is:
+
+    .. code-block:: python
+
+       maxout = maxout_layer(input,
+                             num_channels=128,
+                             groups=4)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param num_channels: The channel number of input layer. If None will be set
+                         automatically from previous output.
+    :type num_channels: int|None
+    :param groups: The group number of input layer.
+    :type groups: int
+    :param size_x: conv output width. If None will be set
+                   automatically from previous output.
+    :type size_x: int|None
+    :param size_y: conv output height. If None will be set
+                   automatically from previous output.
+    :type size_y: int|None
+    :param name: The name of this layer. It need not be specified.
+    :type name: None|basestring.
+    :param layer_attr: Extra Layer attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert input.layer_type == LayerType.CONV_LAYER
+    assert isinstance(input.activation, LinearActivation)
+    assert groups > 1
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+    assert num_channels % groups == 0
+    l = Layer(
+        name=name,
+        inputs=Input(
+            input.name, maxout=MaxOut(
+                channels=num_channels, groups=groups)),
+        type=LayerType.MAXOUT,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.MAXOUT, parents=[input], size=l.config.size)
+
+
+@wrap_name_default()
+@layer_support()
+def ctc_layer(input,
+              label,
+              size=None,
+              name=None,
+              norm_by_times=False,
+              layer_attr=None):
     """
     Connectionist Temporal Classification (CTC) is designed for temporal
     classification tasks. That is, for sequence labeling problems where the
@@ -3371,6 +4062,8 @@
     :type name: basestring|None
     :param norm_by_times: Whether to normalize by times. False by default.
     :type norm_by_times: bool
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3386,14 +4079,21 @@
         type=LayerType.CTC_LAYER,
         size=size,
         norm_by_times=norm_by_times,
-        inputs=[input.name, label.name]
-    )
+        inputs=[input.name, label.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
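A sketch of wiring the cost, with assumed layers `rnn_out` and a label data layer; the output size is the number of target classes plus one for the CTC blank label:

.. code-block:: python

   num_classes = 9                       # hypothetical target alphabet
   output = fc_layer(input=rnn_out,
                     size=num_classes + 1,
                     act=SoftmaxActivation())
   label = data_layer(name='label', size=num_classes)
   cost = ctc_layer(input=output, label=label, size=num_classes + 1)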
 
 
 @wrap_name_default()
 @wrap_param_attr_default()
+@layer_support()
-def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
+def crf_layer(input,
+              label,
+              size=None,
+              weight=None,
+              param_attr=None,
+              name=None,
+              layer_attr=None):
     """
     A layer for calculating the cost of sequential conditional random field
     model.
@@ -3419,6 +4119,8 @@
     :type param_attr: ParameterAttribute
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3432,8 +4134,7 @@
     else:
         assert size == input.size
 
-    ipts = [Input(input.name, **param_attr.attr),
-            Input(label.name)]
+    ipts = [Input(input.name, **param_attr.attr), Input(label.name)]
 
     if weight is not None:
         ipts.append(Input(weight.name))
@@ -3442,16 +4143,25 @@
         type=LayerType.CRF_LAYER,
         size=size,
         inputs=ipts,
-    )
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
     parents = [input, label]
     if weight is not None:
         parents.append(weight)
-    return LayerOutput(name, LayerType.CRF_LAYER, parents, size=size)
+    # The size for LayerOutput means the dimension of the output.
+    # It's different from the meaning of crf layer, which is the number of
+    # classes.
+    return LayerOutput(name, LayerType.CRF_LAYER, parents, size=1)
 
 
 @wrap_name_default()
 @wrap_param_attr_default()
+@layer_support()
-def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
+def crf_decoding_layer(input,
+                       size,
+                       label=None,
+                       param_attr=None,
+                       name=None,
+                       layer_attr=None):
     """
     A layer for calculating the decoding sequence of sequential conditional
     random field model. The decoding sequence is stored in output.ids.
@@ -3469,6 +4179,8 @@
     :type param_attr: ParameterAttribute
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3485,18 +4197,28 @@
         type=LayerType.CRF_DECODING_LAYER,
         size=size,
         inputs=ipts,
-    )
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
     parents = [input]
     if label is not None:
         parents.append(label)
-    return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=size)
+    # The size for LayerOutput means the dimension of the output.
+    # It's different from the meaning of crf layer, which is the number of
+    # classes.
+    return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
+
 
 @wrap_bias_attr_default(has_bias=True)
 @wrap_name_default()
 @layer_support()
-def nce_layer(input, label, num_classes, weight=None,
-              num_neg_samples=10, neg_distribution=None,
-              name=None, bias_attr=None, layer_attr=None):
+def nce_layer(input,
+              label,
+              num_classes,
+              weight=None,
+              num_neg_samples=10,
+              neg_distribution=None,
+              name=None,
+              bias_attr=None,
+              layer_attr=None):
     """
     Noise-contrastive estimation.
     Implements the method in the following paper:
@@ -3518,9 +4240,9 @@
     :param weight: weight layer, can be None(default)
     :type weight: LayerOutput
     :param num_classes: number of classes.
-    :type num_classes: int 
+    :type num_classes: int
     :param num_neg_samples: number of negative samples. Default is 10.
-    :type num_neg_samples: int 
+    :type num_neg_samples: int
     :param neg_distribution: The distribution for generating the random negative labels.
                              A uniform distribution will be used if not provided.
                              If not None, its length must be equal to num_classes.
@@ -3541,7 +4263,7 @@
         assert isinstance(neg_distribution, collections.Sequence)
         assert len(neg_distribution) == num_classes
         assert sum(neg_distribution) == 1
-    
+
     ipts_for_layer = []
     parents = []
     for each_input in input:
@@ -3557,7 +4279,7 @@
         ipts_for_layer.append(weight.name)
         parents.append(weight)
 
-    Layer(
+    l = Layer(
         name=name,
         type=LayerType.NCE_LAYER,
         num_classes=num_classes,
@@ -3565,9 +4287,10 @@
         num_neg_samples=num_neg_samples,
         inputs=ipts_for_layer,
         bias=ParamAttr.to_bias(bias_attr),
-        **ExtraLayerAttribute.to_kwargs(layer_attr)
-    )
-    return LayerOutput(name, LayerType.NCE_LAYER, parents=parents)
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.NCE_LAYER, parents=parents, size=l.config.size)
+
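A sketch of the estimator on a word-prediction task, with assumed layers `hidden` and `next_word` and an assumed `vocab_size`:

.. code-block:: python

   cost = nce_layer(input=hidden,
                    label=next_word,       # id data layer over vocab_size classes
                    num_classes=vocab_size,
                    num_neg_samples=25)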
 
 """
 following are cost Layers.
@@ -3575,7 +4298,14 @@
 """
 
 
 @wrap_name_default()
-def rank_cost(left, right, label, weight=None, name=None, coeff=1.0):
+@layer_support()
+def rank_cost(left,
+              right,
+              label,
+              weight=None,
+              name=None,
+              coeff=1.0,
+              layer_attr=None):
     """
     A cost Layer for learning to rank using gradient descent. Details can be
     found in `papers