Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit 59a8ebc6
Authored on Aug 07, 2017 by caoying03

Merge branch 'develop' into kmax_score_layer

Parents: 98a83cd2, 50fe7abe
Showing 126 changed files with 4841 additions and 3682 deletions (+4841 -3682)
Changed files:

.pre-commit-config.yaml  +1 -1
CMakeLists.txt  +2 -2
Dockerfile  +4 -1
cmake/configure.cmake  +0 -2
cmake/cpplint.cmake  +10 -18
cmake/external/any.cmake  +1 -1
cmake/external/gflags.cmake  +8 -1
cmake/external/openblas.cmake  +7 -2
cmake/external/python.cmake  +0 -1
cmake/flags.cmake  +2 -1
cmake/generic.cmake  +13 -0
cmake/util.cmake  +8 -5
doc/design/mkldnn/README.MD  +110 -0
doc/design/mkldnn/image/overview.png  +0 -0
paddle/.set_python_path.sh  +12 -19
paddle/api/test/CMakeLists.txt  +6 -2
paddle/cuda/src/hl_batch_transpose.cu  +7 -9
paddle/cuda/src/hl_cuda_aggregate.cu  +61 -101
paddle/cuda/src/hl_cuda_cnn.cu  +275 -134
paddle/cuda/src/hl_cuda_lstm.cu  +331 -159
paddle/cuda/src/hl_cuda_matrix.cu  +147 -196
paddle/cuda/src/hl_cuda_sequence.cu  +96 -88
paddle/cuda/src/hl_cuda_sparse.cu  +475 -509
paddle/cuda/src/hl_perturbation_util.cu  +104 -45
paddle/cuda/src/hl_table_apply.cu  +35 -33
paddle/cuda/src/hl_top_k.cu  +127 -114
paddle/framework/attribute.proto  +7 -7
paddle/framework/op_desc.proto  +17 -17
paddle/framework/op_proto.proto  +72 -70
paddle/framework/operator.cc  +2 -2
paddle/framework/operator.h  +9 -5
paddle/framework/operator_test.cc  +4 -4
paddle/function/BlockExpandOpTest.cpp  +8 -8
paddle/function/BufferArgTest.cpp  +1 -1
paddle/function/ContextProjectionOpGpu.cu  +70 -56
paddle/function/CosSimOpGpu.cu  +34 -26
paddle/function/CropOpGpu.cu  +59 -25
paddle/function/CrossMapNormalOpGpu.cu  +46 -25
paddle/function/CrossMapNormalOpTest.cpp  +10 -10
paddle/function/DepthwiseConvOpGpu.cu  +253 -218
paddle/function/FunctionTest.cpp  +6 -6
paddle/function/Im2ColOpGpu.cu  +150 -106
paddle/function/MulOpGpu.cu  +1 -1
paddle/function/PadOpGpu.cu  +49 -15
paddle/function/RowConvOpGpu.cu  +87 -68
paddle/function/TensorShapeTest.cpp  +12 -12
paddle/function/TensorTypeTest.cpp  +7 -7
paddle/function/nnpack/NNPACKConvOp.cpp  +53 -47
paddle/gserver/activations/ActivationFunction.cpp  +7 -3
paddle/gserver/layers/ExpandConvLayer.cpp  +1 -2
paddle/gserver/layers/GruCompute.cu  +4 -3
paddle/gserver/layers/KmaxSeqScoreLayer.cpp  +10 -4
paddle/gserver/layers/LstmCompute.cu  +38 -17
paddle/gserver/layers/PrintLayer.cpp  +1 -1
paddle/gserver/tests/CMakeLists.txt  +0 -5
paddle/gserver/tests/test_ActivationGrad.cpp  +33 -0
paddle/math/BaseMatrix.cu  +619 -366
paddle/math/TrainingAlgorithmOp.cu  +32 -33
paddle/math/tests/test_Tensor.cu  +167 -170
paddle/math/tests/test_lazyAssign.cu  +40 -34
paddle/math/tests/test_matrixCompare.cpp  +1 -1
paddle/operators/.clang-format  +5 -0
paddle/operators/CMakeLists.txt  +2 -1
paddle/operators/add_op.cc  +5 -5
paddle/operators/add_op.h  +1 -1
paddle/operators/cross_entropy_op.cc  +18 -4
paddle/operators/cross_entropy_op.h  +34 -9
paddle/operators/fc_op.cc  +6 -8
paddle/operators/fill_zeros_like_op.cc  +3 -4
paddle/operators/fill_zeros_like_op.h  +1 -1
paddle/operators/mean_op.cc  +3 -3
paddle/operators/mean_op.h  +2 -2
paddle/operators/mul_op.cc  +12 -8
paddle/operators/mul_op.h  +1 -1
paddle/operators/net_op.h  +2 -2
paddle/operators/net_op_test.cc  +2 -2
paddle/operators/recurrent_op.cc  +56 -208
paddle/operators/recurrent_op.h  +11 -82
paddle/operators/recurrent_op_test.cc  +10 -13
paddle/operators/rnn/recurrent_op_utils.cc  +160 -0
paddle/operators/rnn/recurrent_op_utils.h  +93 -0
paddle/operators/rowwise_add_op.cc  +2 -2
paddle/operators/rowwise_add_op.h  +1 -1
paddle/operators/sgd_op.cc  +2 -2
paddle/operators/sgd_op.h  +1 -1
paddle/operators/sigmoid_op.cc  +7 -7
paddle/operators/sigmoid_op.cu  +2 -0
paddle/operators/sigmoid_op.h  +20 -1
paddle/operators/softmax_op.cc  +3 -3
paddle/operators/softmax_op.h  +2 -2
paddle/operators/type_alias.h  +4 -9
paddle/scripts/docker/build.sh  +32 -15
paddle/scripts/run_python_tests.sh  +0 -55
paddle/setup.py.in  +3 -1
paddle/trainer/tests/compare_sparse_data  +0 -0
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data  +0 -0
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist  +1 -1
paddle/trainer/tests/sample_trainer_config_compare_sparse.conf  +154 -0
paddle/trainer/tests/test_CompareSparse.cpp  +1 -1
paddle/trainer/tests/train_sparse.list  +1 -0
proto/DataConfig.proto  +27 -26
proto/DataFormat.proto  +22 -16
proto/ModelConfig.proto  +57 -57
proto/OptimizerConfig.proto  +36 -36
proto/ParameterConfig.proto  +23 -22
proto/ParameterServerConfig.proto  +10 -13
proto/ParameterService.proto  +37 -64
proto/TrainerConfig.proto  +43 -39
python/paddle/v2/framework/create_op_creation_methods.py  +15 -15
python/paddle/v2/framework/tests/CMakeLists.txt  +23 -17
python/paddle/v2/framework/tests/op_test_util.py  +14 -12
python/paddle/v2/framework/tests/test_add_two_op.py  +5 -3
python/paddle/v2/framework/tests/test_cross_entropy_op.py  +8 -4
python/paddle/v2/framework/tests/test_mean_op.py  +2 -2
python/paddle/v2/framework/tests/test_mul_op.py  +5 -3
python/paddle/v2/framework/tests/test_op_creation_methods.py  +17 -17
python/paddle/v2/framework/tests/test_protobuf.py  +2 -4
python/paddle/v2/framework/tests/test_recurrent_op.py  +48 -42
python/paddle/v2/framework/tests/test_rowwise_add_op.py  +5 -3
python/paddle/v2/framework/tests/test_sgd_op.py  +7 -4
python/paddle/v2/framework/tests/test_sigmoid_op.py  +5 -2
python/paddle/v2/framework/tests/test_softmax_op.py  +4 -2
python/paddle/v2/plot/tests/CMakeLists.txt  +1 -1
python/paddle/v2/reader/tests/CMakeLists.txt  +2 -1
python/paddle/v2/tests/CMakeLists.txt  +7 -2
python/setup.py.in  +1 -1
.pre-commit-config.yaml
@@ -24,7 +24,7 @@
         description: Format files with ClangFormat.
         entry: clang-format -i
         language: system
-        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
CMakeLists.txt
@@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
Dockerfile
@@ -27,13 +27,16 @@ RUN apt-get update && \
     git python-pip python-dev openssh-server bison \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev \
-    python-numpy python-matplotlib gcc-4.8 g++-4.8 \
+    python-matplotlib gcc-4.8 g++-4.8 \
     automake locales clang-format-3.8 swig doxygen cmake \
     liblapack-dev liblapacke-dev libboost-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     net-tools && \
     apt-get clean -y
 
+# paddle is using numpy.flip, which is introduced since 1.12.0
+RUN pip --no-cache-dir install 'numpy>=1.12.0'
+
 # Install Go and glide
 RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
     tar -C /usr/local -xzf go.tgz && \
cmake/configure.cmake
@@ -74,8 +74,6 @@ if(WITH_MKLDNN)
     set(OPENMP_FLAGS "-fopenmp")
     set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
     set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
 else()
cmake/cpplint.cmake
@@ -42,29 +42,21 @@ macro(add_style_check_target TARGET_NAME)
   if(WITH_STYLE_CHECK)
     set(SOURCES_LIST ${ARGN})
     list(REMOVE_DUPLICATES SOURCES_LIST)
-    list(SORT SOURCES_LIST)
     foreach(filename ${SOURCES_LIST})
-      set(LINT ON)
       foreach(pattern ${IGNORE_PATTERN})
         if(filename MATCHES ${pattern})
-          message(STATUS "DROP LINT ${filename}")
-          set(LINT OFF)
+          list(REMOVE_ITEM SOURCES_LIST ${filename})
         endif()
       endforeach()
-      if(LINT MATCHES ON)
-        # cpplint code style
-        get_filename_component(base_filename ${filename} NAME)
-        set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-        add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD
-                           COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                   "--filter=${STYLE_FILTER}"
-                                   "--write-success=${CUR_GEN}" ${filename}
-                           DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py
-                           WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-        add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN})
-        add_dependencies(${TARGET_NAME} ${base_filename}.cpplint)
-      endif()
     endforeach()
+
+    if(SOURCES_LIST)
+      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+                         COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                                 "--filter=${STYLE_FILTER}"
+                                 ${SOURCES_LIST}
+                         COMMENT "cpplint: Checking source code style"
+                         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    endif()
   endif()
 endmacro()
cmake/external/any.cmake
@@ -7,7 +7,7 @@ INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
 ExternalProject_Add(
     extern_lib_any
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/thelink2012/any.git"
+    GIT_REPOSITORY  "https://github.com/PaddlePaddle/any.git"
     GIT_TAG         "8fef1e93710a0edf8d7658999e284a1142c4c020"
     PREFIX          ${ANY_SOURCE_DIR}
     UPDATE_COMMAND  ""
cmake/external/gflags.cmake
@@ -28,7 +28,14 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
 ExternalProject_Add(
     extern_gflags
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    # TODO(yiwang): The annoying warnings mentioned in
+    # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by
+    # gflags. I fired a PR https://github.com/gflags/gflags/pull/230
+    # to fix it. Before it gets accepted by the gflags team, we use
+    # my personal fork, which contains above fix, temporarily. Let's
+    # change this back to the official Github repo once my PR is
+    # merged.
+    GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
cmake/external/openblas.cmake
@@ -69,8 +69,13 @@ ENDIF(NOT ${CBLAS_FOUND})
 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 
-ADD_LIBRARY(cblas STATIC IMPORTED)
-SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES})
+# FIXME(gangliao): generate cblas target to track all high performance
+# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
+SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
+FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+ADD_LIBRARY(cblas STATIC ${dummyfile})
+TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 
 IF(NOT ${CBLAS_FOUND})
   ADD_DEPENDENCIES(cblas extern_openblas)
   LIST(APPEND external_project_dependencies cblas)
cmake/external/python.cmake
@@ -24,7 +24,6 @@ IF(WITH_PYTHON)
 ENDIF(WITH_PYTHON)
 
 SET(py_env "")
-SET(USE_VIRTUALENV_FOR_TEST 1)
 IF(PYTHONINTERP_FOUND)
   find_python_module(pip REQUIRED)
   find_python_module(numpy REQUIRED)
cmake/flags.cmake
@@ -115,7 +115,7 @@ set(COMMON_FLAGS
     -Wno-error=literal-suffix
     -Wno-error=sign-compare
     -Wno-error=unused-local-typedefs
-    -Wno-error=parentheses-equality # Warnings in Pybind11
+    -Wno-error=parentheses-equality # Warnings in pybind11
 )
 
 set(GPU_COMMON_FLAGS
@@ -195,6 +195,7 @@ endif()
 # Modern gpu architectures: Pascal
 if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
   list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
+  list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
 endif()
 
 # Custom gpu architecture
cmake/generic.cmake
@@ -403,3 +403,16 @@ function(py_proto_compile TARGET_NAME)
   protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
   add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
 endfunction()
+
+function(py_test TARGET_NAME)
+  if(WITH_TESTING)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
+             python2 ${py_test_SRCS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  endif()
+endfunction()
cmake/util.cmake
@@ -149,9 +149,12 @@ endfunction()
 # Create a python unittest using run_python_tests.sh,
 # which takes care of making correct running environment
 function(add_python_test TEST_NAME)
-  add_test(NAME ${TEST_NAME}
-    COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR}
-      bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
-        ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  foreach(arg ${ARGN})
+    get_filename_component(py_fn ${arg} NAME_WE)
+    set(TRG_NAME ${TEST_NAME}_${py_fn})
+    add_test(NAME ${TRG_NAME}
+      COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
+        python2 ${arg}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  endforeach()
 endfunction()
doc/design/mkldnn/README.MD (new file, mode 100644)

# Intel® MKL-DNN on PaddlePaddle: Design Doc

We plan to integrate the Intel deep-neural-network math library (**MKL-DNN** \[[1](#references)\]) into PaddlePaddle, to make full use of the strengths of Intel platforms and to effectively improve PaddlePaddle's performance on Intel architectures.

Our short-term goals are:
- Implement the commonly used layers with MKL-DNN.
- Provide MKL-DNN implementations of the common deep networks VGG, GoogLeNet and ResNet.

## Contents

- [Overview](#overview)
- [Actions](#actions)
  - [CMake](#cmake)
  - [Layers](#layers)
  - [Activations](#activations)
  - [Unit Tests](#unit-tests)
  - [Protobuf Messages](#protobuf-messages)
  - [Python API](#python-api)
  - [Demos](#demos)
  - [Benchmarking](#benchmarking)
  - [Others](#others)
- [Design Concerns](#design-concerns)

## Overview

MKL-DNN will be integrated into PaddlePaddle as a third-party library; the overall architecture is shown in Figure 1.

<div align="center">
<img src="image/overview.png" width=350><br/>
Figure 1. PaddlePaddle on IA.
</div>

## Actions

The integration plan is roughly divided into the following parts.

### CMake

We will add a `WITH_MKLDNN` option to `CMakeLists.txt`; setting it to `ON` enables the MKL-DNN build and automatically turns on OpenMP to improve MKL-DNN performance.

We will also introduce a `WITH_MKLML` option for choosing whether to use the MKLML package shipped with MKL-DNN. The package can be used independently of MKL-DNN, but we recommend enabling MKLML together with MKL-DNN to get the best performance.

Accordingly, `mkldnn.cmake` and `mklml.cmake` will be added under `cmake/external`; they download the corresponding packages while PaddlePaddle is being built and place them in PaddlePaddle's third-party directory.

**Note**: when `WITH_MKLML=ON`, this package is preferred as PaddlePaddle's CBLAS and LAPACK library, so the logic in `cmake/cblas.cmake` will be adjusted slightly.

### Layers

All MKL-DNN-related C++ layers will be placed in `paddle/gserver/layers`, following PaddlePaddle's directory layout, and their file names will all start with *Mkldnn*.

All MKL-DNN layers will inherit from a parent class called `MkldnnLayer`, which in turn inherits from PaddlePaddle's base class `Layer` (see the sketch below).
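
A minimal C++ sketch of the class relationship described above, assuming simplified stand-ins for the real `Layer`/`LayerConfig` interfaces; only the `MkldnnLayer` name and the idea of overriding `init` come from this document:

```cpp
// Illustrative only: hypothetical, simplified types standing in for the real
// PaddlePaddle gserver classes.
#include <string>

struct LayerConfig { std::string name; };      // stand-in for the real config type
constexpr int kMkldnnDeviceId = -2;            // "-2" is reserved for MKL-DNN layers

class Layer {                                  // stand-in for paddle::Layer
public:
  virtual ~Layer() = default;
  virtual bool init(const LayerConfig& config) {
    deviceId_ = 0;                             // ordinary CPU device by default
    return true;
  }
protected:
  int deviceId_ = 0;
};

// Proposed common parent of every Mkldnn* layer.
class MkldnnLayer : public Layer {
public:
  bool init(const LayerConfig& config) override {
    Layer::init(config);
    deviceId_ = kMkldnnDeviceId;               // mark this layer as an MKL-DNN layer
    return true;
  }
};
```

Concrete layers (for example a hypothetical `MkldnnFcLayer`) would then derive from `MkldnnLayer` and implement their forward/backward passes with MKL-DNN primitives.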
### Activations

Since activation functions are independent of the layer concept in PaddlePaddle, a `MkldnnActivation.h` file will be added under `paddle/gserver/activations` to define the interfaces needed by MKL-DNN; the implementations will stay in `ActivationFunction.cpp`.

### Unit Tests

`test_Mkldnn.cpp` and `MkldnnTester.*` will be added under `paddle/gserver/test` for MKL-DNN testing.

For activations, we plan to add new test types directly to PaddlePaddle's existing test files.

### Protobuf Messages

Depending on the needs of specific layers, necessary options may be added to `proto/ModelConfig.proto`.

### Python API

Only the **v1 API** is considered for now.

We plan to add a `use_mkldnn` option to `python/paddle/trainer/config_parser.py` so that users can conveniently select the MKL-DNN layers. A possible implementation:

```python
use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
if use_mkldnn:
    self.layer_type = mkldnn_*
```

All MKL-DNN layer types will be prefixed with *mkldnn_* to tell them apart.

Necessary MKL-DNN interfaces may also be added to `activations.py` and `layers.py` under `python/paddle/trainer_config_helper`.

### Demos

An `mkldnn` folder will be added under `v1_api_demo`, containing demo scripts for MKL-DNN testing.

### Benchmarking

We will consider adding logic to `benchmark/paddle/image/run.sh` to benchmark with MKL-DNN enabled.

### Others

1. When MKL-DNN is used, CPU buffers will be aligned to 64 bytes.
2. Look deeper into PaddlePaddle for further optimization opportunities, for example using OpenMP to speed up SGD parameter updates.

## Design Concerns

To conform to PaddlePaddle's coding style \[[2](#references)\] while sacrificing as little MKL-DNN performance as possible \[[3](#references)\], we summarize the points that need special attention:

1. Use **deviceId_**. To add as few variables or functions to the parent class `Layer` as possible, we reuse the existing `deviceId_` member to mark a layer's property, defining `-2` as the device ID specific to `MkldnnLayer`.
2. Override the parent `Layer`'s **init** function and set `deviceId_` to `-2`, meaning the layer runs in the MKL-DNN environment.
3. Create `MkldnnMatrix` to manage the memory functions, interfaces and format information that MKL-DNN uses.
4. Create `MkldnnBase` to define classes and functions beyond layers and memory, including the `MkldnnStream` and `CpuEngine` that MKL-DNN needs, and possibly `FPGAEngine` in the future.
5. Add two `MkldnnMatrixPtr` members to **Argument**, named `mkldnnValue` and `mkldnnGrad`, to hold the memory buffers used by `MkldnnLayer`, and add a cvt function (to be renamed to something more suitable) that converts memory between the "CPU device" and the "MKL-DNN device".
6. Add a piece of logic to the parent `Layer`'s `getOutput` function that checks `deviceId` and, when the device differs between MKL-DNN and CPU, does an up-front conversion, i.e. calls `Argument`'s cvt function to unify the output on the required device (a sketch follows this list).
7. Add a `use_mkldnn` flag to the existing `FLAGS` for choosing whether to enable the MKL-DNN functionality.
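
A hedged sketch of points 1, 2, 5 and 6 above. The names `Argument`, `MkldnnMatrixPtr` and cvt come from this document; every type and signature below is an assumption made for illustration, not the actual PaddlePaddle code:

```cpp
// Illustrative only: simplified stand-ins for the classes discussed above.
#include <memory>

struct MkldnnMatrix {};                            // placeholder for the proposed class
using MkldnnMatrixPtr = std::shared_ptr<MkldnnMatrix>;

constexpr int kMkldnnDeviceId = -2;                // device id reserved for MkldnnLayer

struct Argument {
  MkldnnMatrixPtr mkldnnValue;                     // proposed extra buffers (point 5)
  MkldnnMatrixPtr mkldnnGrad;
  // Convert this argument's memory to the requested device; a real
  // implementation would reorder between CPU and MKL-DNN formats.
  void cvt(int targetDeviceId) { (void)targetDeviceId; }
};

// Point 6: before handing the output to a consumer on a different "device",
// convert it up front via Argument::cvt.
Argument& getOutput(Argument& output, int producerDeviceId, int consumerDeviceId) {
  if (producerDeviceId != consumerDeviceId) {
    output.cvt(consumerDeviceId);
  }
  return output;
}
```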
## References

1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
2. [The earlier proposal](https://github.com/PaddlePaddle/Paddle/pull/3096) introduced **nextLayer** information; however, in PaddlePaddle neither the pre-refactor layers nor the post-refactor ops are supposed to know anything about the next layer/op.
3. MKL-DNN's high-performance memory format differs from PaddlePaddle's native `NCHW` layout (the cuDNN part of PaddlePaddle also uses `NCHW`, so it does not have this problem). A conversion method is therefore needed, and the format should be converted only when necessary to get the best performance out of MKL-DNN.
doc/design/mkldnn/image/overview.png (new file, mode 100644, 9.7 KB)
paddle/.set_python_path.sh
@@ -21,22 +21,15 @@
 #
 # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
 #
 PYPATH=""
-if ! python -c "import paddle" > /dev/null 2>/dev/null; then
-  set -x
-  PYPATH=""
-  while getopts "d:" opt; do
-    case $opt in
-      d)
-        PYPATH=$OPTARG
-        ;;
-    esac
-  done
-  shift $(($OPTIND - 1))
-  export PYTHONPATH=$PYPATH:$PYTHONPATH
-  $@
-else
-  echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
-  echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
-  exit 1
-fi
+set -x
+while getopts "d:" opt; do
+  case $opt in
+    d)
+      PYPATH=$OPTARG
+      ;;
+  esac
+done
+shift $(($OPTIND - 1))
+export PYTHONPATH=$PYPATH:$PYTHONPATH
+$@
paddle/api/test/CMakeLists.txt
-add_python_test(test_swig_api
-  testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py)
+py_test(testTrain SRCS testTrain.py)
+py_test(testMatrix SRCS testMatrix.py)
+py_test(testVector SRCS testVector.py)
+py_test(testTrainer SRCS testTrainer.py)
+py_test(testArguments SRCS testArguments.py)
+py_test(testGradientMachine SRCS testGradientMachine.py)
paddle/cuda/src/hl_batch_transpose.cu
@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_batch_transpose.h"
 #include "hl_base.h"
+#include "hl_batch_transpose.h"
 
 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;
 
 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
-                                              const real* idata,
-                                              int numSamples, int width,
-                                              int height) {
+__global__ void batchTransposeNoBankConflicts(
+    real* odata, const real* idata, int numSamples, int width, int height) {
   __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
   const int x = blockIdx.x * TILE_DIM + threadIdx.x;
@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
             newX] = tile[threadIdx.x][j];
 }
 
-void batchTranspose(const real* input, real* output, int width, int height,
-                    int batchSize) {
+void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize) {
   dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
   dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-      (output, input, batchSize, width, height);
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+      output, input, batchSize, width, height);
   CHECK_SYNC("batchTranspose failed!");
 }
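
For context on the launch geometry kept by the reformatted `batchTranspose` above: the matrix is tiled into `TILE_DIM x TILE_DIM` blocks handled by `TILE_DIM x BLOCK_ROWS` threads. Below is a small host-side C++ sketch of the same rounding-up arithmetic; the `divup` helper and the example sizes are assumptions for illustration, only the two constants come from the file above.

```cpp
#include <cstdio>

// Same rounding-up division the DIVUP macro in the launch above performs.
static int divup(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int TILE_DIM = 64, BLOCK_ROWS = 16;       // constants from hl_batch_transpose.cu
  int width = 1000, height = 300, batchSize = 8;  // example matrix batch (made up)

  // One block per 64x64 tile per sample; each block runs 64x16 threads,
  // so every thread handles TILE_DIM / BLOCK_ROWS = 4 elements of its tile.
  int gridX = divup(width, TILE_DIM);   // 16
  int gridY = divup(height, TILE_DIM);  // 5
  std::printf("grid = (%d, %d, %d), block = (%d, %d, 1)\n",
              gridX, gridY, batchSize, TILE_DIM, BLOCK_ROWS);
  return 0;
}
```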
paddle/cuda/src/hl_cuda_aggregate.cu
@@ -12,27 +12,23 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "hl_aggregate.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_aggregate.h"
-#include "hl_thread.ph"
 #include "hl_matrix_base.cuh"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
 
 /**
  * @brief matrix row operator.
  */
 template <class Agg, int blockSize>
 __global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
   __shared__ real sum_s[blockSize];
   int cnt = (dimN + blockSize - 1) / blockSize;
   int rowId = blockIdx.x + blockIdx.y * gridDim.x;
   int index = rowId * dimN;
   int tid = threadIdx.x;
   int lmt = tid;
@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
   sum_s[tid] = tmp;
   __syncthreads();
   for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
     }
@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
 }
 
 template <class Agg>
 void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   int blocksX = dimM;
   int blocksY = 1;
   dim3 threads(128, 1);
   dim3 grid(blocksX, blocksY);
   KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
       agg, A_d, C_d, dimN);
 }
 
 void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_sum failed");
 }
@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_max failed");
 }
@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_min failed");
 }
 
 /**
  * @brief matrix column operator.
  */
 template <class Agg>
 __global__ void KeMatrixColumnOp(
     Agg agg, real *E, real *Sum, int dimM, int dimN) {
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   real tmp = agg.init();
   if (rowIdx < dimN) {
@@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg,
   }
 }
 
 template <class Agg, int blockDimX, int blockDimY>
 __global__ void KeMatrixColumnOp_S(
     Agg agg, real *E, real *Sum, int dimM, int dimN) {
   __shared__ real _sum[blockDimX * blockDimY];
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   int index = threadIdx.y;
   real tmp = agg.init();
   if (rowIdx < dimN) {
@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
       index += blockDimY;
     }
   }
   _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
   __syncthreads();
   if (rowIdx < dimN) {
     if (threadIdx.y == 0) {
       real tmp = agg.init();
       for (int i = 0; i < blockDimY; i++) {
         tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
       }
       Sum[rowIdx] = tmp;
     }
@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
 }
 
 template <class Agg>
 void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   if (dimN >= 8192) {
     int blocksX = (dimN + 128 - 1) / 128;
     int blocksY = 1;
     dim3 threads(128, 1);
     dim3 grid(blocksX, blocksY);
     KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
         agg, A_d, C_d, dimM, dimN);
   } else {
     int blocksX = (dimN + 32 - 1) / 32;
     int blocksY = 1;
     dim3 threads(32, 32);
     dim3 grid(blocksX, blocksY);
     KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
         agg, A_d, C_d, dimM, dimN);
   }
   return;
@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_sum failed");
 }
@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_max failed");
 }
@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
   hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_column_min failed");
 }
@@ -226,16 +184,16 @@ template <int blockSize>
 __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
   int index = blockIdx.y * blockDim.x + threadIdx.x;
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += E[index];
     index += blockDim.x * gridDim.y;
   }
   __syncthreads();
   for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
   dim3 threads(blockSize, 1);
   dim3 grid(blocksX, blocksY);
 
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
 
   KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
       A_d, t_resource.gpu_mem, dimM);
   KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
       t_resource.gpu_mem, t_resource.cpu_mem, 128);
   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
 
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
   CHECK_EQ(cudaSuccess, err) << "CUDA error: "
                              << hl_get_device_error_string((size_t)err);
 }
 
 template <int blockSize>
 __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
   int index = blockIdx.y * blockDim.x + threadIdx.x;
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += abs(E[index]);
     index += blockDim.x * gridDim.y;
   }
   __syncthreads();
   for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
   dim3 threads(blockSize, 1);
   dim3 grid(blocksX, blocksY);
 
   struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
 
   KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
       A_d, t_resource.gpu_mem, dimM);
   KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
       t_resource.gpu_mem, t_resource.cpu_mem, 128);
   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
 
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
   CHECK_EQ(cudaSuccess, err) << "CUDA error: "
                              << hl_get_device_error_string((size_t)err);
 }
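
The kernels in the file above (KeMatrixRowOp, KeVectorSum, KeVectorAbsSum) all finish with the same shared-memory tree reduction over `sum_s`. Below is a CPU-side C++ analogue of that loop, assuming a power-of-two block size as the kernels do; the data in the example is made up:

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Mirrors: for (stride = blockSize / 2; stride > 0; stride /= 2)
//            if (tid < stride) sum_s[tid] += sum_s[tid + stride];
float blockReduceSum(std::vector<float> sum_s) {
  assert(!sum_s.empty() && (sum_s.size() & (sum_s.size() - 1)) == 0);  // power-of-two size
  for (size_t stride = sum_s.size() / 2; stride > 0; stride /= 2) {
    for (size_t tid = 0; tid < stride; ++tid) {
      sum_s[tid] += sum_s[tid + stride];
    }
  }
  return sum_s[0];
}

int main() {
  std::vector<float> partials(128, 1.0f);  // e.g. 128 threads each contributed 1.0
  std::printf("block sum = %.1f\n", blockReduceSum(partials));  // prints 128.0
  return 0;
}
```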
paddle/cuda/src/hl_cuda_cnn.cu
@@ -12,21 +12,27 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <float.h>
 #include "hl_base.h"
 #include "hl_cnn.h"
 #include "hl_device_functions.cuh"
 
 __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
                                  const int channels, const int height,
                                  const int width, const int pooledH,
                                  const int pooledW, const int ksizeW,
                                  const int ksizeH, const int strideH,
                                  const int strideW, const int offsetH,
                                  const int offsetW, real* tgtData,
                                  const int tgtStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
     int ph = (index / pooledW) % pooledH;
@@ -46,44 +52,70 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
         maxval = inputData[h * width + w];
       }
     }
     int tgtIndex =
         index % (pooledW * pooledH * channels) + frameNum * tgtStride;
     tgtData[tgtIndex] = maxval;
   }
 }
 
 void hl_maxpool_forward(const int frameCnt, const real* inputData,
                         const int channels, const int height, const int width,
                         const int pooledH, const int pooledW,
                         const int sizeX, const int sizeY,
                         const int strideH, const int strideW,
                         const int paddingH, const int paddingW,
                         real* tgtData, const int tgtStride) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
   dim3 grid(blocks, 1);
   KeMaxPoolForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
       num_kernels, inputData, channels, height, width, pooledH, pooledW,
       sizeX, sizeY, strideH, strideW, paddingH, paddingW, tgtData, tgtStride);
   CHECK_SYNC("hl_maxpool_forward failed");
 }
 
 __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
                                   const real* outData, const real* outGrad,
                                   const int channels, const int height,
                                   const int width, const int pooledH,
                                   const int pooledW, const int sizeX,
                                   const int sizeY, const int strideH,
                                   const int strideW, const int padH,
                                   const int padW, real scaleA, real scaleB,
                                   real* targetGrad, const int outStride) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     // find out the local index
     // find out the local offset
@@ -107,43 +139,69 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
       }
     }
     targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient;
   }
 }
 
 void hl_maxpool_backward(const int frameCnt, const real* inputData,
                          const real* outData, const real* outGrad,
                          const int channels, const int height, const int width,
                          const int pooledH, const int pooledW,
                          const int sizeX, const int sizeY,
                          const int strideH, const int strideW,
                          const int paddingH, const int paddingW,
                          real scaleA, real scaleB,
                          real* targetGrad, const int outStride) {
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   KeMaxPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
       num_kernels, inputData, outData, outGrad, channels, height, width,
       pooledH, pooledW, sizeX, sizeY, strideH, strideW, paddingH, paddingW,
       scaleA, scaleB, targetGrad, outStride);
   CHECK_SYNC("hl_maxpool_backward");
 }
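
The pooling kernels above assign one thread per output element and decode the output coordinates from the flat thread index; the hunks show `pw` and `ph` being recovered with modulo arithmetic. Below is a host-side C++ sketch of that decode, with made-up sizes; the channel/frame split shown here is the usual continuation of the same pattern and is an assumption, since the hunk does not show those lines:

```cpp
#include <cstdio>

int main() {
  // Example sizes (made up): 2 frames, 3 channels, 8x8 pooled output.
  int frameCnt = 2, channels = 3, pooledH = 8, pooledW = 8;
  int num_kernels = pooledH * pooledW * channels * frameCnt;  // one thread per output

  int index = 300;  // an example flat thread index (must be < num_kernels)
  if (index < num_kernels) {
    int pw = index % pooledW;                        // as in KeMaxPoolForward above
    int ph = (index / pooledW) % pooledH;            // as in KeMaxPoolForward above
    int c = (index / pooledW / pooledH) % channels;  // assumed continuation
    int frameNum = index / pooledW / pooledH / channels;
    std::printf("index %d -> frame %d, channel %d, (ph, pw) = (%d, %d)\n",
                index, frameNum, c, ph, pw);
  }
  return 0;
}
```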
__global__
void
KeAvgPoolForward
(
const
int
nthreads
,
const
real
*
inputData
,
__global__
void
KeAvgPoolForward
(
const
int
nthreads
,
const
real
*
inputData
,
const
int
channels
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
height
,
const
int
pooledH
,
const
int
pooledW
,
const
int
width
,
const
int
sizeX
,
const
int
sizeY
,
const
int
pooledH
,
const
int
strideH
,
const
int
strideW
,
const
int
pooledW
,
const
int
padH
,
const
int
padW
,
const
int
sizeX
,
real
*
tgtData
,
const
int
tgtStride
)
{
const
int
sizeY
,
const
int
strideH
,
const
int
strideW
,
const
int
padH
,
const
int
padW
,
real
*
tgtData
,
const
int
tgtStride
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
nthreads
)
{
if
(
index
<
nthreads
)
{
int
pw
=
index
%
pooledW
;
int
pw
=
index
%
pooledW
;
...
@@ -168,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
...
@@ -168,39 +226,64 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
aveval
+=
inputData
[
h
*
width
+
w
];
aveval
+=
inputData
[
h
*
width
+
w
];
}
}
}
}
int
tgtIndex
=
index
%
(
pooledW
*
pooledH
*
channels
)
+
int
tgtIndex
=
frameNum
*
tgtStride
;
index
%
(
pooledW
*
pooledH
*
channels
)
+
frameNum
*
tgtStride
;
tgtData
[
tgtIndex
]
=
aveval
/
pool_size
;
tgtData
[
tgtIndex
]
=
aveval
/
pool_size
;
}
}
}
}
void
hl_avgpool_forward
(
const
int
frameCnt
,
const
real
*
inputData
,
void
hl_avgpool_forward
(
const
int
frameCnt
,
const
real
*
inputData
,
const
int
channels
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
height
,
const
int
pooledH
,
const
int
pooledW
,
const
int
width
,
const
int
sizeX
,
const
int
sizeY
,
const
int
pooledH
,
const
int
strideH
,
const
int
strideW
,
const
int
pooledW
,
const
int
paddingH
,
const
int
paddingW
,
const
int
sizeX
,
real
*
tgtData
,
const
int
tgtStride
)
{
const
int
sizeY
,
const
int
strideH
,
const
int
strideW
,
const
int
paddingH
,
const
int
paddingW
,
real
*
tgtData
,
const
int
tgtStride
)
{
int
num_kernels
=
pooledH
*
pooledW
*
channels
*
frameCnt
;
int
num_kernels
=
pooledH
*
pooledW
*
channels
*
frameCnt
;
int
blocks
=
(
num_kernels
+
1024
-
1
)
/
1024
;
int
blocks
=
(
num_kernels
+
1024
-
1
)
/
1024
;
KeAvgPoolForward
<<<
blocks
,
1024
,
0
,
STREAM_DEFAULT
>>>
KeAvgPoolForward
<<<
blocks
,
1024
,
0
,
STREAM_DEFAULT
>>>
(
num_kernels
,
(
num_kernels
,
inputData
,
channels
,
inputData
,
height
,
width
,
pooledH
,
pooledW
,
channels
,
sizeX
,
sizeY
,
strideH
,
strideW
,
height
,
paddingH
,
paddingW
,
tgtData
,
tgtStride
);
width
,
pooledH
,
pooledW
,
sizeX
,
sizeY
,
strideH
,
strideW
,
paddingH
,
paddingW
,
tgtData
,
tgtStride
);
CHECK_SYNC
(
"hl_avgpool_forward failed"
);
CHECK_SYNC
(
"hl_avgpool_forward failed"
);
}
}
__global__
void
KeAvgPoolBackward
(
const
int
nthreads
,
const
real
*
outGrad
,
__global__
void
KeAvgPoolBackward
(
const
int
nthreads
,
const
int
channels
,
const
int
height
,
const
real
*
outGrad
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
width
,
const
int
pooledH
,
const
int
pooledW
,
const
int
pooledH
,
const
int
sizeX
,
const
int
sizeY
,
const
int
pooledW
,
const
int
strideH
,
const
int
strideW
,
const
int
sizeX
,
const
int
padH
,
const
int
padW
,
const
int
sizeY
,
real
scaleA
,
real
scaleB
,
const
int
strideH
,
real
*
tgtGrad
,
const
int
outStride
)
{
const
int
strideW
,
const
int
padH
,
const
int
padW
,
real
scaleA
,
real
scaleB
,
real
*
tgtGrad
,
const
int
outStride
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
nthreads
)
{
if
(
index
<
nthreads
)
{
int
offsetW
=
index
%
width
+
padW
;
int
offsetW
=
index
%
width
+
padW
;
...
@@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
...
@@ -215,7 +298,6 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
real
gradient
=
0
;
real
gradient
=
0
;
outGrad
+=
(
frameNum
*
outStride
+
offsetC
*
pooledH
*
pooledW
);
outGrad
+=
(
frameNum
*
outStride
+
offsetC
*
pooledH
*
pooledW
);
for
(
int
ph
=
phstart
;
ph
<
phend
;
++
ph
)
{
for
(
int
ph
=
phstart
;
ph
<
phend
;
++
ph
)
{
for
(
int
pw
=
pwstart
;
pw
<
pwend
;
++
pw
)
{
for
(
int
pw
=
pwstart
;
pw
<
pwend
;
++
pw
)
{
// figure out the pooling size
// figure out the pooling size
...
@@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
...
@@ -224,32 +306,50 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
int
hend
=
min
(
hstart
+
sizeY
,
height
+
padH
);
int
hend
=
min
(
hstart
+
sizeY
,
height
+
padH
);
int
wend
=
min
(
wstart
+
sizeX
,
width
+
padW
);
int
wend
=
min
(
wstart
+
sizeX
,
width
+
padW
);
int
poolsize
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
int
poolsize
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
gradient
+=
outGrad
[
ph
*
pooledW
+
pw
]
/
poolsize
;
gradient
+=
outGrad
[
ph
*
pooledW
+
pw
]
/
poolsize
;
}
}
}
}
tgtGrad
[
index
]
=
scaleB
*
tgtGrad
[
index
]
+
scaleA
*
gradient
;
tgtGrad
[
index
]
=
scaleB
*
tgtGrad
[
index
]
+
scaleA
*
gradient
;
}
}
}
}
void hl_avgpool_backward(const int frameCnt,
                         const real* outGrad,
                         const int channels,
                         const int height,
                         const int width,
                         const int pooledH,
                         const int pooledW,
                         const int sizeX,
                         const int sizeY,
                         const int strideH,
                         const int strideW,
                         const int paddingH,
                         const int paddingW,
                         real scaleA,
                         real scaleB,
                         real* backGrad,
                         const int outStride) {
  int num_kernels = height * width * channels * frameCnt;
  int blocks = (num_kernels + 1024 - 1) / 1024;
  KeAvgPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      num_kernels, outGrad, channels, height, width, pooledH, pooledW,
      sizeX, sizeY, strideH, strideW, paddingH, paddingW, scaleA, scaleB,
      backGrad, outStride);
  CHECK_SYNC("hl_avgpool_backward failed");
}
...
@@ -266,7 +366,7 @@ __global__ void KeBilinearInterpFw(const real* in,
                                   const size_t numChannels,
                                   const real ratioH,
                                   const real ratioW) {
  int nthreads = outputH * outputW;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < nthreads) {
    int outIdH = tid / outputW;
...
@@ -287,13 +387,14 @@ __global__ void KeBilinearInterpFw(const real* in,
    real w1lambda = ratioW * outImgIdx - inImgIdx;
    real w2lambda = 1.f - w1lambda;

    const real* inPos = &in[outIdH * inputW + channelId * inImgSize +
                            inImgIdy * inImgW + inImgIdx];

    // bilinear interpolation
    out[outIdH * outputW + outIdW] =
        h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) +
        h1lambda * (w2lambda * inPos[hId * inImgW] +
                    w1lambda * inPos[hId * inImgW + wId]);
  }
}
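To make the weight algebra above easier to check, here is a minimal CPU reference for one output pixel under the same definitions; the function and its parameters are illustrative, not part of this file:

// Reference sketch: blend the 2x2 neighbourhood with bilinear weights.
inline real bilinear_ref(const real* in, int inImgW, int y0, int x0,
                         real h1lambda, real w1lambda, int hId, int wId) {
  real h2lambda = 1.f - h1lambda;
  real w2lambda = 1.f - w1lambda;
  const real* p = &in[y0 * inImgW + x0];
  return h2lambda * (w2lambda * p[0] + w1lambda * p[wId]) +
         h1lambda * (w2lambda * p[hId * inImgW] + w1lambda * p[hId * inImgW + wId]);
}
// Note the four coefficients sum to (h1lambda + h2lambda) * (w1lambda + w2lambda) = 1.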
...
@@ -313,9 +414,19 @@ void hl_bilinear_forward(const real* inData,
  int threadNum = outputH * outputW;
  int blocks = (threadNum + 1024 - 1) / 1024;

  KeBilinearInterpFw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      inData, inImgH, inImgW, inputH, inputW, outData, outImgH, outImgW,
      outputH, outputW, numChannels, ratioH, ratioW);
  CHECK_SYNC("hl_bilinear_forward failed");
}
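A host-side sketch of how the scaling ratios might be prepared before calling this routine; the formulas below are an assumption about the caller, not code from this commit:

// Sketch: map each output coordinate back into the input image.
real ratioH = (outImgH > 1) ? static_cast<real>(inImgH - 1) / (outImgH - 1) : 0.f;
real ratioW = (outImgW > 1) ? static_cast<real>(inImgW - 1) / (outImgW - 1) : 0.f;
hl_bilinear_forward(inData, inImgH, inImgW, inputH, inputW, outData,
                    outImgH, outImgW, outputH, outputW, numChannels,
                    ratioH, ratioW);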
...
@@ -353,13 +464,15 @@ __global__ void KeBilinearInterpBw(real* in,
    real w1lambda = ratioW * outImgIdx - inImgIdx;
    real w2lambda = 1.f - w1lambda;

    real* inPos = &in[outIdH * inputW + channelId * inImgSize +
                      inImgIdy * inImgW + inImgIdx];
    const real* outPos = &out[outIdH * outputW + outIdW];
    paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
    paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
    paddle::paddleAtomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]);
    paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId],
                            h1lambda * w1lambda * outPos[0]);
  }
}
...
@@ -379,22 +492,37 @@ void hl_bilinear_backward(real* inGrad,
  int threadNum = outputH * outputW;
  int blocks = (threadNum + 1024 - 1) / 1024;

  KeBilinearInterpBw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, outImgW,
      outputH, outputW, numChannels, ratioH, ratioW);
  CHECK_SYNC("hl_bilinear_backward failed");
}
__global__ void maxoutFpCompute(size_t nthreads,
                                const real* inData,
                                real* outData,
                                int* idData,
                                size_t size,
                                size_t featLen,
                                size_t groups) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    size_t batch_idx = index / size;
    size_t i = index % size;
    size_t channel_idx = i / featLen;
    size_t feat_idx = i % featLen;
    size_t data_idx =
        (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
    real max = inData[data_idx];
    int maxId = 0;
    for (size_t g = 1; g < groups; ++g) {
...
@@ -409,37 +537,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData,
  }
}
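The indexing above treats one sample's input as channels blocks, each holding groups contiguous feature maps of length featLen, and keeps only the per-position maximum of each group. A minimal CPU reference for a single output element, under that layout assumption, might look like this (illustrative only):

// Reference sketch for one maxout output element within one sample.
real maxout_ref(const real* in, int* winnerId, size_t channel_idx,
                size_t feat_idx, size_t featLen, size_t groups) {
  size_t base = channel_idx * groups * featLen + feat_idx;
  real best = in[base];
  int bestId = 0;
  for (size_t g = 1; g < groups; ++g) {
    real v = in[base + g * featLen];
    if (v > best) { best = v; bestId = static_cast<int>(g); }
  }
  *winnerId = bestId;  // recorded so the backward pass can route the gradient
  return best;
}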
void hl_maxout_forward(const real* inData,
                       real* outData,
                       int* idData,
                       size_t batchSize,
                       size_t size,
                       size_t featLen,
                       size_t groups) {
  int num_kernels = size * batchSize;
  int blocks = (num_kernels + 1024 - 1) / 1024;
  maxoutFpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      num_kernels, inData, outData, idData, size, featLen, groups);
  CHECK_SYNC("hl_maxout_forward failed");
}
__global__ void maxoutBpCompute(size_t nthreads,
                                real* inGrad,
                                const real* outGrad,
                                const int* idData,
                                size_t size,
                                size_t featLen,
                                size_t groups) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    size_t batch_idx = index / size;
    size_t i = index % size;
    size_t channel_idx = i / featLen;
    size_t feat_idx = i % featLen;
    size_t newIndex = batch_idx * size;
    size_t gradIdx =
        (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
    (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
  }
}

void hl_maxout_backward(real* inGrad,
                        const real* outGrad,
                        const int* idData,
                        size_t batchSize,
                        size_t size,
                        size_t featLen,
                        size_t groups) {
  int num_kernels = size * batchSize;
  int blocks = (num_kernels + 1024 - 1) / 1024;
  maxoutBpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
      num_kernels, inGrad, outGrad, idData, size, featLen, groups);
  CHECK_SYNC("hl_maxout_backward failed");
}
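A sketch of how the two routines pair up inside a layer; the sizes below are illustrative assumptions:

// featLen = H * W of one output feature map, size = outChannels * featLen.
hl_maxout_forward(inData, outData, idData, batchSize, size, featLen, groups);
// ... and in the backward pass, route each gradient to the winning input:
hl_maxout_backward(inGrad, outGrad, idData, batchSize, size, featLen, groups);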
paddle/cuda/src/hl_cuda_lstm.cu
...
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_activation_functions.h"
#include "hl_base.h"
#include "hl_cuda_cublas.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"

typedef hppl::Active<real>::forward t_forward;
typedef hppl::Active<real>::backward t_backward;

bool hl_lstm_sequence_parallel(int frameSize) {
...
@@ -42,9 +41,9 @@ public:
      value_ += (start + length - 1) * frameSize + idx;
    }
  }
  __device__ inline real* getPtr() const { return value_; }
  __device__ inline real getValue() { return *value_; }
  __device__ inline void setValue(real value) { *value_ = value; }
  template <int reversed, int frameSize>
  __device__ inline void nextFrame() {
    if (reversed == 0) {
...
@@ -55,28 +54,25 @@ public:
  }
};

__device__ __forceinline__ void ptx_sync(const int id, const int barriers) {
  asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory");
}

__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) {
  asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory");
}
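These two wrappers expose PTX named barriers: bar.sync makes the calling threads wait at barrier id until the stated number of threads have arrived, while bar.arrive only signals arrival without waiting. A minimal producer/consumer pattern built on them could look like the sketch below; shBuf, produce and consume are placeholders, not names from this file:

// Sketch: frameSize producer threads publish to shared memory and signal
// barrier 1; frameSize consumer threads block on the same barrier first.
if (threadIdx.x < frameSize) {           // producers
  shBuf[threadIdx.x] = produce(threadIdx.x);
  ptx_arrive(1, frameSize * 2);          // barrier count covers both halves
} else {                                 // consumers
  ptx_sync(1, frameSize * 2);
  consume(shBuf[threadIdx.x - frameSize]);
}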
template <int valueSize, int frameSize>
__device__ __forceinline__ real forward_sequence(real value,
                                                 real* shValue,
                                                 real* state,
                                                 real* preOutput,
                                                 real* output,
                                                 real check,
                                                 int index,
                                                 t_forward activeNode,
                                                 t_forward activeGate,
                                                 t_forward activeState) {
  real out;
  real prevOut;
  real state_r;
...
@@ -112,17 +108,20 @@ forward_sequence(real value,
  if (idy == 0) {
    ptx_sync(2, frameSize * 2);
    prevOut = state[idx];
    prevOut = activeState(prevOut);
    preOutput[idx] = prevOut;
    ptx_arrive(3, frameSize * 2);
  }
  return value;
}

#define OUTPUT_BARRIER_ID 10
#define OUTPUT_BARRIER_ID2 11
template <int valueSize,
          int frameSize,
          int reversed,
          int computeThreads,
          int blockSize>
__global__ void KeLstmForward(real* gateValue,
                              real* state,
                              real* output,
...
@@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue,
    }
  }
  value = forward_sequence<valueSize, frameSize>(
      value, shValue, shState, shPrevOutput, shOutput, check, index,
      hppl::gpu::forward[active_node], hppl::gpu::forward[active_gate],
      hppl::gpu::forward[active_state]);
  const int idx = index % frameSize;
  const int idy = index / frameSize;
  if (valueSize == 128) {
...
@@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue,
    real B_r[frameSize];
    const int computeIdx = index - valueSize;
    if (i == 0) {
#pragma unroll
      for (int n = 0; n < frameSize; n++) {
        B_r[n] = weight[n * valueSize + computeIdx];
      }
...
@@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue,
    }
    real sum = 0.0f;
    for (int n = 0; n < frameSize; n++) {
      sum += A_r[n] * B_r[n];
    }
    shValue[computeIdx] = sum;
    ptx_arrive(OUTPUT_BARRIER_ID2, blockSize);
...
@@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue,
  if (valueSize == 256) {
    real B_r[frameSize];
    if (i == 0) {
#pragma unroll
      for (int n = 0; n < frameSize; n++) {
        B_r[n] = weight[n * valueSize + index];
      }
    }
    real sum = 0.0f;
    for (int n = 0; n < frameSize; n++) {
      sum += shOutput[n] * B_r[n];
    }
    value += sum;
  }
...
@@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue,
  dim3 grid(numSequences, 1);
  if (!reversed) {
    if (frameSize == 32) {
      KeLstmForward<128, 32, 0, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, stateValue, outputValue, preOutputValue, checkIg,
          checkFg, checkOg, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 64) {
      KeLstmForward<256, 64, 0, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, stateValue, outputValue, preOutputValue, checkIg,
          checkFg, checkOg, weight, sequence, active_node, active_gate,
          active_state);
    }
  } else {
    if (frameSize == 32) {
      KeLstmForward<128, 32, 1, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, stateValue, outputValue, preOutputValue, checkIg,
          checkFg, checkOg, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 64) {
      KeLstmForward<256, 64, 1, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, stateValue, outputValue, preOutputValue, checkIg,
          checkFg, checkOg, weight, sequence, active_node, active_gate,
          active_state);
    }
  }
  CHECK_SYNC("hl_lstm_parallel_forward failed");
}
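A condensed view of the dispatch above, reconstructed from the calls themselves: only hidden sizes 32 and 64 are wired up in this path, and the first template argument is always four times the frame size (one slot per LSTM gate).

// frameSize -> KeLstmForward<valueSize, frameSize, reversed, computeThreads, blockSize>
//   32      -> <128, 32, r, 128, 256>, launched with 256 threads per sequence
//   64      -> <256, 64, r, 256, 256>, launched with 256 threads per sequence
// where valueSize = 4 * frameSize and r is 0 (forward) or 1 (reversed).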
__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
  int addr = idx % 32;
#pragma unroll
  for (int k = 1; k < 32; k++) {
    // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
    addr = __shfl(addr, (idx + 1) % 32, 32);
    a[k] = __shfl(a[k], addr, 32);
  }

#pragma unroll
  for (int tid = 0; tid < 31; tid++) {
    real tmp = (idx > tid) ? a[0] : a[1];
#pragma unroll
    for (int k = 31; k > 0; k--) {
      a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32];
    }
...
@@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) {
  }
  addr = (32 - idx) % 32;
#pragma unroll
  for (int k = 0; k < 32; k++) {
    a[k] = __shfl(a[k], addr, 32);
    addr = __shfl(addr, (idx + 31) % 32, 32);
  }
}
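The routine above shuffles the 32 register values held by each lane so that, after its rotation passes, a lane ends up holding a column of the 32x32 tile rather than a row. As a smaller illustration of the same primitive (a sketch of __shfl usage, not the rotation scheme used here):

// Sketch: rotate one register value left by one lane within a warp.
int lane = threadIdx.x % 32;
real v = someValue;                    // someValue is a placeholder
v = __shfl(v, (lane + 1) % 32, 32);    // lane i now holds lane (i+1)'s value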
template <int valueSize, int frameSize>
__device__ void backward_sequence(real rGateValue,
                                  real rOutputGrad,
                                  real rPreOutputValue,
                                  real& rGateGrad,
                                  real& rStateGrad,
                                  real* shStateGrad,
                                  real* shStateValue,
                                  real* shGateValue,
                                  real rCheck,
                                  real& rGateValuePrev,
                                  int index,
                                  t_backward activeNode,
                                  t_backward activeGate,
                                  t_backward activeState) {
  const int frameIdx = index % frameSize;
  const int frameIdy = index / frameSize;
  if (frameIdy == 3) {
...
@@ -363,8 +398,8 @@ backward_sequence(real rGateValue,
    rStateGrad = rGateGrad * rCheck;
    shStateGrad[index] = rStateGrad;
    ptx_sync(3, valueSize);
    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateGrad = rStateGrad * shGateValue[frameIdx];
    rGateGrad = activeGate(rGateGrad, rGateValue);
  } else if (frameIdy == 2) {
...
@@ -373,7 +408,7 @@ backward_sequence(real rGateValue,
    shStateGrad[index] = rStateGrad;
    ptx_sync(3, valueSize);
    rStateGrad += shStateGrad[frameIdx + frameSize];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateValuePrev = rGateValue;
    rGateGrad = rStateGrad * shStateValue[frameIdx];
    rGateGrad = activeGate(rGateGrad, rGateValue);
...
@@ -381,43 +416,43 @@ backward_sequence(real rGateValue,
    shGateValue[frameIdx] = rGateValue;
    ptx_sync(3, valueSize);
    rStateGrad = shStateGrad[frameIdx + frameSize];
    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
    rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize];
    rGateGrad = activeNode(rGateGrad, rGateValue);
  }
}
template <int valueSize, int frameSize>
__device__ void load_weight(real rWeight[], real* weight, const int index) {
  if (valueSize == 128) {
    weight += index;
#pragma unroll
    for (int n = 0; n < frameSize; n++) {
      rWeight[n] = weight[n * valueSize];
    }
    transpose_32x32(rWeight, index % 32);
  }
  if (valueSize == 256) {
    int id = (index / 32) % 2;
    weight += index - id * 32 + id * 32 * valueSize;
#pragma unroll
    for (int n = 0; n < 32; n++) {
      rWeight[n] = weight[n * valueSize];
      rWeight[n + 32] = weight[n * valueSize + 32];
    }
    transpose_32x32(rWeight, index % 32);
    transpose_32x32(&rWeight[32], index % 32);
  }
}
template <int valueSize, int frameSize, int reversed>
__global__ void KeLstmBackward(real* gateValue,
                               real* gateGrad,
                               real* stateValue,
                               real* stateGrad, /* do not need save */
                               real* preOutputValue,
                               real* preOutputGrad, /* do not need save */
                               real* checkIg,
                               real* checkIgGrad,
                               real* checkFg,
...
@@ -484,20 +519,27 @@ __global__ void KeLstmBackward(real *gateValue,
  for (int i = 0; i < length; ++i) {
    if (frameIdy == 3) {
      if (i != length - 1) {
        frameStateValue.nextFrame<!reversed, frameSize>();
        shStateValue[frameIdx] = frameStateValue.getValue();
      } else {
        shStateValue[frameIdx] = 0.0;
      }
    }
    backward_sequence<valueSize, frameSize>(
        rGateValue, rOutputGrad, rPreOutputValue, rGateGrad, rStateGrad,
        shStateGrad, shStateValue, shGateValue, rCheck, rGateValuePrev, index,
        hppl::gpu::backward[active_node], hppl::gpu::backward[active_gate],
        hppl::gpu::backward[active_state]);
    if (frameIdy == 3) {
      rCheckGrad += rGateGrad * rStateValue;
      rStateValue = shStateValue[frameIdx];
...
@@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue,
    shGateGrad[frameIdy][frameIdx] = rGateGrad;
    if (valueSize == 128) {
      real sum = 0.0f;
#pragma unroll
      for (int n = 0; n < frameSize; n++) {
        sum += shGateGrad[frameIdy][n] * B_r[n];
      }
      if (frameIdy == 3) {
        rOutputGrad += sum;
...
@@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue,
      }
      real sum = 0.0f;
      for (int n = 0; n < frameSize; n++) {
        sum += A_r[n] * B_r[n];
      }
      if (frameIdy == 3) {
        rOutputGrad += sum;
...
@@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue,
    if (frameIdy == 3) {
      ptx_sync(6, valueSize);
#pragma unroll
      for (int i = 0; i < 3; i++) {
        rOutputGrad += shOutputGrad[i][frameIdx];
      }
    } else {
...
@@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue,
  /* TODO: Temporary save & merger in another kernel */
  if (frameIdy == 1) {
    if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad);
  } else if (frameIdy == 2) {
    if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad);
  } else if (frameIdy == 3) {
    if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad);
  }
}
...
@@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue,
                                    hl_activation_mode_t active_node,
                                    hl_activation_mode_t active_gate,
                                    hl_activation_mode_t active_state) {
  CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
        frameSize == 256);
  dim3 grid(numSequences, 1);
  if (!reversed) {
    if (frameSize == 32) {
      KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 64) {
      KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 128) {
      KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 256) {
      KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    }
  } else {
    if (frameSize == 32) {
      KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 64) {
      KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 128) {
      KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    } else if (frameSize == 256) {
      KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>(
          gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
          checkOgGrad, outputGrad, weight, sequence, active_node, active_gate,
          active_state);
    }
  }
  CHECK_SYNC("hl_lstm_parallel_backward_data");
}
template <int B_X, int B_Y>
__global__ void KeSetGradZero(real* gateGrad,
                              const int* starts,
                              int valueSize,
                              int numSequences,
                              bool reversed) {
  // const int tid = threadIdx.x;
  const int frameIdx = blockIdx.x * B_X + threadIdx.x;
...
@@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad,
  int valueSize = 4 * frameSize;
  dim3 threads(32, 32);
  dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32);
  KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      gateGrad, sequence, valueSize, numSequences, reversed);

  if (!reversed) {
    hl_matrix_mul(outputValue,
                  HPPL_OP_T,
                  gateGrad + valueSize,
                  HPPL_OP_N,
                  weightGrad,
                  frameSize,
                  valueSize,
                  batchSize - 1,
                  1.0,
                  1.0);
  } else {
    hl_matrix_mul(outputValue + frameSize,
                  HPPL_OP_T,
                  gateGrad,
                  HPPL_OP_N,
                  weightGrad,
                  frameSize,
                  valueSize,
                  batchSize - 1,
                  1.0,
                  1.0);
  }
  CHECK_SYNC("hl_lstm_parallel_backward_weight");
}
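Reading the hl_matrix_mul arguments above in the order they appear, (A, transA, B, transB, C, dimM, dimN, dimK, alpha, beta), the two calls appear to accumulate the weight gradient as a transposed GEMM; this interpretation is inferred from the call sites, not restated from the header:

// weightGrad(frameSize x valueSize) +=
//     outputValue^T(frameSize x (batchSize - 1)) * gateGrad((batchSize - 1) x valueSize)
// Non-reversed sequences shift the gate gradients by one step (gateGrad + valueSize);
// reversed sequences shift the outputs instead (outputValue + frameSize).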
paddle/cuda/src/hl_cuda_matrix.cu
...
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
#include "hl_matrix.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sequence.h"
#include "hl_sparse.ph"
#include "paddle/utils/Logging.h"

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);

void hl_matrix_add(real* A_d,
                   real* B_d,
                   real* C_d,
                   int dimM,
                   int dimN,
                   real alpha,
...
@@ -36,33 +35,32 @@ void hl_matrix_add(real *A_d,
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);

  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
      ternary::_add<real>(alpha, beta),
      A_d,
      B_d,
      C_d,
      dimM,
      dimN,
      dimN,
      dimN,
      dimN);
  CHECK_SYNC("hl_matrix_add failed");
}

#ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128
#else
#define THRESHOLD 64
#endif
__device__ __forceinline__ void findMax(real* I,
                                        real* dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN,
                                        real* max) {
  dfMax_s[base] = -1.0e20;
  while (curIdx < dimN) {
    if (dfMax_s[base] < I[nextIdx]) {
...
@@ -78,25 +76,24 @@ void findMax(real* I,
    if (base < stride) {
      nextIdx = base + stride;
      if (dfMax_s[base] < dfMax_s[nextIdx]) {
        dfMax_s[base] = dfMax_s[nextIdx];
      }
    }
  }
  if (0 == base) {
    max[0] = dfMax_s[0];
  }
  __syncthreads();
}

__device__ __forceinline__ void subMaxAndExp(real* I,
                                             real* O,
                                             int curIdx,
                                             int nextIdx,
                                             int blockSize,
                                             int dimN,
                                             real max) {
  real val;
  while (curIdx < dimN) {
    val = I[nextIdx] - max;
...
@@ -115,14 +112,13 @@ void subMaxAndExp(real* I,
  __syncthreads();
}

__device__ __forceinline__ void valueSum(real* O,
                                         real* dfMax_s,
                                         int blockSize,
                                         int base,
                                         int curIdx,
                                         int nextIdx,
                                         int dimN) {
  dfMax_s[base] = 0;
  while (curIdx < dimN) {
    dfMax_s[base] += O[nextIdx];
...
@@ -141,13 +137,8 @@ void valueSum(real* O,
  __syncthreads();
}

__device__ __forceinline__ void divSum(
    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
  while (curIdx < dimN) {
    O[nextIdx] /= sum;
    nextIdx += blockSize;
...
@@ -155,20 +146,18 @@ void divSum(real* O,
  }
}

__device__ __forceinline__ void softmax(real* I,
                                        real* O,
                                        real* dfMax_s,
                                        int blockSize,
                                        int base,
                                        int curIdx,
                                        int nextIdx,
                                        int dimN) {
  __shared__ real max;

  // find the max number
  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);

  // sub max Value and do Exp operation
  subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
...
@@ -181,8 +170,8 @@ void softmax(real* I,
  divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
}

template <int blockSize>
__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
  int base = threadIdx.x;
  __shared__ real dfMax_s[blockSize];
  int nextIdx = blockIdx.x * dimN + base;
...
@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  dim3 block(512, 1);
  dim3 grid(dimM, 1);
  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
  CHECK_SYNC("hl_matrix_softmax failed");
}
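A minimal sketch of how this row-wise softmax is driven: each of the dimM rows of length dimN is normalized independently by one 512-thread block, so a caller only passes the raw activations and a destination buffer of the same shape (buffer names below are illustrative):

// Illustrative call: C_d[i][:] = softmax(A_d[i][:]) for every row i < dimM.
hl_matrix_softmax(A_d, C_d, dimM, dimN);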
template <int blockSize>
__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
  int base = threadIdx.x;
  int bid = blockIdx.x;
  __shared__ real dfMax_s[blockSize];
...
@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}

void hl_sequence_softmax_forward(real* A_d,
                                 real* C_d,
                                 const int* index,
                                 int numSequence) {
  CHECK_NOTNULL(A_d);
...
@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
  dim3 block(512, 1);
  dim3 grid(numSequence, 1);
  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
  CHECK_SYNC("hl_sequence_softmax_forward failed");
}
__global__ void KeMatrixDerivative(
    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
  int index;

  if (rowIdx < dimM && colIdx < dimN) {
    index = rowIdx * dimN + colIdx;
    grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
  }
}

void hl_matrix_softmax_derivative(
    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
  CHECK_NOTNULL(grad_d);
  CHECK_NOTNULL(output_d);
  CHECK_NOTNULL(sftmaxSum_d);

  int blocksX = (dimM + 0) / 1;
  int blocksY = (dimN + 1024 - 1) / 1024;
  dim3 threads(1, 1024);
  dim3 grid(blocksX, blocksY);

  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_d, output_d, sftmaxSum_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_softmax_derivative failed");
}
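The kernel above is the usual softmax backward rule: with y = softmax(x) and incoming gradient g, dL/dx_j = y_j * (g_j - sum_k g_k * y_k). Judging by the argument name, sftmaxSum_d is presumably the per-row dot product sum_k g_k * y_k precomputed by the caller; that reading is an inference from this diff, not a statement from the header. In formula form:

// Per row i:   grad_d[i][j] = output_d[i][j] * (grad_d[i][j] - sftmaxSum_d[i])
// assuming     sftmaxSum_d[i] = sum_j grad_in[i][j] * output_d[i][j]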
__global__ void KeMatrixMultiBinaryCrossEntropy(
    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < dimM) {
    for (int i = 0; i < dimN; i++) {
      entropy[index] -= log(1 - output[index * dimN + i]);
    }
    int* row_col = col + row[index];
    int col_num = row[index + 1] - row[index];
    for (int i = 0; i < col_num; i++) {
      real o = output[index * dimN + row_col[i]];
      entropy[index] -= log(o / (1 - o));
    }
...
@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
  dim3 threads(n_threads);
  dim3 grid(blocks);
  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
}

__global__ void KeMatrixMultiBinaryCrossEntropyBp(
    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
  int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (row_idx < dimM) {
    for (int i = 0; i < dimN; i++) {
      int index = row_idx * dimN + i;
      grad[index] += 1.0 / (1 - output[index]);
    }
    int col_num = row[row_idx + 1] - row[row_idx];
    int* row_col = col + row[row_idx];
    for (int i = 0; i < col_num; i++) {
      int index = row_idx * dimN + row_col[i];
      grad[index] -= 1.0 / (output[index] * (1 - output[index]));
    }
  }
}

void hl_matrix_multi_binary_cross_entropy_bp(
    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
  CHECK_NOTNULL(output);
  CHECK_NOTNULL(grad);
  CHECK_NOTNULL(csr_mat);
...
@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
  dim3 threads(n_threads);
  dim3 grid(blocks);
  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
}
__global__ void KeMatrixCrossEntropy(
    real* O, real* E, int* label, int dimM, int dimN) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int newBase;
  if (index < dimM) {
...
@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
  }
}

void hl_matrix_cross_entropy(
    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(C_d);

  int blocks = (dimM + 1024 - 1) / 1024;
  dim3 threads(1024, 1);
  dim3 grid(blocks, 1);
  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
      A_d, C_d, label_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_cross_entropy failed");
}

__global__ void KeMatrixCrossEntropyBp(
    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
  int index;
  if (rowIdx < dimM && colIdx < dimN) {
    index = rowIdx * dimN + colIdx;
    if (label_d[rowIdx] == colIdx) {
      grad_d[index] -= 1.0f / output_d[index];
    }
  }
}

void hl_matrix_cross_entropy_bp(
    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
  CHECK_NOTNULL(grad_d);
  CHECK_NOTNULL(output_d);
  CHECK_NOTNULL(label_d);

  int blocksX = (dimM + 0) / 1;
  int blocksY = (dimN + 1024 - 1) / 1024;
  dim3 threads(1, 1024);
  dim3 grid(blocksX, blocksY);
  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_d, output_d, label_d, dimM, dimN);
  CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
}
void hl_matrix_zero_mem(real* data, int num) {
  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
}
__global__ void KeParamReluForward(real* output,
...
@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
  int ty = blockIdx.y * blockDim.y + threadIdx.y;
  if (tx < width && ty < height) {
    int index = ty * width + tx;
    output[index] =
        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
  }
}
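For reference, the element-wise rule implemented above is the parametric ReLU, with one learned slope shared by each group of partial_sum consecutive columns; restated compactly from the kernel body:

// output[index] = input[index]                          if input[index] > 0
//               = w[tx / partial_sum] * input[index]    otherwise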
...
@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
  CHECK_NOTNULL(w);
  dim3 threads(16, 16);
  int blockX = (width + 16 - 1) / 16;
  int blockY = (height + 16 - 1) / 16;
  dim3 grid(blockX, blockY);
  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input, w, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_forward failed");
}
template <int blockSize>
__global__ void KeParamReluBackWardW(real* grad_w,
                                     real* grad_o,
                                     real* input,
...
@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
  int grid_num = width / partial_sum;
  dim3 threads(blockSize, 1);
  dim3 grid(grid_num, 1);
  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_w, grad_o, input, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_backward_w failed");
}
...
@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
  CHECK_NOTNULL(diff);
  dim3 threads(16, 16);
  int blockX = (width + 16 - 1) / 16;
  int blockY = (height + 16 - 1) / 16;
  dim3 grid(blockX, blockY);
  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad_o, data, w, diff, width, height, partial_sum);
  CHECK_SYNC("hl_param_relu_backward_diff failed");
}
__global__ void KeMatrixAddSharedBias(real* A,
                                      real* B,
                                      const int channel,
                                      const int M,
                                      const int N,
                                      real scale) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int dim = N / channel;
  if (index < M * N) {
...
@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
                               real scale) {
  const int blocks = 512;
  const int grids = DIVUP(dimM * dimN, blocks);
  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
      A_d, B_d, channel, dimM, dimN, scale);
  CHECK_SYNC("hl_matrix_add_shared_bias failed");
}
template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real* B,
                                          real* A,
                                          const int channel,
                                          const int M,
                                          const int N,
...
@@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B,
      int n = j * blockSize + tid;
      int m = n / dim;
      int w = n % dim;
      smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
      __syncthreads();
      simpleReduce(smem, tid, blockSize);
      sum += smem[0];
...
@@ -611,33 +563,32 @@ void hl_matrix_collect_shared_bias(real* B_d,
  const int limit = 64;
  int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;

  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
  CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
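The two shared-bias routines above treat each N-wide row as `channel` blocks of dim = N / channel columns, with one bias value per channel. A scalar reference of the add direction, given here only as a sketch with assumed names (not part of this commit), is:

// Sketch (not in the commit): scalar equivalent of KeMatrixAddSharedBias.
void add_shared_bias_cpu(real* A, const real* B, int channel, int M, int N, real scale) {
  int dim = N / channel;
  for (int index = 0; index < M * N; ++index) {
    // one bias entry per dim-wide channel block of each row
    A[index] += scale * B[(index % N) / dim];
  }
}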
__global__ void keMatrixRotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < dimM * dimN) {
    int i = idx / dimN;
    int j = idx % dimN;
    if (clockWise) {
      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
    } else {
      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
    }
  }
}
void hl_matrix_rotate(
    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  CHECK_NOTNULL(mat);
  CHECK_NOTNULL(matRot);
  const int threads = 512;
  const int blocks = DIVUP(dimM * dimN, threads);
  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
      mat, matRot, dimM, dimN, clockWise);
  CHECK_SYNC("hl_matrix_rotate failed");
}
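The kernel reads the source as a row-major dimM x dimN matrix and writes a dimN x dimM result rotated by 90 degrees. A CPU reference of the same index mapping, included only as an illustrative sketch (loop structure and names assumed, not part of this commit), would be:

// Sketch (not in the commit): CPU reference of the rotation index mapping.
void rotate_cpu(const real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
  for (int i = 0; i < dimM; ++i) {
    for (int j = 0; j < dimN; ++j) {
      if (clockWise) {
        // clockwise: source row i becomes destination column dimM - 1 - i
        matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
      } else {
        // counter-clockwise rotation
        matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
      }
    }
  }
}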
paddle/cuda/src/hl_cuda_sequence.cu  View file @ 59a8ebc6
...
@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
__global__ void KeMaxSequenceForward(real* input,
                                     const int* sequence,
                                     real* output,
                                     int* index,
                                     int numSequences,
                                     int dim) {
  int dimIdx = threadIdx.x;
  int sequenceId = blockIdx.x;
  if (sequenceId >= numSequences) return;
  int start = sequence[sequenceId];
  int end = sequence[sequenceId + 1];

  for (int i = dimIdx; i < dim; i += blockDim.x) {
    real tmp = -HL_FLOAT_MAX;
    int tmpId = -1;
    for (int insId = start; insId < end; insId++) {
      if (tmp < input[insId * dim + i]) {
        tmp = input[insId * dim + i];
        tmpId = insId;
      }
    }
    output[sequenceId * dim + i] = tmp;
    index[sequenceId * dim + i] = tmpId;
  }
}
void hl_max_sequence_forward(real* input,
                             const int* sequence,
                             real* output,
                             int* index,
                             int numSequences,
                             int dim) {
  CHECK_NOTNULL(input);
...
@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
  dim3 threads(256, 1);
  dim3 grid(numSequences, 1);
  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      input, sequence, output, index, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_forward failed");
}
__global__ void KeMaxSequenceBackward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  int colIdx = idx % dim;
  if (idx < numSequences * dim) {
    int insId = index[idx];
    inputGrad[insId * dim + colIdx] += outputGrad[idx];
  }
}
void hl_max_sequence_backward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
  CHECK_NOTNULL(outputGrad);
  CHECK_NOTNULL(index);
  CHECK_NOTNULL(inputGrad);
...
@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
  unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
  dim3 threads(128, 1);
  dim3 grid(blocks, 1);
  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
      outputGrad, index, inputGrad, numSequences, dim);
  CHECK_SYNC("hl_max_sequence_backward failed");
}
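In both max-sequence kernels `sequence` holds start offsets, so sequence[k + 1] - sequence[k] is the length of sequence k; the forward pass records, per column, which row produced the maximum so that the backward pass can route the gradient back to it. For example (values assumed purely for illustration, not from the commit):

// Illustrative only: three sequences of lengths 2, 3 and 1 packed into a
// 6-row input matrix; the offsets array has numSequences + 1 entries.
int numSequences = 3;
int sequence_h[] = {0, 2, 5, 6};  // sequence k occupies rows [sequence_h[k], sequence_h[k+1])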
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
                                real* table,
                                int* ids,
...
@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
  while (sampleId < numSamples) {
    int tableId = ids[sampleId];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real* outputData = output + sampleId * dim;
      real* tableData = table + tableId * dim;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow == 0) {
          outputData[i] += tableData[i];
...
@@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output,
        }
      }
    }
    sampleId += blockDimY * gridDimX;
  }
}
template <int blockDimX,
          int blockDimY,
          int gridDimX,
          bool seq2batch,
          bool isAdd>
__global__ void KeSequence2Batch(real* batch,
                                 real* sequence,
                                 const int* batchIndex,
                                 int seqWidth,
                                 int batchCount) {
  int idx = threadIdx.x;
  int idy = threadIdx.y;
  int id = blockIdx.x + idy * gridDimX;
  while (id < batchCount) {
    int seqId = batchIndex[id];
    real* batchData = batch + id * seqWidth;
    real* seqData = sequence + seqId * seqWidth;
    for (int i = idx; i < seqWidth; i += blockDimX) {
      if (seq2batch) {
        if (isAdd) {
...
@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
        }
      }
    }
    id += blockDimY * gridDimX;
  }
}
void hl_sequence2batch_copy(real* batch,
                            real* sequence,
                            const int* batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {
...
@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_copy failed");
}
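For readers following the template flags: seq2batch selects the copy direction and isAdd selects copy versus accumulate. A CPU reference of the gather performed in the sequence-to-batch copy direction, given only as a sketch with assumed names (not part of this commit), is:

// Sketch (not in the commit): row id of `batch` is copied from row
// batchIndex[id] of `sequence` (the seq2batch = 1, isAdd = 0 case).
void sequence2batch_copy_cpu(real* batch,
                             const real* sequence,
                             const int* batchIndex,
                             int seqWidth,
                             int batchCount) {
  for (int id = 0; id < batchCount; ++id) {
    const real* src = sequence + batchIndex[id] * seqWidth;
    real* dst = batch + id * seqWidth;
    for (int i = 0; i < seqWidth; ++i) dst[i] = src[i];
  }
}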
void hl_sequence2batch_add(real* batch,
                           real* sequence,
                           int* batchIndex,
                           int seqWidth,
                           int batchCount,
                           bool seq2batch) {
...
@@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch,
  dim3 threads(128, 8);
  dim3 grid(8, 1);
  if (seq2batch) {
    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  } else {
    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
        batch, sequence, batchIndex, seqWidth, batchCount);
  }
  CHECK_SYNC("hl_sequence2batch_add failed");
}
template <bool normByTimes, bool seq2batch>
__global__ void KeSequence2BatchPadding(real* batch,
                                        real* sequence,
                                        const int* sequenceStartPositions,
                                        const size_t sequenceWidth,
                                        const size_t maxSequenceLength,
                                        const size_t numSequences) {
  int batchIdx = blockIdx.y;
  int sequenceStart = sequenceStartPositions[batchIdx];
  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
...
@@ -276,37 +272,49 @@ void hl_sequence2batch_copy_padding(real* batch,
  if (seq2batch) {
    /* sequence -> batch */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    } else {
      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    }
  } else {
    /* batch -> sequence */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    } else {
      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    }
  }
  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
}
__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }

__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
__global__ void KeSequenceAvgForward(real* dst,
                                     real* src,
...
@@ -327,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
    for (int i = start; i < end; i++) {
      sum += src[i * width + col];
    }
    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
                                       : sum * my_rsqrt((real)seqLength));
    dst[gid] += sum;
  }
}
...
@@ -347,10 +355,10 @@ void hl_sequence_avg_forward(real* dst,
  int grid = DIVUP(width * height, 512);

  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_forward!";

  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_forward failed");
}
...
@@ -370,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
    int seqLength = end - start;
    if (seqLength == 0) return;
    real grad = src[gid];
    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
                                         : grad * my_rsqrt((real)seqLength));
    for (int i = start; i < end; i++) {
      dst[i * width + col] += grad;
    }
...
@@ -392,9 +400,9 @@ void hl_sequence_avg_backward(real* dst,
  int grid = DIVUP(width * height, 512);

  CHECK(mode == 0 || mode == 1 || mode == 2)
      << "mode error in hl_sequence_avg_backward!";

  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
      dst, src, starts, height, width, mode);
  CHECK_SYNC("hl_sequence_avg_backward failed");
}
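For clarity, the three `mode` values used by the average-pooling pair above select a plain sum (mode 1), a mean (mode 0) and a sum scaled by 1/sqrt(length) (mode 2). A scalar reference of the forward reduction for one column, given only as a sketch with assumed names (not part of this commit), is:

#include <cmath>
// Sketch (not in the commit): scalar equivalent of one column of KeSequenceAvgForward.
real seq_avg_column(const real* col, int seqLength, int mode) {
  real sum = 0;
  for (int i = 0; i < seqLength; ++i) sum += col[i];
  if (mode == 1) return sum;                 // sum
  if (mode == 0) return sum / seqLength;     // average
  return sum / std::sqrt((real)seqLength);   // mode == 2: sqrt-normalised sum
}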
paddle/cuda/src/hl_cuda_sparse.cu  View file @ 59a8ebc6
...
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_cuda.h"
#include "hl_cuda_sparse.cuh"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sparse.h"
#include "hl_sparse.ph"
#include "paddle/utils/Logging.h"

DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p);
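The mul_scalar parameterised unary op defined here is what this file later uses (in _beta_mul_c) to realise C = beta * C before accumulating a sparse product. Its elementwise effect, written out as a sketch with assumed names (not part of this commit), is simply:

// Sketch (not in the commit): elementwise effect of unary::mul_scalar<real>(p)
// applied to an m x n buffer with leading dimension lda.
void mul_scalar_cpu(real* a, int m, int n, int lda, real p) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      a[i * lda + j] = a[i * lda + j] * p;  // a = a * p
    }
  }
}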
...
@@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
...
@@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
CHECK
(
A_d
->
format
==
HL_SPARSE_CSR
)
<<
"matrix format error!"
;
CHECK
(
A_d
->
format
==
HL_SPARSE_CSR
)
<<
"matrix format error!"
;
if
(
A_d
->
nnz
==
0
)
{
if
(
A_d
->
nnz
==
0
)
{
hl_gpu_apply_unary_op
(
hl_gpu_apply_unary_op
(
unary
::
Zero
<
real
>
(),
C_d
,
dimM
,
dimN
,
dimN
);
unary
::
Zero
<
real
>
(),
C_d
,
dimM
,
dimN
,
dimN
);
return
;
return
;
}
}
/* nnz != 0 */
/* nnz != 0 */
hl_csr_matrix
A_d2
=
(
hl_csr_matrix
)(
A_d
->
matrix
);
hl_csr_matrix
A_d2
=
(
hl_csr_matrix
)(
A_d
->
matrix
);
CHECK
((
A_d2
->
csr_val
||
A_d
->
type
==
HL_NO_VALUE
)
&&
CHECK
((
A_d2
->
csr_val
||
A_d
->
type
==
HL_NO_VALUE
)
&&
A_d2
->
csr_row
&&
A_d2
->
csr_row
&&
A_d2
->
csr_col
)
<<
"parameter transa error!"
;
A_d2
->
csr_col
)
<<
"parameter transa error!"
;
int
blocksX
=
(
dimN
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksX
=
(
dimN
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksY
=
(
dimM
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksY
=
(
dimM
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
...
@@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
...
@@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
dim3
grid
(
blocksX
,
blocksY
);
dim3
grid
(
blocksX
,
blocksY
);
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
KeSMatrixCsr2Dense
<
0
>
KeSMatrixCsr2Dense
<
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_row
,
A_d2
->
csr_col
,
C_d
,
dimM
,
dimN
);
A_d2
->
csr_row
,
A_d2
->
csr_col
,
C_d
,
dimM
,
dimN
);
}
else
if
(
A_d
->
type
==
HL_FLOAT_VALUE
)
{
}
else
if
(
A_d
->
type
==
HL_FLOAT_VALUE
)
{
KeSMatrixCsr2Dense
<
1
>
KeSMatrixCsr2Dense
<
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_row
,
A_d2
->
csr_col
,
C_d
,
dimM
,
dimN
);
A_d2
->
csr_row
,
A_d2
->
csr_col
,
C_d
,
dimM
,
dimN
);
}
else
{
}
else
{
}
}
CHECK_SYNC
(
"hl_matrix_csr2dense failed"
);
CHECK_SYNC
(
"hl_matrix_csr2dense failed"
);
...
@@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
...
@@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
CHECK
(
A_d
->
format
==
HL_SPARSE_CSC
)
<<
"matrix format error!"
;
CHECK
(
A_d
->
format
==
HL_SPARSE_CSC
)
<<
"matrix format error!"
;
if
(
A_d
->
nnz
==
0
)
{
if
(
A_d
->
nnz
==
0
)
{
hl_gpu_apply_unary_op
(
hl_gpu_apply_unary_op
(
unary
::
Zero
<
real
>
(),
C_d
,
dimM
,
dimN
,
dimN
);
unary
::
Zero
<
real
>
(),
C_d
,
dimM
,
dimN
,
dimN
);
return
;
return
;
}
}
/* nnz != 0 */
/* nnz != 0 */
hl_csc_matrix
A_d2
=
(
hl_csc_matrix
)(
A_d
->
matrix
);
hl_csc_matrix
A_d2
=
(
hl_csc_matrix
)(
A_d
->
matrix
);
CHECK
((
A_d2
->
csc_val
||
A_d
->
type
==
HL_NO_VALUE
)
&&
CHECK
((
A_d2
->
csc_val
||
A_d
->
type
==
HL_NO_VALUE
)
&&
A_d2
->
csc_row
&&
A_d2
->
csc_row
&&
A_d2
->
csc_col
)
<<
"parameter transa error!"
;
A_d2
->
csc_col
)
<<
"parameter transa error!"
;
int
blocksX
=
(
dimN
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksX
=
(
dimN
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksY
=
(
dimM
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
int
blocksY
=
(
dimM
+
CU_CSR2DENSE_THREAD_X
-
1
)
/
CU_CSR2DENSE_THREAD_X
;
...
@@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
...
@@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
dim3
grid
(
blocksX
,
blocksY
);
dim3
grid
(
blocksX
,
blocksY
);
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
KeSMatrixCsc2Dense
<
0
>
KeSMatrixCsc2Dense
<
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
A_d2
->
csc_val
,
A_d2
->
csc_val
,
A_d2
->
csc_row
,
A_d2
->
csc_col
,
C_d
,
dimM
,
dimN
);
A_d2
->
csc_row
,
A_d2
->
csc_col
,
C_d
,
dimM
,
dimN
);
}
else
if
(
A_d
->
type
==
HL_FLOAT_VALUE
)
{
}
else
if
(
A_d
->
type
==
HL_FLOAT_VALUE
)
{
KeSMatrixCsc2Dense
<
1
>
KeSMatrixCsc2Dense
<
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
A_d2
->
csc_val
,
A_d2
->
csc_val
,
A_d2
->
csc_row
,
A_d2
->
csc_col
,
C_d
,
dimM
,
dimN
);
A_d2
->
csc_row
,
A_d2
->
csc_col
,
C_d
,
dimM
,
dimN
);
}
else
{
}
else
{
}
}
CHECK_SYNC
(
"hl_matrix_csc2dense failed"
);
CHECK_SYNC
(
"hl_matrix_csc2dense failed"
);
...
@@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
...
@@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
void
hl_malloc_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
void
hl_malloc_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
hl_matrix_format_t
format
,
hl_matrix_format_t
format
,
hl_matrix_value_t
value_type
,
hl_matrix_value_t
value_type
,
int
dimM
,
int
dimM
,
int
dimN
,
int
dimN
,
int
nnz
)
{
int
nnz
)
{
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
<<
"sparse matrix format error!"
;
<<
"sparse matrix format error!"
;
CHECK
(
value_type
==
HL_FLOAT_VALUE
||
value_type
==
HL_NO_VALUE
)
CHECK
(
value_type
==
HL_FLOAT_VALUE
||
value_type
==
HL_NO_VALUE
)
<<
"sparse matrix value type error!"
;
<<
"sparse matrix value type error!"
;
/* avoid malloc 0 bytes */
/* avoid malloc 0 bytes */
int
nnz_s
=
(
nnz
==
0
?
1
:
nnz
);
int
nnz_s
=
(
nnz
==
0
?
1
:
nnz
);
if
(
format
==
HL_SPARSE_CSR
)
{
if
(
format
==
HL_SPARSE_CSR
)
{
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csr_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csr_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
csr
->
sparsity
=
-
1.0
;
csr
->
sparsity
=
-
1.0
;
if
(
value_type
==
HL_NO_VALUE
)
{
if
(
value_type
==
HL_NO_VALUE
)
{
csr
->
csr_val
=
NULL
;
csr
->
csr_val
=
NULL
;
csr
->
nnz_s
=
nnz_s
;
csr
->
nnz_s
=
nnz_s
;
csr
->
row_s
=
dimM
+
1
;
csr
->
row_s
=
dimM
+
1
;
csr
->
csr_row
=
(
int
*
)
hl_malloc_device
((
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_row
=
(
int
*
)
hl_malloc_device
((
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
}
else
if
(
value_type
==
HL_FLOAT_VALUE
)
{
}
else
if
(
value_type
==
HL_FLOAT_VALUE
)
{
csr
->
nnz_s
=
nnz_s
;
csr
->
nnz_s
=
nnz_s
;
csr
->
row_s
=
dimM
+
1
;
csr
->
row_s
=
dimM
+
1
;
csr
->
csr_val
=
(
real
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
real
));
csr
->
csr_val
=
(
real
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
real
));
csr
->
csr_row
=
(
int
*
)
hl_malloc_device
((
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_row
=
(
int
*
)
hl_malloc_device
((
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
...
@@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csc_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csc_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
csc
->
sparsity
=
-
1.0
f
;
csc
->
sparsity
=
-
1.0
f
;
if
(
value_type
==
HL_NO_VALUE
)
{
if
(
value_type
==
HL_NO_VALUE
)
{
csc
->
csc_val
=
NULL
;
csc
->
csc_val
=
NULL
;
csc
->
nnz_s
=
nnz_s
;
csc
->
nnz_s
=
nnz_s
;
csc
->
col_s
=
dimN
+
1
;
csc
->
col_s
=
dimN
+
1
;
csc
->
csc_row
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csc
->
csc_row
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csc
->
csc_col
=
(
int
*
)
hl_malloc_device
((
dimN
+
1
)
*
sizeof
(
int
));
csc
->
csc_col
=
(
int
*
)
hl_malloc_device
((
dimN
+
1
)
*
sizeof
(
int
));
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
}
else
if
(
value_type
==
HL_FLOAT_VALUE
)
{
}
else
if
(
value_type
==
HL_FLOAT_VALUE
)
{
csc
->
nnz_s
=
nnz_s
;
csc
->
nnz_s
=
nnz_s
;
csc
->
col_s
=
dimN
+
1
;
csc
->
col_s
=
dimN
+
1
;
csc
->
csc_val
=
(
real
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
real
));
csc
->
csc_val
=
(
real
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
real
));
csc
->
csc_row
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csc
->
csc_row
=
(
int
*
)
hl_malloc_device
((
nnz_s
)
*
sizeof
(
int
));
csc
->
csc_col
=
(
int
*
)
hl_malloc_device
((
dimN
+
1
)
*
sizeof
(
int
));
csc
->
csc_col
=
(
int
*
)
hl_malloc_device
((
dimN
+
1
)
*
sizeof
(
int
));
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
...
@@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
void
hl_free_sparse_matrix
(
hl_sparse_matrix_s
A_d
)
{
void
hl_free_sparse_matrix
(
hl_sparse_matrix_s
A_d
)
{
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK
(
A_d
->
format
==
HL_SPARSE_CSR
||
A_d
->
format
==
HL_SPARSE_CSC
)
CHECK
(
A_d
->
format
==
HL_SPARSE_CSR
||
A_d
->
format
==
HL_SPARSE_CSC
)
<<
"sparse matrix format error!"
;
<<
"sparse matrix format error!"
;
if
(
A_d
->
matrix
==
NULL
)
{
if
(
A_d
->
matrix
==
NULL
)
{
free
(
A_d
);
free
(
A_d
);
...
@@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
...
@@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
}
}
void
hl_construct_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
void
hl_construct_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
void
*
dest_d
,
void
*
dest_d
,
size_t
size
,
size_t
size
,
hl_matrix_format_t
format
,
hl_matrix_format_t
format
,
hl_matrix_value_t
value_type
,
hl_matrix_value_t
value_type
,
int
dimM
,
int
dimM
,
int
dimN
,
int
dimN
,
int
nnz
)
{
int
nnz
)
{
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
<<
"sparse matrix format error!"
;
<<
"sparse matrix format error!"
;
if
(
format
==
HL_SPARSE_CSR
)
{
if
(
format
==
HL_SPARSE_CSR
)
{
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
size_t
size_
=
(
dimM
+
1
)
*
sizeof
(
int
)
+
nnz
*
sizeof
(
int
);
size_t
size_
=
(
dimM
+
1
)
*
sizeof
(
int
)
+
nnz
*
sizeof
(
int
);
if
(
value_type
!=
HL_NO_VALUE
)
{
if
(
value_type
!=
HL_NO_VALUE
)
{
size_
+=
nnz
*
sizeof
(
real
);
size_
+=
nnz
*
sizeof
(
real
);
}
}
CHECK_LE
(
size_
,
size
)
<<
"dest_d size("
<<
size
CHECK_LE
(
size_
,
size
)
<<
"dest_d size("
<<
size
<<
") too small, should bigger than("
<<
size_
<<
")!"
;
<<
") too small, should bigger than("
<<
size_
<<
")!"
;
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csr_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csr_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
if
(
value_type
==
HL_NO_VALUE
)
{
if
(
value_type
==
HL_NO_VALUE
)
{
csr
->
csr_val
=
NULL
;
csr
->
csr_val
=
NULL
;
csr
->
csr_row
=
(
int
*
)
dest_d
;
csr
->
csr_row
=
(
int
*
)
dest_d
;
csr
->
csr_col
=
(
int
*
)((
char
*
)
dest_d
+
(
dimM
+
1
)
*
sizeof
(
int
));
csr
->
csr_col
=
(
int
*
)((
char
*
)
dest_d
+
(
dimM
+
1
)
*
sizeof
(
int
));
}
else
{
}
else
{
csr
->
csr_val
=
(
real
*
)
dest_d
;
csr
->
csr_val
=
(
real
*
)
dest_d
;
csr
->
csr_row
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
));
csr
->
csr_row
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
));
csr
->
csr_col
=
(
int
*
)((
char
*
)
dest_d
+
csr
->
csr_col
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
)
+
nnz
*
sizeof
(
real
)
+
(
dimM
+
1
)
*
sizeof
(
int
));
(
dimM
+
1
)
*
sizeof
(
int
));
}
}
csr
->
nnz_s
=
nnz
;
csr
->
nnz_s
=
nnz
;
csr
->
row_s
=
dimM
+
1
;
csr
->
row_s
=
dimM
+
1
;
csr
->
sparsity
=
-
1.0
;
csr
->
sparsity
=
-
1.0
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
size_t
size_
=
(
dimN
+
1
)
*
sizeof
(
int
)
+
nnz
*
sizeof
(
int
);
size_t
size_
=
(
dimN
+
1
)
*
sizeof
(
int
)
+
nnz
*
sizeof
(
int
);
if
(
value_type
!=
HL_NO_VALUE
)
{
if
(
value_type
!=
HL_NO_VALUE
)
{
size_
+=
nnz
*
sizeof
(
real
);
size_
+=
nnz
*
sizeof
(
real
);
}
}
CHECK_LE
(
size_
,
size
)
<<
"dest_d size("
<<
size
CHECK_LE
(
size_
,
size
)
<<
"dest_d size("
<<
size
<<
") too small, should bigger than("
<<
size_
<<
")!"
;
<<
") too small, should bigger than("
<<
size_
<<
")!"
;
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csc_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csc_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
if
(
value_type
==
HL_NO_VALUE
)
{
if
(
value_type
==
HL_NO_VALUE
)
{
csc
->
csc_val
=
NULL
;
csc
->
csc_val
=
NULL
;
csc
->
csc_col
=
(
int
*
)
dest_d
;
csc
->
csc_col
=
(
int
*
)
dest_d
;
csc
->
csc_row
=
(
int
*
)((
char
*
)
dest_d
+
(
dimN
+
1
)
*
sizeof
(
int
));
csc
->
csc_row
=
(
int
*
)((
char
*
)
dest_d
+
(
dimN
+
1
)
*
sizeof
(
int
));
}
else
{
}
else
{
csc
->
csc_val
=
(
real
*
)
dest_d
;
csc
->
csc_val
=
(
real
*
)
dest_d
;
csc
->
csc_col
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
));
csc
->
csc_col
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
));
csc
->
csc_row
=
(
int
*
)((
char
*
)
dest_d
+
csc
->
csc_row
=
(
int
*
)((
char
*
)
dest_d
+
nnz
*
sizeof
(
real
)
+
nnz
*
sizeof
(
real
)
+
(
dimN
+
1
)
*
sizeof
(
int
));
(
dimN
+
1
)
*
sizeof
(
int
));
}
}
csc
->
nnz_s
=
nnz
;
csc
->
nnz_s
=
nnz
;
csc
->
col_s
=
dimN
+
1
;
csc
->
col_s
=
dimN
+
1
;
csc
->
sparsity
=
-
1.0
f
;
csc
->
sparsity
=
-
1.0
f
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csc
;
...
@@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
}
}
void
hl_construct_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
void
hl_construct_sparse_matrix
(
hl_sparse_matrix_s
*
A_d
,
real
*
value_d
,
real
*
value_d
,
int
*
rows_d
,
int
*
rows_d
,
int
*
cols_d
,
int
*
cols_d
,
hl_matrix_format_t
format
,
hl_matrix_format_t
format
,
hl_matrix_value_t
value_type
,
hl_matrix_value_t
value_type
,
int
dimM
,
int
dimM
,
int
dimN
,
int
dimN
,
int
nnz
)
{
int
nnz
)
{
...
@@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
dimM
>
0
&&
nnz
>=
0
)
<<
"sparse matrix size error!"
;
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
CHECK
(
format
==
HL_SPARSE_CSR
||
format
==
HL_SPARSE_CSC
)
<<
"sparse matrix format error!"
;
<<
"sparse matrix format error!"
;
if
(
format
==
HL_SPARSE_CSR
)
{
if
(
format
==
HL_SPARSE_CSR
)
{
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csr_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csr_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
...
@@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
...
@@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
*
A_d
=
(
hl_sparse_matrix_s
)
tmp
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
(
*
A_d
)
->
matrix
=
(
hl_matrix_s
)
csr
;
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
}
else
if
(
format
==
HL_SPARSE_CSC
)
{
char
*
tmp
=
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
char
*
tmp
=
+
sizeof
(
_hl_csc_matrix
));
(
char
*
)
malloc
(
sizeof
(
_hl_sparse_matrix_s
)
+
sizeof
(
_hl_csc_matrix
));
CHECK_NOTNULL
(
tmp
);
CHECK_NOTNULL
(
tmp
);
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
tmp
+
sizeof
(
_hl_sparse_matrix_s
));
...
@@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
...
@@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
hl_stream_t
stream
)
{
hl_stream_t
stream
)
{
CHECK_NOTNULL
(
csr_matrix
);
CHECK_NOTNULL
(
csr_matrix
);
CHECK_EQ
(
csr_matrix
->
format
,
HL_SPARSE_CSR
)
CHECK_EQ
(
csr_matrix
->
format
,
HL_SPARSE_CSR
)
<<
"csr_matrix is not csr format!"
;
<<
"csr_matrix is not csr format!"
;
CHECK_NOTNULL
(
csr_matrix
->
matrix
);
CHECK_NOTNULL
(
csr_matrix
->
matrix
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
csr_matrix
->
matrix
);
hl_csr_matrix
csr
=
(
hl_csr_matrix
)(
csr_matrix
->
matrix
);
CHECK_LE
(
csr_matrix
->
nnz
,
csr
->
nnz_s
)
CHECK_LE
(
csr_matrix
->
nnz
,
csr
->
nnz_s
)
<<
"copy size "
<<
csr_matrix
->
nnz
<<
"copy size "
<<
csr_matrix
->
nnz
<<
" is big than alloc size "
<<
" is big than alloc size "
<<
csr
->
nnz_s
;
<<
csr
->
nnz_s
;
CHECK_LE
((
csr_matrix
->
rows
+
1
),
csr
->
row_s
)
CHECK_LE
((
csr_matrix
->
rows
+
1
),
csr
->
row_s
)
<<
"copy size "
<<
(
csr_matrix
->
rows
+
1
)
<<
"copy size "
<<
(
csr_matrix
->
rows
+
1
)
<<
" is big than alloc size "
<<
" is big than alloc size "
<<
csr
->
row_s
;
<<
csr
->
row_s
;
CHECK
(
csr_matrix
->
type
==
HL_FLOAT_VALUE
||
CHECK
(
csr_matrix
->
type
==
HL_FLOAT_VALUE
||
csr_matrix
->
type
==
HL_NO_VALUE
)
csr_matrix
->
type
==
HL_NO_VALUE
)
<<
"sparse matrix value type error!"
;
<<
"sparse matrix value type error!"
;
if
(
csr_matrix
->
type
==
HL_NO_VALUE
)
{
if
(
csr_matrix
->
type
==
HL_NO_VALUE
)
{
if
(
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
if
(
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
return
;
return
;
}
else
if
(
csr_row
!=
NULL
&&
csr_col
!=
NULL
)
{
}
else
if
(
csr_row
!=
NULL
&&
csr_col
!=
NULL
)
{
hl_memcpy_async
(
csr
->
csr_row
,
hl_memcpy_async
(
csr_row
,
csr
->
csr_row
,
csr_row
,
(
csr_matrix
->
rows
+
1
)
*
sizeof
(
int
),
stream
);
(
csr_matrix
->
rows
+
1
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csr
->
csr_col
,
hl_memcpy_async
(
csr_col
,
csr
->
csr_col
,
csr_col
,
(
csr_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
(
csr_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter csr_row or csr_col is null pointer!"
;
LOG
(
FATAL
)
<<
"parameter csr_row or csr_col is null pointer!"
;
}
}
...
@@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
...
@@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
if
(
csr_val
==
NULL
&&
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
if
(
csr_val
==
NULL
&&
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
return
;
return
;
}
else
if
(
csr_val
!=
NULL
&&
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
}
else
if
(
csr_val
!=
NULL
&&
csr_row
==
NULL
&&
csr_col
==
NULL
)
{
hl_memcpy_async
(
csr
->
csr_val
,
hl_memcpy_async
(
csr_val
,
csr
->
csr_val
,
csr_val
,
(
csr_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
(
csr_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
}
else
if
(
csr_val
!=
NULL
&&
csr_row
!=
NULL
&&
csr_col
!=
NULL
)
{
}
else
if
(
csr_val
!=
NULL
&&
csr_row
!=
NULL
&&
csr_col
!=
NULL
)
{
hl_memcpy_async
(
csr
->
csr_val
,
hl_memcpy_async
(
csr_val
,
csr
->
csr_val
,
csr_val
,
(
csr_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
(
csr_matrix
->
nnz
)
*
sizeof
(
real
),
hl_memcpy_async
(
stream
);
csr
->
csr_row
,
csr_row
,
(
csr_matrix
->
rows
+
1
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csr
->
csr_row
,
hl_memcpy_async
(
csr_row
,
csr
->
csr_col
,
csr_col
,
(
csr_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
(
csr_matrix
->
rows
+
1
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csr
->
csr_col
,
csr_col
,
(
csr_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter csr_row or csr_col is null pointer!"
;
LOG
(
FATAL
)
<<
"parameter csr_row or csr_col is null pointer!"
;
}
}
}
}
csr
->
sparsity
=
((
float
)
csr_matrix
->
nnz
)
/
csr
->
sparsity
=
((
float
)
csr_matrix
->
nnz
)
/
((
float
)
csr_matrix
->
rows
)
/
((
float
)
csr_matrix
->
rows
)
/
((
float
)
csr_matrix
->
cols
);
((
float
)
csr_matrix
->
cols
);
}
}
...
@@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
...
@@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
hl_stream_t
stream
)
{
hl_stream_t
stream
)
{
CHECK_NOTNULL
(
csc_matrix
);
CHECK_NOTNULL
(
csc_matrix
);
CHECK_EQ
(
csc_matrix
->
format
,
HL_SPARSE_CSC
)
CHECK_EQ
(
csc_matrix
->
format
,
HL_SPARSE_CSC
)
<<
"csc_matrix is not csc format error!"
;
<<
"csc_matrix is not csc format error!"
;
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
csc_matrix
->
matrix
);
hl_csc_matrix
csc
=
(
hl_csc_matrix
)(
csc_matrix
->
matrix
);
CHECK_LE
(
csc_matrix
->
nnz
,
csc
->
nnz_s
)
CHECK_LE
(
csc_matrix
->
nnz
,
csc
->
nnz_s
)
<<
"copy size "
<<
csc_matrix
->
nnz
<<
"copy size "
<<
csc_matrix
->
nnz
<<
" is big than alloc size "
<<
" is big than alloc size "
<<
csc
->
nnz_s
;
<<
csc
->
nnz_s
;
CHECK_LE
((
csc_matrix
->
cols
+
1
),
csc
->
col_s
)
CHECK_LE
((
csc_matrix
->
cols
+
1
),
csc
->
col_s
)
<<
"copy size "
<<
(
csc_matrix
->
cols
+
1
)
<<
"copy size "
<<
(
csc_matrix
->
cols
+
1
)
<<
" is big than alloc size "
<<
" is big than alloc size "
<<
csc
->
col_s
;
<<
csc
->
col_s
;
CHECK
(
csc_matrix
->
type
==
HL_FLOAT_VALUE
||
CHECK
(
csc_matrix
->
type
==
HL_FLOAT_VALUE
||
csc_matrix
->
type
==
HL_NO_VALUE
)
csc_matrix
->
type
==
HL_NO_VALUE
)
<<
"sparse matrix value type error!"
;
<<
"sparse matrix value type error!"
;
if
(
csc_matrix
->
type
==
HL_NO_VALUE
)
{
if
(
csc_matrix
->
type
==
HL_NO_VALUE
)
{
if
(
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
if
(
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
return
;
return
;
}
else
if
(
csc_row
!=
NULL
&&
csc_col
!=
NULL
)
{
}
else
if
(
csc_row
!=
NULL
&&
csc_col
!=
NULL
)
{
hl_memcpy_async
(
csc
->
csc_row
,
hl_memcpy_async
(
csc_row
,
csc
->
csc_row
,
csc_row
,
(
csc_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
(
csc_matrix
->
nnz
)
*
sizeof
(
int
),
hl_memcpy_async
(
stream
);
csc
->
csc_col
,
csc_col
,
(
csc_matrix
->
cols
+
1
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csc
->
csc_col
,
csc_col
,
(
csc_matrix
->
cols
+
1
)
*
sizeof
(
int
),
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter csc_row or csc_col is null pointer!"
;
LOG
(
FATAL
)
<<
"parameter csc_row or csc_col is null pointer!"
;
}
}
...
@@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
...
@@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
if
(
csc_val
==
NULL
&&
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
if
(
csc_val
==
NULL
&&
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
return
;
return
;
}
else
if
(
csc_val
!=
NULL
&&
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
}
else
if
(
csc_val
!=
NULL
&&
csc_row
==
NULL
&&
csc_col
==
NULL
)
{
hl_memcpy_async
(
csc
->
csc_val
,
hl_memcpy_async
(
csc_val
,
csc
->
csc_val
,
csc_val
,
(
csc_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
(
csc_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
}
else
if
(
csc_val
!=
NULL
&&
csc_row
!=
NULL
&&
csc_col
!=
NULL
)
{
}
else
if
(
csc_val
!=
NULL
&&
csc_row
!=
NULL
&&
csc_col
!=
NULL
)
{
hl_memcpy_async
(
csc
->
csc_val
,
hl_memcpy_async
(
csc_val
,
csc
->
csc_val
,
csc_val
,
(
csc_matrix
->
nnz
)
*
sizeof
(
real
),
stream
);
(
csc_matrix
->
nnz
)
*
sizeof
(
real
),
hl_memcpy_async
(
stream
);
csc
->
csc_row
,
csc_row
,
(
csc_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csc
->
csc_row
,
hl_memcpy_async
(
csc_row
,
csc
->
csc_col
,
csc_col
,
(
csc_matrix
->
cols
+
1
)
*
sizeof
(
int
),
stream
);
(
csc_matrix
->
nnz
)
*
sizeof
(
int
),
stream
);
hl_memcpy_async
(
csc
->
csc_col
,
csc_col
,
(
csc_matrix
->
cols
+
1
)
*
sizeof
(
int
),
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter csc_row or csc_col is null pointer!"
;
LOG
(
FATAL
)
<<
"parameter csc_row or csc_col is null pointer!"
;
}
}
}
}
csc
->
sparsity
=
((
float
)
csc_matrix
->
nnz
)
/
csc
->
sparsity
=
((
float
)
csc_matrix
->
nnz
)
/
((
float
)
csc_matrix
->
rows
)
/
((
float
)
csc_matrix
->
rows
)
/
((
float
)
csc_matrix
->
cols
);
((
float
)
csc_matrix
->
cols
);
}
}
...
@@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
...
@@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
hl_sparse_matrix_s
src
,
hl_sparse_matrix_s
src
,
hl_stream_t
stream
)
{
hl_stream_t
stream
)
{
CHECK
(
dst
&&
src
&&
dst
->
matrix
&&
src
->
matrix
)
CHECK
(
dst
&&
src
&&
dst
->
matrix
&&
src
->
matrix
)
<<
"parameter dst or src is null pointer!"
;
<<
"parameter dst or src is null pointer!"
;
CHECK_EQ
(
dst
->
format
,
src
->
format
)
CHECK_EQ
(
dst
->
format
,
src
->
format
)
<<
"sparse matrix format does not match!"
;
<<
"sparse matrix format does not match!"
;
CHECK
(
dst
->
type
!=
HL_FLOAT_VALUE
||
src
->
type
!=
HL_NO_VALUE
)
CHECK
(
dst
->
type
!=
HL_FLOAT_VALUE
||
src
->
type
!=
HL_NO_VALUE
)
<<
"src sparse matrix is no value, dst sparse matrix has value!"
;
<<
"src sparse matrix is no value, dst sparse matrix has value!"
;
if
(
dst
->
format
==
HL_SPARSE_CSR
)
{
if
(
dst
->
format
==
HL_SPARSE_CSR
)
{
dst
->
rows
=
src
->
rows
;
dst
->
rows
=
src
->
rows
;
dst
->
cols
=
src
->
cols
;
dst
->
cols
=
src
->
cols
;
dst
->
nnz
=
src
->
nnz
;
dst
->
nnz
=
src
->
nnz
;
hl_csr_matrix
csr
=
(
hl_csr_matrix
)
src
->
matrix
;
hl_csr_matrix
csr
=
(
hl_csr_matrix
)
src
->
matrix
;
hl_memcpy_csr_matrix
(
dst
,
hl_memcpy_csr_matrix
(
dst
,
csr
->
csr_val
,
csr
->
csr_row
,
csr
->
csr_col
,
stream
);
csr
->
csr_val
,
csr
->
csr_row
,
csr
->
csr_col
,
stream
);
}
else
if
(
dst
->
format
==
HL_SPARSE_CSC
)
{
}
else
if
(
dst
->
format
==
HL_SPARSE_CSC
)
{
dst
->
rows
=
src
->
rows
;
dst
->
rows
=
src
->
rows
;
dst
->
cols
=
src
->
cols
;
dst
->
cols
=
src
->
cols
;
dst
->
nnz
=
src
->
nnz
;
dst
->
nnz
=
src
->
nnz
;
hl_csc_matrix
csc
=
(
hl_csc_matrix
)
src
->
matrix
;
hl_csc_matrix
csc
=
(
hl_csc_matrix
)
src
->
matrix
;
hl_memcpy_csc_matrix
(
dst
,
hl_memcpy_csc_matrix
(
dst
,
csc
->
csc_val
,
csc
->
csc_row
,
csc
->
csc_col
,
stream
);
csc
->
csc_val
,
csc
->
csc_row
,
csc
->
csc_col
,
stream
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"sparse matrix format error!"
;
LOG
(
FATAL
)
<<
"sparse matrix format error!"
;
}
}
...
@@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
...
@@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
if
(
beta
==
0.0
)
{
if
(
beta
==
0.0
)
{
hl_gpu_apply_unary_op
(
unary
::
Zero
<
real
>
(),
c
,
dimM
,
dimN
,
dimN
);
hl_gpu_apply_unary_op
(
unary
::
Zero
<
real
>
(),
c
,
dimM
,
dimN
,
dimN
);
}
else
{
}
else
{
if
(
beta
!=
1.0
){
if
(
beta
!=
1.0
)
{
hl_gpu_apply_unary_op
(
hl_gpu_apply_unary_op
(
unary
::
mul_scalar
<
real
>
(
beta
),
c
,
dimM
,
dimN
,
dimN
);
unary
::
mul_scalar
<
real
>
(
beta
),
c
,
dimM
,
dimN
,
dimN
);
}
}
}
}
return
;
return
;
}
}
void
hl_matrix_csr_mul_dense
(
hl_sparse_matrix_s
A_d
,
hl_trans_op_t
transa
,
void
hl_matrix_csr_mul_dense
(
hl_sparse_matrix_s
A_d
,
real
*
B_d
,
hl_trans_op_t
transb
,
hl_trans_op_t
transa
,
real
*
B_d
,
hl_trans_op_t
transb
,
real
*
C_d
,
real
*
C_d
,
int
dimM
,
int
dimN
,
int
dimK
,
int
dimM
,
real
alpha
,
real
beta
)
{
int
dimN
,
int
dimK
,
real
alpha
,
real
beta
)
{
CHECK_EQ
(
transb
,
HPPL_OP_N
);
CHECK_EQ
(
transb
,
HPPL_OP_N
);
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
A_d
);
CHECK_NOTNULL
(
B_d
);
CHECK_NOTNULL
(
B_d
);
...
@@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
...
@@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
if
((
HPPL_OP_N
==
transa
&&
(
A_d
->
rows
!=
dimM
||
A_d
->
cols
!=
dimK
))
||
if
((
HPPL_OP_N
==
transa
&&
(
A_d
->
rows
!=
dimM
||
A_d
->
cols
!=
dimK
))
||
(
HPPL_OP_T
==
transa
&&
(
A_d
->
rows
!=
dimK
||
A_d
->
cols
!=
dimM
)))
{
(
HPPL_OP_T
==
transa
&&
(
A_d
->
rows
!=
dimK
||
A_d
->
cols
!=
dimM
)))
{
LOG
(
FATAL
)
<<
"parameter error!"
;
LOG
(
FATAL
)
<<
"parameter error!"
;
}
}
if
(
A_d
->
nnz
==
0
)
{
if
(
A_d
->
nnz
==
0
)
{
...
@@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
...
@@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
/* nnz != 0 */
/* nnz != 0 */
hl_csr_matrix
A_d2
=
(
hl_csr_matrix
)(
A_d
->
matrix
);
hl_csr_matrix
A_d2
=
(
hl_csr_matrix
)(
A_d
->
matrix
);
if
((
A_d2
->
csr_val
==
NULL
&&
A_d
->
type
!=
HL_NO_VALUE
)
||
if
((
A_d2
->
csr_val
==
NULL
&&
A_d
->
type
!=
HL_NO_VALUE
)
||
A_d2
->
csr_row
==
NULL
||
A_d2
->
csr_row
==
NULL
||
A_d2
->
csr_col
==
NULL
)
{
A_d2
->
csr_col
==
NULL
)
{
LOG
(
FATAL
)
<<
"parameter error!"
;
LOG
(
FATAL
)
<<
"parameter error!"
;
}
}
...
@@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
...
@@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
/* sparsity pattern */
/* sparsity pattern */
// A_d->sparsity;
// A_d->sparsity;
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
KeSMatrixCsrMulDense
<
0
>
KeSMatrixCsrMulDense
<
0
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
C_d
,
C_d
,
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_col
,
A_d2
->
csr_col
,
A_d2
->
csr_row
,
A_d2
->
csr_row
,
B_d
,
B_d
,
dimM
,
dimM
,
dimN
,
dimN
,
dimK
,
dimK
,
alpha
,
alpha
,
beta
);
beta
);
}
else
{
}
else
{
KeSMatrixCsrMulDense
<
1
>
KeSMatrixCsrMulDense
<
1
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
C_d
,
C_d
,
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_col
,
A_d2
->
csr_col
,
A_d2
->
csr_row
,
A_d2
->
csr_row
,
B_d
,
B_d
,
dimM
,
dimM
,
dimN
,
dimN
,
dimK
,
dimK
,
alpha
,
alpha
,
beta
);
beta
);
}
}
}
else
if
(
HPPL_OP_T
==
transa
)
{
}
else
if
(
HPPL_OP_T
==
transa
)
{
_beta_mul_c
(
C_d
,
dimM
,
dimN
,
beta
);
_beta_mul_c
(
C_d
,
dimM
,
dimN
,
beta
);
int
blocksX
=
(
dimN
+
CU_CSC_MUL_DENSE_BLOCK_N
-
1
)
/
int
blocksX
=
CU_CSC_MUL_DENSE_BLOCK_N
;
(
dimN
+
CU_CSC_MUL_DENSE_BLOCK_N
-
1
)
/
CU_CSC_MUL_DENSE_BLOCK_N
;
int
blocksY
=
(
dimK
+
CU_CSC_MUL_DENSE_BLOCK_K
-
1
)
/
int
blocksY
=
CU_CSC_MUL_DENSE_BLOCK_K
;
(
dimK
+
CU_CSC_MUL_DENSE_BLOCK_K
-
1
)
/
CU_CSC_MUL_DENSE_BLOCK_K
;
dim3
threads
(
CU_CSC_MUL_DENSE_THREAD_X
,
CU_CSC_MUL_DENSE_THREAD_Y
);
dim3
threads
(
CU_CSC_MUL_DENSE_THREAD_X
,
CU_CSC_MUL_DENSE_THREAD_Y
);
dim3
grid
(
blocksX
,
blocksY
);
dim3
grid
(
blocksX
,
blocksY
);
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
if
(
A_d
->
type
==
HL_NO_VALUE
)
{
KeSMatrixCscMulDense
<
0
>
KeSMatrixCscMulDense
<
0
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
C_d
,
C_d
,
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_col
,
A_d2
->
csr_col
,
A_d2
->
csr_row
,
A_d2
->
csr_row
,
B_d
,
B_d
,
dimM
,
dimM
,
dimN
,
dimN
,
dimK
,
dimK
,
alpha
,
alpha
,
beta
);
beta
);
}
else
{
}
else
{
KeSMatrixCscMulDense
<
1
>
KeSMatrixCscMulDense
<
1
>
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
C_d
,
C_d
,
A_d2
->
csr_val
,
A_d2
->
csr_val
,
A_d2
->
csr_col
,
A_d2
->
csr_col
,
A_d2
->
csr_row
,
A_d2
->
csr_row
,
B_d
,
B_d
,
dimM
,
dimM
,
dimN
,
dimN
,
dimK
,
dimK
,
alpha
,
alpha
,
beta
);
beta
);
}
}
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"parameter transa error!"
;
LOG
(
FATAL
)
<<
"parameter transa error!"
;
...
@@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
...
@@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_csr_mul_dense failed");
}

void hl_matrix_dense_mul_csc(real *A_d,
                             hl_trans_op_t transa,
                             hl_sparse_matrix_s B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transa, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
    LOG(FATAL) << "parameter dims error!";
  }
  CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!";

  if (B_d->nnz == 0) {
    _beta_mul_c(C_d, dimM, dimN, beta);
...
@@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix);
  if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) ||
      B_d2->csc_row == NULL || B_d2->csc_col == NULL) {
    LOG(FATAL) << "parameter B is null!";
  }
...
@@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
    dim3 grid(blocksX, blocksY);
    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_row, B_d2->csc_col,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_row, B_d2->csc_col,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (transb == HPPL_OP_T) {
    _beta_mul_c(C_d, dimM, dimN, beta);
    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
    dim3 grid(blocksX, blocksY);
    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_col, B_d2->csc_row,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csc_val, B_d2->csc_col, B_d2->csc_row,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transb error!";
...
@@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_dense_mul_csc failed");
}

void hl_matrix_dense_mul_csr(real *A_d,
                             hl_trans_op_t transa,
                             hl_sparse_matrix_s B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transa, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);

  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
      (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) ||
      (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
    LOG(FATAL) << "parameter dims error!";
  }

  CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!";

  if (B_d->nnz == 0) {
    _beta_mul_c(C_d, dimM, dimN, beta);
...
@@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
    LOG(FATAL) << "parameter transa error!";
  }

  if (transb == HPPL_OP_N) {
    _beta_mul_c(C_d, dimM, dimN, beta);
    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
    dim3 grid(blocksX, blocksY);
    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_row, B_d2->csr_col,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_row, B_d2->csr_col,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (transb == HPPL_OP_T) {
    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
...
@@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
    dim3 grid(blocksX, blocksY);
    if (B_d->type == HL_NO_VALUE) {
      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_col, B_d2->csr_row,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d, B_d2->csr_val, B_d2->csr_col, B_d2->csr_row,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transb error!";
...
@@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_dense_mul_csr failed");
}

void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
                             hl_trans_op_t transa,
                             real *B_d,
                             hl_trans_op_t transb,
                             real *C_d,
                             int dimM,
                             int dimN,
                             int dimK,
                             real alpha,
                             real beta) {
  CHECK_EQ(transb, HPPL_OP_N);
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  /* nnz != 0 */
  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
  if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) ||
      A_d2->csc_row == NULL || A_d2->csc_col == NULL) {
    LOG(FATAL) << "parameter error!";
  }

  if (HPPL_OP_N == transa) {
    _beta_mul_c(C_d, dimM, dimN, beta);
    int blocksX =
        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
    int blocksY =
        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
    dim3 grid(blocksX, blocksY);
    if (A_d->type == HL_NO_VALUE) {
      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    }
  } else if (HPPL_OP_T == transa) {
    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
...
@@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
    /* sparsity pattern */
    // A_d->sparsity;
    if (A_d->type == HL_NO_VALUE) {
      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    } else {
      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d, A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, B_d,
          dimM, dimN, dimK, alpha, beta);
    }
  } else {
    LOG(FATAL) << "parameter transa error!";
...
@@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
  CHECK_SYNC("hl_matrix_csc_mul_dense failed");
}

void hl_sparse_matrix_mul(real *A_d,
                          hl_trans_op_t transa,
                          real *B_d,
                          hl_trans_op_t transb,
                          hl_sparse_matrix_s C_d,
                          int dimM,
                          int dimN,
                          int dimK,
                          real alpha,
                          real beta) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);
...
@@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
  if (C_d->format == HL_SPARSE_CSC) {
    hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix);
    if (C_d2->csc_val == NULL ||
        C_d2->csc_row == NULL ||
        C_d2->csc_col == NULL) {
      LOG(FATAL) << "parameter error!";
    }

    if (beta != 1.0) {
      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
                            C_d2->csc_val,
                            1,
                            C_d->nnz,
                            C_d->nnz);
    }

    int blocksX = dimN;
...
@@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
    dim3 grid(blocksX, blocksY);
    bool transA = transa == HPPL_OP_T ? 1 : 0;
    bool transB = transb == HPPL_OP_T ? 1 : 0;
    KeSMatrixDenseMulDense2CSC<<<grid, threads, 0, STREAM_DEFAULT>>>(
        C_d2->csc_val, C_d2->csc_row, C_d2->csc_col, A_d, B_d,
        transA, transB, dimM, dimN, dimK, alpha, beta);
    CHECK_SYNC("hl_sparse_matrix_mul failed");
  } else {
    hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix);
    if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) ||
        C_d2->csr_row == NULL ||
        C_d2->csr_col == NULL) {
      LOG(FATAL) << "parameter error!";
    }

    if (beta != 1.0) {
      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
                            C_d2->csr_val,
                            1,
                            C_d->nnz,
                            C_d->nnz);
    }

    bool transA = transa == HPPL_OP_T ? 1 : 0;
...
@@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
      dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1);
      dim3 grid(blocksX, blocksY);
      KeSMatrixDenseMulDense2CSR<<<grid, threads, 0, STREAM_DEFAULT>>>(
          C_d2->csr_val, C_d2->csr_row, C_d2->csr_col, A_d, B_d,
          transA, transB, dimM, dimN, dimK, alpha, beta);
      CHECK_SYNC("hl_sparse_matrix_mul failed");
    } else {
      CHECK(!transA) << "Not supported A is trans and B is not trans!";
...
@@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
      avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
      int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
      dim3 grid(gridx, dimM);
      KeSMatrixDenseMulDenseTrans2CSR<<<grid, block, 0, STREAM_DEFAULT>>>(
          C_d2->csr_val, C_d2->csr_row, C_d2->csr_col, A_d, B_d,
          transA, transB, dimM, dimN, dimK, alpha, beta);
      CHECK_SYNC("hl_sparse_matrix_mul failed");
    }
  }
}
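// Reference sketch (reviewer note, not the library implementation): as I read
// the kernels above, hl_sparse_matrix_mul computes C = alpha * op(A) * op(B)
// + beta * C with dense A, B and only updates the positions already present in
// the sparse output C. A minimal host-side sketch of that semantics for a
// CSR-formatted C, assuming row-major A (dimM x dimK) and B (dimK x dimN) and
// no transposes, looks like this:
void sparse_output_gemm_ref(const float* A, const float* B, float* csr_val,
                            const int* csr_row, const int* csr_col,
                            int dimM, int dimN, int dimK,
                            float alpha, float beta) {
  for (int r = 0; r < dimM; ++r) {
    for (int p = csr_row[r]; p < csr_row[r + 1]; ++p) {
      int c = csr_col[p];  // only existing nonzeros of C are written
      float dot = 0.f;
      for (int k = 0; k < dimK; ++k) dot += A[r * dimK + k] * B[k * dimN + c];
      csr_val[p] = alpha * dot + beta * csr_val[p];
    }
  }
}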
...
@@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
  CHECK_NOTNULL(csc_col);
  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
      << "csc_matrix is not csc format error!";

  if (csc_matrix->nnz > row_size ||
      csc_matrix->cols + 1 > static_cast<int>(col_size)) {
...
@@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
  }

  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
  hl_memcpy_async((void *)csc_row,
                  (void *)csc->csc_row,
                  (csc_matrix->nnz) * sizeof(int),
                  stream);
  hl_memcpy_async((void *)csc_col,
                  (void *)csc->csc_col,
                  (csc_matrix->cols + 1) * sizeof(int),
                  stream);

  if (csc_matrix->type == HL_FLOAT_VALUE) {
    if (csc_val != NULL) {
      CHECK_LE(csc_matrix->nnz, val_size) << "size not match!";
      hl_memcpy_async((void *)csc_val,
                      (void *)csc->csc_val,
                      (csc_matrix->nnz) * sizeof(real),
                      stream);
    } else {
      LOG(FATAL) << "parameter csr_val is null pointer!";
...
@@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
  CHECK_NOTNULL(csr_row);
  CHECK_NOTNULL(csr_col);
  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
      << "csr_matrix is not csr format error!";

  if (csr_matrix->nnz > col_size ||
      csr_matrix->rows + 1 > static_cast<int>(row_size)) {
...
@@ -1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
  }

  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
  hl_memcpy_async((void *)csr_row,
                  (void *)csr->csr_row,
                  (csr_matrix->rows + 1) * sizeof(int),
                  stream);
  hl_memcpy_async((void *)csr_col,
                  (void *)csr->csr_col,
                  (csr_matrix->nnz) * sizeof(int),
                  stream);

  if (csr_matrix->type == HL_FLOAT_VALUE) {
    if (csr_val != NULL) {
      CHECK_LE(csr_matrix->nnz, val_size) << "size not match!";
      hl_memcpy_async((void *)csr_val,
                      (void *)csr->csr_val,
                      (csr_matrix->nnz) * sizeof(real),
                      stream);
    } else {
      LOG(FATAL) << "parameter csr_val is null pointer!";
...
@@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
  }
}

void hl_sparse_matrix_column_sum(
    real* A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
  if (B_d->format == HL_SPARSE_CSR) {
    hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale);
  } else {
...
@@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM,
  }
}

void hl_matrix_csr_column_sum(
    real* A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
  CHECK_SYNC("hl_matrix_csr_column_sum failed");
}

void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real* B_d, real scale) {
  if (A_d->format == HL_SPARSE_CSR) {
    hl_matrix_csr_add_bias(A_d, B_d, scale);
  } else {
...
@@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
  }
}

void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d, real scale) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
  CHECK_SYNC("hl_sparse_matrix_add_bias failed");
}

void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
                                real *B_d,
                                int dimM,
                                int dimN,
                                real alpha,
                                real beta) {
  if (A_d->format == HL_SPARSE_CSR) {
    hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta);
  } else {
...
@@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM,
  }
}

void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
                             real* B_d,
                             int dimM,
                             int dimN,
                             real alpha,
                             real beta) {
  CHECK_NOTNULL(A_d);
  CHECK_NOTNULL(B_d);
...
@@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
  gridX = gridX > 0 ? gridX : 1;
  dim3 block(512, 1);
  dim3 grid(gridX, dimM);
  KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>(
      A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, B_d,
      alpha, beta, dimM, dimN);

  CHECK_SYNC("hl_sparse_matrix_add_dense failed");
}

int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
  __sparse_get_return__(sMat, row);
}

int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
  __sparse_get_return__(sMat, col);
}

real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
  __sparse_get_return__(sMat, val);
}
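// Reference sketch (reviewer note, not the PaddlePaddle API): the
// KeSMatrixCsrMulDense / KeSMatrixCscMulDense launches in this file compute
// C = alpha * A * B + beta * C with a sparse A and dense B, C. A minimal
// single-threaded sketch of the CSR case, assuming row-major dense buffers
// and treating HL_NO_VALUE entries as 1 (my reading of the <0>/<1> template
// parameter), is shown only to make the indexing concrete:
void csr_mul_dense_ref(float* C, const float* csr_val, const int* csr_col,
                       const int* csr_row, const float* B,
                       int dimM, int dimN, int dimK, float alpha, float beta) {
  for (int i = 0; i < dimM * dimN; ++i) C[i] *= beta;  // the _beta_mul_c step
  for (int r = 0; r < dimM; ++r) {
    for (int p = csr_row[r]; p < csr_row[r + 1]; ++p) {
      int k = csr_col[p];                     // column of A == row of B
      float a = csr_val ? csr_val[p] : 1.f;   // assumed HL_NO_VALUE semantics
      for (int c = 0; c < dimN; ++c) {
        C[r * dimN + c] += alpha * a * B[k * dimN + c];
      }
    }
  }
}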
paddle/cuda/src/hl_perturbation_util.cu
...
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <stdlib.h>
#include <cmath>

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh"
#include "hl_time.h"

#define _USE_MATH_DEFINES
...
@@ -30,10 +29,16 @@ limitations under the License. */
 * centerX, centerY: translation.
 * sourceX, sourceY: output coordinates in the original image.
 */
__device__ void getTranformCoord(int x,
                                 int y,
                                 real theta,
                                 real scale,
                                 real tgtCenter,
                                 real imgCenter,
                                 real centerR,
                                 real centerC,
                                 int* sourceX,
                                 int* sourceY) {
  real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};

  // compute coordinates in the rotated and scaled image
...
@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
 * created by Wei Xu (genome), converted by Jiang Wang
 */
__global__ void kSamplingPatches(const real* imgs,
                                 real* targets,
                                 int imgSize,
                                 int tgtSize,
                                 const int channels,
                                 int samplingRate,
                                 const real* thetas,
                                 const real* scales,
                                 const int* centerRs,
                                 const int* centerCs,
                                 const real padValue,
                                 const int numImages) {
  const int caseIdx = blockIdx.x * 4 + threadIdx.x;
  const int pxIdx = blockIdx.y * 128 + threadIdx.y;
...
@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  const int pxY = pxIdx / tgtSize;

  int srcPxX, srcPxY;
  getTranformCoord(pxX,
                   pxY,
                   thetas[imgIdx],
                   scales[imgIdx],
                   tgtCenter,
                   imgCenter,
                   centerCs[caseIdx],
                   centerRs[caseIdx],
                   &srcPxX,
                   &srcPxY);

  imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
...
@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
 *
 * created by Wei Xu
 */
void hl_generate_disturb_params(real*& gpuAngle,
                                real*& gpuScaleRatio,
                                int*& gpuCenterR,
                                int*& gpuCenterC,
                                int numImages,
                                int imgSize,
                                real rotateAngle,
                                real scaleRatio,
                                int samplingRate,
                                bool isTrain) {
  // The number of output samples.
  int numPatches = numImages * samplingRate;
...
@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  for (int i = 0; i < numImages; i++) {
    r_angle[i] =
        (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
                                        - 0.5);
    s_ratio[i] =
        1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
  }
...
@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
    int pxY =
        (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT

    const real H[4] = {cos(-r_angle[i]),
                       -sin(-r_angle[i]),
                       sin(-r_angle[i]),
                       cos(-r_angle[i])};
    real x = pxX - imgCenter;
    real y = pxY - imgCenter;
    real xx = H[0] * x + H[1] * y;
...
@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
  delete[] center_c;
}

void hl_conv_random_disturb_with_params(const real* images,
                                        int imgSize,
                                        int tgtSize,
                                        int channels,
                                        int numImages,
                                        int samplingRate,
                                        const real* gpuRotationAngle,
                                        const real* gpuScaleRatio,
                                        const int* gpuCenterR,
...
@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
  dim3 threadsPerBlock(4, 128);
  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));

  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
                                                   target,
                                                   imgSize,
                                                   tgtSize,
                                                   channels,
                                                   samplingRate,
                                                   gpuRotationAngle,
                                                   gpuScaleRatio,
                                                   gpuCenterR,
                                                   gpuCenterC,
                                                   paddingValue,
                                                   numImages);

  hl_device_synchronize();
}

void hl_conv_random_disturb(const real* images,
                            int imgSize,
                            int tgtSize,
                            int channels,
                            int numImages,
                            real scaleRatio,
                            real rotateAngle,
                            int samplingRate,
                            real* gpu_r_angle,
                            real* gpu_s_ratio,
                            int* gpu_center_r,
                            int* gpu_center_c,
                            int paddingValue,
                            bool isTrain,
                            real* targets) {
  // generate the random disturbance sequence and the sampling locations
  hl_generate_disturb_params(gpu_r_angle,
                             gpu_s_ratio,
                             gpu_center_r,
                             gpu_center_c,
                             numImages,
                             imgSize,
                             rotateAngle,
                             scaleRatio,
                             samplingRate,
                             isTrain);
  hl_conv_random_disturb_with_params(images,
                                     imgSize,
                                     tgtSize,
                                     channels,
                                     numImages,
                                     samplingRate,
                                     gpu_r_angle,
                                     gpu_s_ratio,
                                     gpu_center_r,
                                     gpu_center_r,
                                     paddingValue,
                                     targets);
}
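// Reference sketch (reviewer note): getTranformCoord maps a target-patch pixel
// back to a source pixel of the original image with the inverse rotation
// H = [cos(-theta), -sin(-theta); sin(-theta), cos(-theta)], the scale, and the
// sampled center, mirroring the host-side math visible in
// hl_generate_disturb_params. The body of the device function is collapsed in
// this hunk, so the exact centering and rounding conventions below are
// assumptions, not the actual implementation:
void transform_coord_ref(int x, int y, float theta, float scale,
                         float tgtCenter, float imgCenter,
                         float centerR, float centerC,
                         int* srcX, int* srcY) {
  float H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
  // coordinates relative to the target-patch center
  float dx = x - tgtCenter, dy = y - tgtCenter;
  // rotate, undo the scale, then translate to the sampled center in the image
  float rx = (H[0] * dx + H[1] * dy) / scale;
  float ry = (H[2] * dx + H[3] * dy) / scale;
  *srcX = (int)(rx + centerC);  // assumed column/row offset convention
  *srcY = (int)(ry + centerR);
}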
paddle/cuda/src/hl_table_apply.cu
...
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
                                int ldo,
                                real* table,
                                int ldt,
                                int* ids,
                                int numSamples,
                                int tableSize,
...
@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  while (idy < numSamples) {
    int tableId = ids[idy];
    if ((0 <= tableId) && (tableId < tableSize)) {
      real* out = output + idy * ldo;
      real* tab = table + tableId * ldt;
      for (int i = idx; i < dim; i += blockDimX) {
        if (AddRow) {
          paddle::paddleAtomicAdd(&tab[i], out[i]);
...
@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  }
}

void hl_matrix_select_rows(real* output,
                           int ldo,
                           real* table,
                           int ldt,
                           int* ids,
                           int numSamples,
                           int tableSize,
...
@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
threads
(
128
,
8
);
dim3
threads
(
128
,
8
);
dim3
grid
(
8
,
1
);
dim3
grid
(
8
,
1
);
KeMatrixAddRows
<
128
,
8
,
8
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeMatrixAddRows
<
128
,
8
,
8
,
0
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
output
,
ldo
,
table
,
ldt
,
ids
,
numSamples
,
tableSize
,
dim
);
output
,
ldo
,
table
,
ldt
,
ids
,
numSamples
,
tableSize
,
dim
);
CHECK_SYNC
(
"hl_matrix_select_rows failed"
);
CHECK_SYNC
(
"hl_matrix_select_rows failed"
);
}
}
void
hl_matrix_add_to_rows
(
real
*
table
,
int
ldt
,
void
hl_matrix_add_to_rows
(
real
*
table
,
real
*
input
,
int
ldi
,
int
ldt
,
real
*
input
,
int
ldi
,
int
*
ids
,
int
*
ids
,
int
numSamples
,
int
numSamples
,
int
tableSize
,
int
tableSize
,
...
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
...
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
dim3
threads
(
128
,
8
);
dim3
threads
(
128
,
8
);
dim3
grid
(
8
,
1
);
dim3
grid
(
8
,
1
);
KeMatrixAddRows
<
128
,
8
,
8
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
KeMatrixAddRows
<
128
,
8
,
8
,
1
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
(
input
,
ldi
,
table
,
ldt
,
ids
,
numSamples
,
tableSize
,
dim
);
input
,
ldi
,
table
,
ldt
,
ids
,
numSamples
,
tableSize
,
dim
);
CHECK_SYNC
(
"hl_matrix_add_to_rows failed"
);
CHECK_SYNC
(
"hl_matrix_add_to_rows failed"
);
}
}
template
<
class
T
,
int
blockDimX
,
int
gridDimX
>
template
<
class
T
,
int
blockDimX
,
int
gridDimX
>
__global__
void
KeVectorSelect
(
T
*
dst
,
int
sized
,
__global__
void
KeVectorSelect
(
const
T
*
src
,
int
sizes
,
T
*
dst
,
int
sized
,
const
T
*
src
,
int
sizes
,
const
int
*
ids
,
int
sizei
)
{
const
int
*
ids
,
int
sizei
)
{
int
idx
=
threadIdx
.
x
+
blockDimX
*
blockIdx
.
x
;
int
idx
=
threadIdx
.
x
+
blockDimX
*
blockIdx
.
x
;
while
(
idx
<
sizei
)
{
while
(
idx
<
sizei
)
{
int
index
=
ids
[
idx
];
int
index
=
ids
[
idx
];
...
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
...
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}

template <class T>
void hl_vector_select_from(
    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
  CHECK_NOTNULL(ids);
...
@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
  dim3 threads(512, 1);
  dim3 grid(8, 1);
  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
      dst, sized, src, sizes, ids, sizei);

  CHECK_SYNC("hl_vector_select_from failed");
}

template void hl_vector_select_from(real* dst,
                                    int sized,
                                    const real* src,
                                    int sizes,
                                    const int* ids,
                                    int sizei);
template void hl_vector_select_from(int* dst,
                                    int sized,
                                    const int* src,
                                    int sizes,
                                    const int* ids,
                                    int sizei);
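// Reference sketch (reviewer note): KeMatrixAddRows serves two entry points.
// With AddRow = 0 (hl_matrix_select_rows) it reads rows of `table` selected by
// `ids` into `output`; with AddRow = 1 (hl_matrix_add_to_rows) it scatters the
// rows of the input back into `table` with atomic adds. Only the AddRow branch
// is visible in this hunk, so the non-atomic branch below (accumulating into
// `output`) is an assumption. A single-threaded sketch with `dim` columns per
// row:
void add_rows_ref(float* output, int ldo, float* table, int ldt,
                  const int* ids, int numSamples, int tableSize, int dim,
                  bool addRow) {
  for (int s = 0; s < numSamples; ++s) {
    int id = ids[s];
    if (id < 0 || id >= tableSize) continue;  // out-of-range ids are skipped
    float* out = output + s * ldo;
    float* tab = table + id * ldt;
    for (int i = 0; i < dim; ++i) {
      if (addRow) tab[i] += out[i];  // hl_matrix_add_to_rows
      else        out[i] += tab[i];  // hl_matrix_select_rows (assumed branch)
    }
  }
}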
paddle/cuda/src/hl_top_k.cu
...
@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "hl_base.h"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"

// using namespace hppl;

struct Pair {
  __device__ __forceinline__ Pair() {}

  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}

  __device__ __forceinline__ void set(real value, int id) {
    v_ = value;
    id_ = id;
  }

  __device__ __forceinline__ void operator=(const Pair& in) {
    v_ = in.v_;
    id_ = in.id_;
  }

  __device__ __forceinline__ bool operator<(const real value) const {
    return (v_ < value);
  }

  __device__ __forceinline__ bool operator<(const Pair& in) const {
    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
  }

  __device__ __forceinline__ bool operator>(const Pair& in) const {
    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
  }
...
@@ -58,8 +50,9 @@ struct Pair {
  int id_;
};

__device__ __forceinline__ void addTo(Pair topK[],
                                      const Pair& p,
                                      int beamSize) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
...
@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
  topK[0] = p;
}

template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
  for (int k = beamSize - 2; k >= 0; k--) {
    if (topK[k] < p) {
      topK[k + 1] = topK[k];
...
@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
  topK[0] = p;
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
...
@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < src[idx]) {
      Pair tmp(src[idx], idx);
...
@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(
    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
...
@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
  }
}

template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[],
                                        real* val,
                                        int* col,
                                        int idx,
                                        int dim,
                                        const Pair& max,
                                        int beamSize) {
  while (idx < dim) {
    if (topK[beamSize - 1] < val[idx]) {
      Pair tmp(val[idx], col[idx]);
...
@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* src,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
...
@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(
          topK + maxLength - beam, src, tid, dim, max, length);
    }
  }
...
@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
                                              int& beam,
                                              int beamSize,
                                              real* val,
                                              int* col,
                                              bool& firstStep,
                                              bool& isEmpty,
                                              Pair& max,
                                              int dim,
                                              const int tid) {
  if (beam > 0) {
    int length = beam < beamSize ? beam : beamSize;
    if (firstStep) {
...
@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
      }
    }
    if (!isEmpty) {
      getTopK<blockSize>(
          topK + maxLength - beam, val, col, tid, dim, max, length);
    }
  }
...
@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
  }
}

template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair* shTopK,
                                            int* maxId,
                                            Pair topK[],
                                            real** topVal,
                                            int** topIds,
                                            int& beam,
                                            int& beamSize,
                                            const int tid,
                                            const int warp) {
  while (true) {
    __syncthreads();
    if (tid < blockSize / 2) {
...
@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
      }
    }
    __syncthreads();
    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
      if (tid < stride) {
        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
          maxId[tid] = maxId[tid + stride];
...
@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until get the topK value.
 */
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal,
                             int ldv,
                             int* topIds,
                             real* src,
                             int lds,
                             int dim,
                             int beamSize) {
  __shared__ Pair shTopK[blockSize];
...
@@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
  topVal += blockIdx.x * ldv;
  topIds += blockIdx.x * beamSize;

  Pair topK[maxLength];  // NOLINT
  int beam = maxLength;
  Pair max;
  bool isEmpty = false;
...
@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}
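// Reference sketch (reviewer note): KeMatrixTopK assigns one thread block per
// row of `src` and repeatedly merges per-thread candidate lists until beamSize
// values have been emitted per row. The host-side sketch below only describes
// the result it produces per row (top-k values with their column indices, ties
// broken toward the smaller index, matching the Pair ordering above); it is
// not the kernel itself.
#include <algorithm>
#include <utility>
#include <vector>
void matrix_top_k_ref(float* topVal, int ldv, int* topIds,
                      const float* src, int lds, int dim,
                      int beamSize, int numSamples) {
  for (int r = 0; r < numSamples; ++r) {
    std::vector<std::pair<float, int>> row(dim);
    for (int c = 0; c < dim; ++c) row[c] = {src[r * lds + c], c};
    std::partial_sort(row.begin(), row.begin() + beamSize, row.end(),
                      [](const std::pair<float, int>& a,
                         const std::pair<float, int>& b) {
                        return a.first > b.first ||
                               (a.first == b.first && a.second < b.second);
                      });
    for (int k = 0; k < beamSize; ++k) {
      topVal[r * ldv + k] = row[k].first;
      topIds[r * beamSize + k] = row[k].second;
    }
  }
}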
template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal,
                              int ldv,
                              int* topIds,
                              real* val,
                              int* row,
                              int* col,
...
@@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
  topVal += blockIdx.x * ldv;
  topIds += blockIdx.x * beamSize;

  Pair topK[maxLength];  // NOLINT
  int beam = maxLength;
  Pair max;
  bool isEmpty = false;
...
@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
    topK[k].set(-HL_FLOAT_MAX, -1);
  }
  while (beamSize) {
    threadGetTopK<maxLength, blockSize>(
        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);

    shTopK[tid] = topK[0];
    blockReduce<maxLength, blockSize>(
        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
  }
}

void hl_matrix_top_k(real* topVal,
                     int ldv,
                     int* topIds,
                     real* src,
                     int lds,
                     int dim,
                     int beamSize,
                     int numSamples) {
...
@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, src, lds, dim, beamSize);

  CHECK_SYNC("hl_matrix_top_k failed");
}

void hl_sparse_matrix_top_k(real* topVal,
                            int ldv,
                            int* topIds,
                            hl_sparse_matrix_s src,
                            int beamSize,
                            int numSamples) {
  CHECK_NOTNULL(topVal);
  CHECK_NOTNULL(topIds);
  CHECK_NOTNULL(src);
  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";

  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
    LOG(FATAL) << "parameter src is null!";
  }

  dim3 threads(256, 1);
  dim3 grid(numSamples, 1);
  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);

  CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
...
@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
* 3. go to the second setp, until one thread's topK value is null;
* 3. go to the second setp, until one thread's topK value is null;
* 4. go to the first setp, until get the topK value.
* 4. go to the first setp, until get the topK value.
*/
*/
template
<
int
maxLength
,
int
blockSize
>
template
<
int
maxLength
,
int
blockSize
>
__global__
void
KeMatrixTopKClassificationError
(
real
*
topVal
,
int
ldv
,
__global__
void
KeMatrixTopKClassificationError
(
real
*
topVal
,
int
*
topIds
,
int
ldv
,
real
*
src
,
int
lds
,
int
*
topIds
,
real
*
src
,
int
lds
,
int
dim
,
int
dim
,
int
beamSize
,
int
beamSize
,
int
*
label
,
int
*
label
,
...
@@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
...
@@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
topVal
+=
blockIdx
.
x
*
ldv
;
topVal
+=
blockIdx
.
x
*
ldv
;
topIds
+=
blockIdx
.
x
*
beamSize
;
topIds
+=
blockIdx
.
x
*
beamSize
;
Pair
topK
[
maxLength
];
// NOLINT
Pair
topK
[
maxLength
];
// NOLINT
int
beam
=
maxLength
;
int
beam
=
maxLength
;
Pair
max
;
Pair
max
;
bool
isEmpty
=
false
;
bool
isEmpty
=
false
;
...
@@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
...
@@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
}
}
while
(
beamSize
)
{
while
(
beamSize
)
{
threadGetTopK
<
maxLength
,
blockSize
>
threadGetTopK
<
maxLength
,
blockSize
>
(
(
topK
,
beam
,
beamSize
,
src
,
firstStep
,
isEmpty
,
max
,
dim
,
tid
);
topK
,
beam
,
beamSize
,
src
,
firstStep
,
isEmpty
,
max
,
dim
,
tid
);
shTopK
[
tid
]
=
topK
[
0
];
shTopK
[
tid
]
=
topK
[
0
];
blockReduce
<
maxLength
,
blockSize
>
blockReduce
<
maxLength
,
blockSize
>
(
(
shTopK
,
maxId
,
topK
,
&
topVal
,
&
topIds
,
beam
,
beamSize
,
tid
,
warp
);
shTopK
,
maxId
,
topK
,
&
topVal
,
&
topIds
,
beam
,
beamSize
,
tid
,
warp
);
}
}
__syncthreads
();
__syncthreads
();
if
(
tid
==
0
)
{
if
(
tid
==
0
)
{
for
(
int
i
=
0
;
i
<
topkSize
;
i
++
)
{
for
(
int
i
=
0
;
i
<
topkSize
;
i
++
)
{
if
(
*--
topIds
==
label
[
blockIdx
.
x
])
{
if
(
*--
topIds
==
label
[
blockIdx
.
x
])
{
recResult
[
blockIdx
.
x
]
=
0
;
recResult
[
blockIdx
.
x
]
=
0
;
break
;
break
;
}
}
recResult
[
blockIdx
.
x
]
=
1.0
f
;
recResult
[
blockIdx
.
x
]
=
1.0
f
;
}
}
}
}
}
}
void
hl_matrix_classification_error
(
real
*
topVal
,
int
ldv
,
void
hl_matrix_classification_error
(
real
*
topVal
,
int
*
topIds
,
int
ldv
,
real
*
src
,
int
lds
,
int
*
topIds
,
int
dim
,
real
*
src
,
int
topkSize
,
int
lds
,
int
numSamples
,
int
dim
,
int
*
label
,
int
topkSize
,
real
*
recResult
)
{
int
numSamples
,
int
*
label
,
real
*
recResult
)
{
CHECK_NOTNULL
(
topVal
);
CHECK_NOTNULL
(
topVal
);
CHECK_NOTNULL
(
topIds
);
CHECK_NOTNULL
(
topIds
);
CHECK_NOTNULL
(
src
);
CHECK_NOTNULL
(
src
);
...
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
...
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
dim3
threads
(
256
,
1
);
dim3
threads
(
256
,
1
);
dim3
grid
(
numSamples
,
1
);
dim3
grid
(
numSamples
,
1
);
KeMatrixTopKClassificationError
<
5
,
256
>
KeMatrixTopKClassificationError
<
5
,
256
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
topVal
,
ldv
,
topIds
,
src
,
lds
,
dim
,
topkSize
,
label
,
recResult
);
(
topVal
,
ldv
,
topIds
,
src
,
lds
,
dim
,
topkSize
,
label
,
recResult
);
CHECK_SYNC
(
"hl_matrix_top_k classification error failed"
);
CHECK_SYNC
(
"hl_matrix_top_k classification error failed"
);
}
}
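For orientation, here is a minimal host-side sketch of calling the dense entry point above. It is not part of this commit: it assumes Paddle is built with `real` defined as float, declares the prototype locally instead of including hl_matrix.h, and uses plain CUDA runtime allocation; all buffer names are invented.

#include <cuda_runtime.h>

typedef float real;  // assumption: a float build of Paddle

// Prototype as it appears in this diff (normally pulled in from hl_matrix.h).
void hl_matrix_top_k(real* topVal, int ldv, int* topIds, real* src, int lds,
                     int dim, int beamSize, int numSamples);

void topk_example(const real* h_src, int numSamples, int dim, int beamSize) {
  real *d_src = nullptr, *d_topVal = nullptr;
  int* d_topIds = nullptr;
  cudaMalloc(&d_src, numSamples * dim * sizeof(real));
  cudaMalloc(&d_topVal, numSamples * beamSize * sizeof(real));
  cudaMalloc(&d_topIds, numSamples * beamSize * sizeof(int));
  cudaMemcpy(d_src, h_src, numSamples * dim * sizeof(real),
             cudaMemcpyHostToDevice);

  // ldv/lds are the row strides of the output and input matrices; with
  // densely packed rows they equal beamSize and dim respectively.
  hl_matrix_top_k(d_topVal, beamSize, d_topIds, d_src, dim, dim, beamSize,
                  numSamples);
  // Each row of d_topVal now holds the beamSize largest values of the
  // corresponding row of d_src; d_topIds holds the matching column indices.
}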
paddle/framework/attribute.proto
浏览文件 @
59a8ebc6
...
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";
package paddle.framework;

// Attribute Type for paddle's Op.
// Op contains many attributes. Each type of attributes could be different.
// The AttrType will be shared between AttrDesc and AttrProto.
enum AttrType {
  INT = 0;
  FLOAT = 1;
  STRING = 2;
  INTS = 3;
  FLOATS = 4;
  STRINGS = 5;
}
\ No newline at end of file
paddle/framework/op_desc.proto
浏览文件 @
59a8ebc6
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

syntax = "proto2";
package paddle.framework;

import "attribute.proto";
...
@@ -22,14 +22,14 @@ import "attribute.proto";
//
// e.g, for scale=3.0: name=scala, type=AttrType.FLOAT, value=3.0
message AttrDesc {
  required string name = 1;
  required AttrType type = 2;
  optional int32 i = 3;
  optional float f = 4;
  optional string s = 5;
  repeated int32 ints = 6;
  repeated float floats = 7;
  repeated string strings = 8;
};

// Protocol Message to describe an Operator.
...
@@ -42,15 +42,15 @@ message AttrDesc {
// 3rd-party language can build this proto message and call
// AddOp(const OpDesc& op_desc) of Paddle core to create an Operator.
message OpDesc {
  // input names of this Operator.
  repeated string inputs = 1;

  // output names of this Operator.
  repeated string outputs = 2;

  // type of this Operator, such as "add", "sub", "fc".
  required string type = 3;

  // Attributes of this Operator. e.g., scale=3.0 in cosine op.
  repeated AttrDesc attrs = 4;
};
\ No newline at end of file
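As a rough sketch of how a binding might fill this message (not taken from this commit), the following uses the C++ accessors that protoc generates for op_desc.proto to describe a cosine op with scale=3.0, matching the AttrDesc example in the comment above; the op type, variable names, and the generated-header path are assumptions for illustration.

#include "paddle/framework/op_desc.pb.h"  // assumed path of the generated header

paddle::framework::OpDesc MakeCosineOpDesc() {
  paddle::framework::OpDesc desc;
  desc.set_type("cos_sim");  // illustrative op type
  desc.add_inputs("X");
  desc.add_inputs("Y");
  desc.add_outputs("Out");

  // scale=3.0: the name, AttrType tag and the matching value field must agree.
  paddle::framework::AttrDesc* attr = desc.add_attrs();
  attr->set_name("scale");
  attr->set_type(paddle::framework::FLOAT);
  attr->set_f(3.0f);
  return desc;
}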
paddle/framework/op_proto.proto
浏览文件 @
59a8ebc6
...
@@ -15,10 +15,11 @@ limitations under the License. */
// Protocol Message for 3rd-party language binding.
//
// Paddle Python package will use `OpProto` to generate op creation methods.
// The op creation methods take user's input and generate `OpDesc` proto
// message,
// then pass `OpDesc` to C++ side and create Op pointer.
//
syntax = "proto2";
package paddle.framework;

import "attribute.proto";
...
@@ -26,89 +27,90 @@ import "attribute.proto";
// Attribute protocol message for 3rd-party language binding.
// It will store the Op support what attribute and what type.
message AttrProto {
  // Supported attribute name. e.g. `scale` for cosine op.
  required string name = 1;

  // Supported attribute type.
  required AttrType type = 2;

  // Supported attribute comments. It helps 3rd-party language generate
  // doc-string.
  required string comment = 3;

  // If that attribute is generated, it means the Paddle third language
  // binding has responsibility to fill that attribute. End-User should
  // not set that attribute.
  optional bool generated = 4 [default = false];
}

// Input or output message for 3rd-party language binding.
// It contains parameter name and its comments.
message VarProto {
  // Input or output name in that op creation function.
  // e.g. `cos(a, b, output, ...)`, "a", "b", "output" are names.
  required string name = 1;

  // The comment for that input. It helps 3rd-party language generate
  // doc-string.
  required string comment = 2;

  // Is that input/output could be a list or not.
  // If so, that Op should write a attributed named `input_format` or
  // `output_format`.
  //
  // e.g.
  // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
  // could be multiple, so the multiple of `X` and `W` is True, and OpDesc
  // will hold a attribute of them.
  //
  // The Op desc of same fc could be
  // {
  //     "type": "fc",
  //     "input": ["X1", "X2", "W1", "W2", "b"],
  //     "output": "fc.out",
  //     "attrs" : {
  //         "input_format": [0, 2, 4, 5]
  //     }
  // }
  //
  optional bool multiple = 3 [default = false];

  // It marks that output is a temporary output. That output is not used by
  // user, but used by other op internally as input. If other op is not use
  // that output, it could be optimized early.
  //
  // Attribute temporary_index will be set in OpDesc if there is some
  // outputs are temporary.
  //
  // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
  // attrs = {
  //     "temporary_index": [1]
  // }
  optional bool temporary = 4 [default = false];

  // The gradient of operator can be ignored immediately
  // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
  // can be ignored for the future optimized on graph.
  optional bool ignore_gradient = 6;
}

// Op protocol message for 3rd-party language binding.
// It contains all information for generating op creation method.
message OpProto {
  // The input information to generate op creation method.
  repeated VarProto inputs = 1;

  // The output information to generate op creation method.
  repeated VarProto outputs = 2;

  // The attribute information to generate op creation method.
  repeated AttrProto attrs = 3;

  // The comments for that Op. It helps 3rd-party language generate
  // doc-string. The whole documentation of that Op is generated by comment,
  // inputs, outputs, attrs together.
  required string comment = 4;

  // The type of that Op.
  required string type = 5;
}
paddle/framework/operator.cc
浏览文件 @
59a8ebc6
...
@@ -22,14 +22,14 @@ namespace framework {
template <>
Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
    platform::CPUPlace, Eigen::DefaultDevice>() const {
  return *device_context_->get_eigen_device<Eigen::DefaultDevice>();
}

#ifndef PADDLE_ONLY_CPU
template <>
Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
  return *device_context_->get_eigen_device<Eigen::GpuDevice>();
}
#endif
...
paddle/framework/operator.h
浏览文件 @
59a8ebc6
...
@@ -174,7 +174,11 @@ class OperatorContext {
  template <typename T>
  T* Output(const size_t index) const {
    auto var = OutputVar(index);
    PADDLE_ENFORCE(
        var != nullptr,
        "Output(%d) not be nullptr, which means variable [%s] does not "
        "exist in scope",
        index, op_.outputs_[index]);
    return var->GetMutable<T>();
  }
...
@@ -252,7 +256,7 @@ struct EigenDeviceConverter<platform::GPUPlace> {
class ExecutionContext : public OperatorContext {
 public:
  ExecutionContext(const OperatorBase* op, const Scope& scope,
                   const platform::DeviceContext* device_context)
      : OperatorContext(op, scope), device_context_(device_context) {}

  template <typename PlaceType,
...
@@ -260,9 +264,9 @@ class ExecutionContext : public OperatorContext {
            typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
  DeviceType& GetEigenDevice() const;

  platform::Place GetPlace() const { return device_context_->GetPlace(); }

  const platform::DeviceContext* device_context_;
};

class OpKernel {
...
@@ -311,7 +315,7 @@ class OperatorWithKernel : public OperatorBase {
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
    opKernel->Compute(ExecutionContext(this, scope, &dev_ctx));
  }

  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
...
浏览文件 @
59a8ebc6
...
@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel {
...
@@ -157,22 +157,22 @@ class CPUKernalMultiInputsTest : public OpKernel {
ASSERT_EQ
(
xs
[
2
],
"x2"
);
ASSERT_EQ
(
xs
[
2
],
"x2"
);
auto
inVar0
=
ctx
.
MultiInputVar
(
"xs"
);
auto
inVar0
=
ctx
.
MultiInputVar
(
"xs"
);
ASSERT_EQ
(
inVar0
.
size
(),
3
);
ASSERT_EQ
(
inVar0
.
size
(),
3
U
);
auto
intVar1
=
ctx
.
InputVar
(
"k"
);
auto
intVar1
=
ctx
.
InputVar
(
"k"
);
ASSERT_NE
(
intVar1
,
nullptr
);
ASSERT_NE
(
intVar1
,
nullptr
);
auto
outVar0
=
ctx
.
MultiOutputVar
(
"ys"
);
auto
outVar0
=
ctx
.
MultiOutputVar
(
"ys"
);
ASSERT_EQ
(
outVar0
.
size
(),
2
);
ASSERT_EQ
(
outVar0
.
size
(),
2
U
);
auto
inTensor0
=
ctx
.
MultiInput
<
Tensor
>
(
"xs"
);
auto
inTensor0
=
ctx
.
MultiInput
<
Tensor
>
(
"xs"
);
ASSERT_EQ
(
inTensor0
.
size
(),
3
);
ASSERT_EQ
(
inTensor0
.
size
(),
3
U
);
auto
intTensor1
=
ctx
.
Input
<
Tensor
>
(
"k"
);
auto
intTensor1
=
ctx
.
Input
<
Tensor
>
(
"k"
);
ASSERT_NE
(
intTensor1
,
nullptr
);
ASSERT_NE
(
intTensor1
,
nullptr
);
auto
outTensor0
=
ctx
.
MultiOutput
<
Tensor
>
(
"ys"
);
auto
outTensor0
=
ctx
.
MultiOutput
<
Tensor
>
(
"ys"
);
ASSERT_EQ
(
outTensor0
.
size
(),
2
);
ASSERT_EQ
(
outTensor0
.
size
(),
2
U
);
auto
k
=
ctx
.
op_
.
Input
(
"k"
);
auto
k
=
ctx
.
op_
.
Input
(
"k"
);
ASSERT_EQ
(
k
,
"k0"
);
ASSERT_EQ
(
k
,
"k0"
);
...
...
paddle/function/BlockExpandOpTest.cpp
浏览文件 @
59a8ebc6
...
@@ -18,10 +18,10 @@ limitations under the License. */
namespace paddle {

TEST(BlockExpandForward, real) {
  for (size_t batchSize : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t inputHeight : {5, 33}) {
        for (size_t inputWidth : {5, 32}) {
          for (size_t block : {1, 3, 5}) {
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
...
@@ -61,10 +61,10 @@ TEST(BlockExpandForward, real) {
}

TEST(BlockExpandBackward, real) {
  for (size_t batchSize : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t inputHeight : {5, 33}) {
        for (size_t inputWidth : {5, 32}) {
          for (size_t block : {1, 3, 5}) {
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
...
paddle/function/BufferArgTest.cpp
浏览文件 @
59a8ebc6
...
@@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) {
                      sizeOfValuType(VALUE_TYPE_INT32));
  SequenceIdArg buffer(memory.getBuf(), shape);
  EXPECT_EQ(buffer.data(), memory.getBuf());
  EXPECT_EQ(buffer.numSeqs(), 9U);
}

}  // namespace paddle
paddle/function/ContextProjectionOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "ContextProjectionOp.h"
#include "hl_base.h"

namespace paddle {
...
@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
...
@@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input,
      } else if ((i + context_start) >= (seq_end - seq_start)) {
        if (padding) {
          value =
              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
                         input_dim +
                     idx];
        } else {
          continue;
        }
...
@@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input,
      int outx = (i - context_length) < 0 ? i : (context_length - 1);
      int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
      real* output_r =
          output + outy * input_dim * context_length + outx * input_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        output_r[idx] += value;
        if (j - outy == outx) break;
...
@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
  dim3 grid(blocks_x, blocks_y);

  if (weight) {
    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input, sequence, weight, output, input_dim,
        context_length, context_start, begin_pad);
  } else {
    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
        input, sequence, weight, output, input_dim,
        context_length, context_start, begin_pad);
  }
  CHECK_SYNC("hl_context_projection_forward failed");
}
...
@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
  int block_size = blockDim.x;
  int sequenceId = blockIdx.x;
  int seq_start = sequence[sequenceId];
  int seq_end = sequence[sequenceId + 1];
  real value = 0;

  int instances = seq_end - seq_start + context_length - 1;
...
@@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
      int outx = (i - context_length) < 0 ? i : (context_length - 1);
      int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
      real* output_r =
          out + outy * input_dim * context_length + outx * input_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        value += output_r[idx];
        if (j - outy == outx) break;
...
@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
  int blocks_y = 1;
  dim3 threads(block_size, 1);
  dim3 grid(blocks_x, blocks_y);
  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, input_grad, input_dim, context_length, context_start);
  CHECK_SYNC("hl_context_projection_backward_data failed");
}
...
@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                     context_start);
}

template <int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                  const int* sequence,
                                                  real* w_grad,
...
@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  if (weight_idx < w_dim) {
    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
      int seq_start = sequence[seqId];
      int seq_end = sequence[seqId + 1];
      output_r =
          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;

      if (context_start < 0) {
        if (padId + context_start < 0) {
          instanceId = padId;
        } else {
          // begin_pad > 0;
          instanceId =
              (padId - begin_pad) + (seq_end - seq_start) - context_start;
        }
      } else {
        if (padId + (seq_end - seq_start) < context_start) {
...
@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
        }
      }

      int outx =
          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
      int outy = (instanceId - context_length) < 0
                     ? 0
                     : (instanceId - (context_length - 1));
      output_r += outy * w_dim * context_length + outx * w_dim;
      for (int j = outy; j < seq_end - seq_start; j++) {
        value += output_r[weight_idx];
...
@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
  }
  __syncthreads();

  for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
    if (idy < stride) {
      sum_s[idy][idx] += sum_s[idy + stride][idx];
    }
...
@@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad,
  dim3 threads(threads_x, threads_y);
  dim3 grid(blocks_x, 1);

  KeContextProjectionBackwardWeight<32,
                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
      out_grad, sequence, w_grad, num_sequences, w_dim,
      context_length, context_start, begin_pad);
  CHECK_SYNC("hl_context_projection_backward_weight failed");
}

template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
    const GpuMatrix& out_grad,
    GpuMatrix& w_grad,
    const GpuIVector& seq_vec,
    size_t context_length,
    int context_start,
    size_t total_pad,
    size_t begin_pad) {
  hl_context_projection_backward_weight(out_grad.getData(),
                                        seq_vec.getData(),
                                        w_grad.getData(),
...
@@ -376,23 +395,18 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                size_t begin_pad,
                                                bool is_padding,
                                                size_t total_pad) {
  if (in_grad) {
    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
        out_grad, in_grad, sequence, context_length, context_start);
  }
  if (is_padding && w_grad) {
    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
                                                     w_grad,
                                                     sequence,
                                                     context_length,
                                                     context_start,
                                                     total_pad,
                                                     begin_pad);
  }
}
...
paddle/function/CosSimOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CosSimOp.h"
#include "hl_base.h"
#include "hl_device_functions.cuh"

namespace paddle {

template <int block_size>
__global__ void KeCosSim(real* output,
                         const real* input1,
                         const real* input2,
...
@@ -78,8 +78,8 @@ void hlCossim(real* output,
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);

  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      output, input1, input2, width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossim failed");
}
...
@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
}

template <int block_size>
__global__ void KeCosSimDerivative(const real* grad,
                                   const real* output,
                                   const real* prev_out_x,
...
@@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad,
  if (xy[0] == 0) {
    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
      if (input2_height > 1) {
        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
      } else {
        paddle::paddleAtomicAdd(
            prev_grad_y + index,
            scale * grad[ty] * prev_out_x[index] * reciprocal);
      }
    }
  } else {
...
@@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad,
    real reciprocalSquareSumX = 1.0 / xx[0];
    real reciprocalSquareSumY = 1.0 / yy[0];
    for (int index = tid; index < width; index += block_size) {
      prev_grad_x[index] +=
          output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
                                   prev_out_x[index] * reciprocalSquareSumX);
      if (input2_height > 1) {
        prev_grad_y[index] +=
            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                     prev_out_y[index] * reciprocalSquareSumY);
      } else {
        paddle::paddleAtomicAdd(
            prev_grad_y + index,
            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
                                     prev_out_y[index] * reciprocalSquareSumY));
      }
    }
  }
...
@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
  const int block_size = 256;
  dim3 threads(block_size, 1);
  dim3 grid(1, input1_height);
  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
      grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y,
      width, input1_height, input2_height, scale);
  CHECK_SYNC("hlCossimDerivate failed");
}
...
@@ -214,9 +222,9 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                     real scale) {
  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
      << "Matrix types are not equally GPU";

  size_t dim = in1_val.getWidth();
  const real* grad = out_grad.getData();
...
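For reference, a plain-CPU restatement of what the forward path above computes for one row pair. This is a sketch inferred from the xx/yy/xy terms used by the derivative kernel, under the assumption that `scale` multiplies the ratio; it is not code from this commit.

#include <cmath>
#include <vector>

// Row-wise cosine similarity, scaled:
//   out = scale * dot(x, y) / (sqrt(dot(x, x)) * sqrt(dot(y, y)))
float CosSimRow(const std::vector<float>& x, const std::vector<float>& y,
                float scale) {
  float xy = 0, xx = 0, yy = 0;
  for (size_t i = 0; i < x.size(); ++i) {
    xy += x[i] * y[i];
    xx += x[i] * x[i];
    yy += y[i] * y[i];
  }
  return scale * xy / (std::sqrt(xx) * std::sqrt(yy));
}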
paddle/function/CropOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CropOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCrop(real* outputs,
                       const real* inputs,
                       int inC,
                       int inH,
                       int inW,
                       int cropC,
                       int cropH,
                       int cropW,
                       int outC,
                       int outH,
                       int outW,
                       int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % outW;
...
@@ -35,12 +43,12 @@ __global__ void KeCrop(real* outputs, const real* inputs,
template <>
void Crop<DEVICE_TYPE_GPU>(real* outputs,
                           const real* inputs,
                           const TensorShape inShape,
                           const TensorShape outShape,
                           const FuncConfig& conf) {
  std::vector<uint32_t> crop_corner =
      conf.get<std::vector<uint32_t>>("crop_corner");
  int cropC = crop_corner[1];
  int cropH = crop_corner[2];
  int cropW = crop_corner[3];
...
@@ -58,16 +66,33 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;
  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      outputs, inputs, inC, inH, inW, cropC, cropH, cropW,
      outC, outH, outW, nth);
  CHECK_SYNC("Crop");
}

__global__ void KeCropDiff(const real* inGrad,
                           real* outGrad,
                           int inC,
                           int inH,
                           int inW,
                           int cropC,
                           int cropH,
                           int cropW,
                           int outC,
                           int outH,
                           int outW,
                           int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
...
@@ -84,12 +109,12 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad,
template <>
void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
                               real* outGrad,
                               const TensorShape inShape,
                               const TensorShape outShape,
                               const FuncConfig& conf) {
  std::vector<uint32_t> crop_corner =
      conf.get<std::vector<uint32_t>>("crop_corner");
  int cropC = crop_corner[1];
  int cropH = crop_corner[2];
  int cropW = crop_corner[3];
...
@@ -107,9 +132,18 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
  int blockSize = 1024;
  int gridSize = (nth + blockSize - 1) / blockSize;
  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW,
      outC, outH, outW, nth);
  CHECK_SYNC("CropGrad");
}
...
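The index arithmetic in KeCrop is easier to read in serial form. Below is a rough CPU equivalent for a single sample in CHW layout, inferred from the parameter names only (the kernel body is elided in this hunk, so the exact mapping is an assumption, and the function name is invented): each output element (c, h, w) is copied from the input at (c + cropC, h + cropH, w + cropW).

// Serial sketch of the crop mapping, single sample, CHW layout.
void CropCpu(float* out, const float* in, int inH, int inW, int cropC,
             int cropH, int cropW, int outC, int outH, int outW) {
  for (int c = 0; c < outC; ++c)
    for (int h = 0; h < outH; ++h)
      for (int w = 0; w < outW; ++w)
        out[(c * outH + h) * outW + w] =
            in[((c + cropC) * inH + (h + cropH)) * inW + (w + cropW)];
}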
paddle/function/CrossMapNormalOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "CrossMapNormalOp.h"
#include "hl_base.h"

namespace paddle {

__global__ void KeCMRNormFillScale(size_t imageSize,
                                   const real* in,
                                   real* scale,
                                   size_t channels,
                                   size_t height,
                                   size_t width,
                                   size_t size,
                                   real alpha) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {
...
@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
  }
}

__global__ void KeCMRNormOutput(size_t inputSize,
                                const real* in,
                                const real* scale,
                                real negative_beta,
                                real* out) {
  const int index = threadIdx.x + blockIdx.x * blockDim.x;
  if (index < inputSize) {
...
@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
  size_t imageSize = numSamples * height * width;
  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      imageSize, inputs, denoms, channels, height, width, size, scale);

  size_t inputSize = numSamples * height * width * channels;
  blockSize = 1024;
  gridSize = (inputSize + 1024 - 1) / 1024;
  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inputSize, inputs, denoms, -pow, outputs);

  CHECK_SYNC("CrossMapNormal");
}

__global__ void KeCMRNormDiff(size_t imageSize,
                              const real* bottom_data,
                              const real* top_data,
                              const real* scale,
                              const real* top_diff,
                              size_t channels,
                              size_t height,
                              size_t width,
                              size_t size,
                              real negative_beta,
                              real cache_ratio,
                              real* bottom_diff) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < imageSize) {
    const int w = idx % width;
...
@@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
    while (index < channels + post_pad) {
      if (index < channels) {
        accum += top_diff[index * step] * top_data[index * step] /
                 scale[index * step];
      }
      if (index >= size) {
        accum -= top_diff[(index - size) * step] *
                 top_data[(index - size) * step] / scale[(index - size) * step];
      }
      if (index >= post_pad) {
        bottom_diff[(index - post_pad) * step] +=
            top_diff[(index - post_pad) * step] *
                pow(scale[(index - post_pad) * step], negative_beta) -
            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
      }
      ++index;
    }
...
@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
  int blockSize = 1024;
  int gridSize = (imageSize + 1024 - 1) / 1024;
  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels,
      height, width, size, -pow, 2.0f * pow * scale, inputsGrad);
  CHECK_SYNC("CrossMapNormalGrad");
}
...
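To connect the two forward kernels above: KeCMRNormFillScale writes the per-element denominator into `denoms`, and KeCMRNormOutput applies it with the negative exponent (-pow). A scalar sketch of the usual cross-map (local response) normalization they implement, under the assumption that the denominator is 1 + alpha times the windowed sum of squares and that the channel values are passed contiguously here for simplicity:

#include <cmath>

// For one spatial position (n, h, w) and channel c, with a window of `size`
// channels around c:
//   scale(c) = 1 + alpha * sum_{k in window} in(k)^2
//   out(c)   = in(c) * pow(scale(c), -beta)
float CrossMapNormalizeOne(const float* in, int channels, int c, int size,
                           float alpha, float beta) {
  int start = c - size / 2;
  float sum = 0;
  for (int k = 0; k < size; ++k) {
    int ch = start + k;
    if (ch >= 0 && ch < channels) sum += in[ch] * in[ch];
  }
  float scale = 1.0f + alpha * sum;
  return in[c] * std::pow(scale, -beta);
}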
paddle/function/CrossMapNormalOpTest.cpp
浏览文件 @
59a8ebc6
...
@@ -18,11 +18,11 @@ limitations under the License. */
namespace paddle {

TEST(CrossMapNormal, real) {
  for (size_t numSamples : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t imgSizeH : {5, 33}) {
        for (size_t imgSizeW : {5, 32}) {
          for (size_t size : {1, 3}) {
            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                    << " size=" << size;
...
@@ -48,11 +48,11 @@ TEST(CrossMapNormal, real) {
}

TEST(CrossMapNormalGrad, real) {
  for (size_t numSamples : {5}) {
    for (size_t channels : {1, 5}) {
      for (size_t imgSizeH : {5, 33}) {
        for (size_t imgSizeW : {5, 32}) {
          for (size_t size : {1, 3}) {
            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                    << " size=" << size;
...
paddle/function/DepthwiseConvOpGpu.cu
浏览文件 @
59a8ebc6
...
@@ -20,17 +20,25 @@ namespace paddle {
// CUDA kernel to compute the depthwise convolution forward pass
template <class T>
__global__ void ConvolutionDepthwiseForward(const int nthreads,
                                            const T* const inputData,
                                            const T* const filterData,
                                            const int batchSize,
                                            const int outputChannels,
                                            const int outputHeight,
                                            const int outputWidth,
                                            const int inputChannels,
                                            const int inputHeight,
                                            const int inputWidth,
                                            const int filterMultiplier,
                                            const int filterHeight,
                                            const int filterWidth,
                                            const int strideH,
                                            const int strideW,
                                            const int paddingH,
                                            const int paddingW,
                                            T* const outputData) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if (index < nthreads) {
    const int batch = index / outputChannels / outputHeight / outputWidth;
...
@@ -45,32 +53,36 @@ void ConvolutionDepthwiseForward(const int nthreads,
    const int w_in_start = -paddingW + w_out * strideW;
    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
        (w_in_end < inputWidth)) {
      for (int kh = 0; kh < filterHeight; ++kh) {
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          const int offset =
              ((batch * inputChannels + c_in) * inputHeight + h_in) *
                  inputWidth +
              w_in;
          value += (*weight) * inputData[offset];
          ++weight;
        }
      }
    } else {
      for (int kh = 0; kh < filterHeight; ++kh) {
        for (int kw = 0; kw < filterWidth; ++kw) {
          const int h_in = -paddingH + h_out * strideH + kh;
          const int w_in = -paddingW + w_out * strideW + kw;
          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
              (w_in < inputWidth)) {
            const int offset =
                ((batch * inputChannels + c_in) * inputHeight + h_in) *
                    inputWidth +
                w_in;
            value += (*weight) * inputData[offset];
          }
          ++weight;
        }
      }
    }
    outputData[index] = value;
  }
...
@@ -78,16 +90,25 @@ void ConvolutionDepthwiseForward(const int nthreads,
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <class T>
__global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
                                                  const T* const top_diff,
                                                  const T* const weight_data,
                                                  const int num,
                                                  const int outputChannels,
                                                  const int outputHeight,
                                                  const int outputWidth,
                                                  const int inputChannels,
                                                  const int inputHeight,
                                                  const int inputWidth,
                                                  const int filterMultiplier,
                                                  const int filterHeight,
                                                  const int filterWidth,
                                                  const int strideH,
                                                  const int strideW,
                                                  const int paddingH,
                                                  const int paddingW,
                                                  T* const bottom_diff) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    const int batch = index / inputChannels / inputHeight / inputWidth;
    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
...
@@ -96,65 +117,80 @@ void ConvolutionDepthwiseInputBackward(const int nthreads,
    const int c_out_start = c_in * filterMultiplier;

    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
    h_out_start = 0 > h_out_start ? 0 : h_out_start;
    int h_out_end = (h_in + paddingH) / strideH;
    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
    w_out_start = 0 > w_out_start ? 0 : w_out_start;
    int w_out_end = (w_in + paddingW) / strideW;
    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;

    T value = 0;

    for
(
int
c_out
=
c_out_start
;
for
(
int
c_out
=
c_out_start
;
c_out
<
c_out_start
+
filterMultiplier
;
c_out
<
c_out_start
+
filterMultiplier
;
c_out
++
)
{
c_out
++
)
{
for
(
int
h_out
=
h_out_start
;
h_out
<=
h_out_end
;
++
h_out
)
{
for
(
int
h_out
=
h_out_start
;
h_out
<=
h_out_end
;
++
h_out
)
{
const
int
filter_h
=
h_in
+
paddingH
-
h_out
*
strideH
;
const
int
filter_h
=
h_in
+
paddingH
-
h_out
*
strideH
;
for
(
int
w_out
=
w_out_start
;
w_out
<=
w_out_end
;
++
w_out
)
{
for
(
int
w_out
=
w_out_start
;
w_out
<=
w_out_end
;
++
w_out
)
{
const
int
filter_w
=
w_in
+
paddingW
-
w_out
*
strideW
;
const
int
filter_w
=
w_in
+
paddingW
-
w_out
*
strideW
;
const
int
filter_offset
=
c_out
*
filterHeight
*
filterWidth
const
int
filter_offset
=
c_out
*
filterHeight
*
filterWidth
+
+
filter_h
*
filterWidth
+
filter_w
;
filter_h
*
filterWidth
+
filter_w
;
const
int
top_diff_offset
=
((
batch
*
outputChannels
+
c_out
)
*
const
int
top_diff_offset
=
outputHeight
+
h_out
)
*
outputWidth
+
w_out
;
((
batch
*
outputChannels
+
c_out
)
*
outputHeight
+
h_out
)
*
value
+=
top_diff
[
top_diff_offset
]
*
weight_data
[
filter_offset
];
outputWidth
+
}
w_out
;
value
+=
top_diff
[
top_diff_offset
]
*
weight_data
[
filter_offset
];
}
}
}
}
}
bottom_diff
[
index
]
+=
value
;
bottom_diff
[
index
]
+=
value
;
}
}
}
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
template
<
class
T
>
template
<
class
T
>
__global__
__global__
void
ConvolutionDepthwiseFilterBackward
(
const
int
num_i
,
void
ConvolutionDepthwiseFilterBackward
(
const
int
num_i
,
const
int
nthreads
,
const
int
nthreads
,
const
T
*
const
top_diff
,
const
T
*
const
inputData
,
const
T
*
const
top_diff
,
const
int
num
,
const
int
outputChannels
,
const
int
outputHeight
,
const
T
*
const
inputData
,
const
int
outputWidth
,
const
int
inputChannels
,
const
int
inputHeight
,
const
int
num
,
const
int
inputWidth
,
const
int
filterMultiplier
,
const
int
filterHeight
,
const
int
outputChannels
,
const
int
filterWidth
,
const
int
strideH
,
const
int
strideW
,
const
int
outputHeight
,
const
int
paddingH
,
const
int
paddingW
,
T
*
const
buffer_data
)
{
const
int
outputWidth
,
int
index
=
const
int
inputChannels
,
(
blockIdx
.
x
*
gridDim
.
y
+
blockIdx
.
y
)
*
blockDim
.
x
+
threadIdx
.
x
;
const
int
inputHeight
,
const
int
inputWidth
,
const
int
filterMultiplier
,
const
int
filterHeight
,
const
int
filterWidth
,
const
int
strideH
,
const
int
strideW
,
const
int
paddingH
,
const
int
paddingW
,
T
*
const
buffer_data
)
{
int
index
=
(
blockIdx
.
x
*
gridDim
.
y
+
blockIdx
.
y
)
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
index
<
nthreads
)
{
if
(
index
<
nthreads
)
{
const
int
h_out
=
(
index
/
outputWidth
)
%
outputHeight
;
const
int
h_out
=
(
index
/
outputWidth
)
%
outputHeight
;
const
int
w_out
=
index
%
outputWidth
;
const
int
w_out
=
index
%
outputWidth
;
const
int
kh
=
(
index
/
filterWidth
/
outputHeight
/
outputWidth
)
const
int
kh
=
%
filterHeight
;
(
index
/
filterWidth
/
outputHeight
/
outputWidth
)
%
filterHeight
;
const
int
kw
=
(
index
/
outputHeight
/
outputWidth
)
%
filterWidth
;
const
int
kw
=
(
index
/
outputHeight
/
outputWidth
)
%
filterWidth
;
const
int
h_in
=
-
paddingH
+
h_out
*
strideH
+
kh
;
const
int
h_in
=
-
paddingH
+
h_out
*
strideH
+
kh
;
const
int
w_in
=
-
paddingW
+
w_out
*
strideW
+
kw
;
const
int
w_in
=
-
paddingW
+
w_out
*
strideW
+
kw
;
if
((
h_in
>=
0
)
&&
(
h_in
<
inputHeight
)
if
((
h_in
>=
0
)
&&
(
h_in
<
inputHeight
)
&&
(
w_in
>=
0
)
&&
&&
(
w_in
>=
0
)
&&
(
w_in
<
inputWidth
))
{
(
w_in
<
inputWidth
))
{
const
int
c_out
=
index
/
const
int
c_out
=
(
filterHeight
*
filterWidth
*
outputHeight
*
outputWidth
);
index
/
(
filterHeight
*
filterWidth
*
outputHeight
*
outputWidth
);
const
int
c_in
=
c_out
/
filterMultiplier
;
const
int
c_in
=
c_out
/
filterMultiplier
;
const
int
batch
=
num_i
;
const
int
batch
=
num_i
;
const
int
top_offset
=
((
batch
*
outputChannels
+
c_out
)
*
const
int
top_offset
=
outputHeight
+
h_out
)
*
outputWidth
+
w_out
;
((
batch
*
outputChannels
+
c_out
)
*
outputHeight
+
h_out
)
*
const
int
bottom_offset
=
((
batch
*
inputChannels
+
c_in
)
outputWidth
+
*
inputHeight
+
h_in
)
*
inputWidth
+
w_in
;
w_out
;
const
int
bottom_offset
=
((
batch
*
inputChannels
+
c_in
)
*
inputHeight
+
h_in
)
*
inputWidth
+
w_in
;
buffer_data
[
index
]
=
top_diff
[
top_offset
]
*
inputData
[
bottom_offset
];
buffer_data
[
index
]
=
top_diff
[
top_offset
]
*
inputData
[
bottom_offset
];
}
else
{
}
else
{
buffer_data
[
index
]
=
0
;
buffer_data
[
index
]
=
0
;
...
@@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
...
@@ -163,170 +199,169 @@ void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
}
}
template
<
class
T
>
template
<
class
T
>
class
DepthwiseConvFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
class
DepthwiseConvFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
public:
public:
void
operator
()(
const
T
*
inputData
,
void
operator
()(
const
T
*
inputData
,
const
T
*
filterData
,
const
T
*
filterData
,
int
batchSize
,
int
batchSize
,
int
outputChannels
,
int
outputChannels
,
int
outputHeight
,
int
outputHeight
,
int
outputWidth
,
int
outputWidth
,
int
inputChannels
,
int
inputChannels
,
int
inputHeight
,
int
inputHeight
,
int
inputWidth
,
int
inputWidth
,
int
filterMultiplier
,
int
filterMultiplier
,
int
filterHeight
,
int
filterHeight
,
int
filterWidth
,
int
filterWidth
,
int
strideH
,
int
strideH
,
int
strideW
,
int
strideW
,
int
paddingH
,
int
paddingH
,
int
paddingW
,
int
paddingW
,
T
*
outputData
)
{
T
*
outputData
)
{
int
outputSize
=
batchSize
*
outputChannels
*
outputHeight
*
outputWidth
;
int
outputSize
=
batchSize
*
outputChannels
*
outputHeight
*
outputWidth
;
size_t
blocks
=
(
outputSize
+
1024
-
1
)
/
1024
;
size_t
blocks
=
(
outputSize
+
1024
-
1
)
/
1024
;
size_t
blockX
=
512
;
size_t
blockX
=
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
dim3
threads
(
1024
,
1
);
dim3
threads
(
1024
,
1
);
dim3
grid
(
blockX
,
blockY
);
dim3
grid
(
blockX
,
blockY
);
ConvolutionDepthwiseForward
<
T
>
ConvolutionDepthwiseForward
<
T
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
outputSize
,
outputSize
,
inputData
,
inputData
,
filterData
,
filterData
,
batchSize
,
batchSize
,
outputChannels
,
outputChannels
,
outputHeight
,
outputHeight
,
outputWidth
,
outputWidth
,
inputChannels
,
inputChannels
,
inputHeight
,
inputHeight
,
inputWidth
,
inputWidth
,
filterMultiplier
,
filterMultiplier
,
filterHeight
,
filterHeight
,
filterWidth
,
filterWidth
,
strideH
,
strideH
,
strideW
,
strideW
,
paddingH
,
paddingH
,
paddingW
,
paddingW
,
outputData
);
outputData
);
}
}
};
};
template
<
class
T
>
template
<
class
T
>
class
DepthwiseConvGradInputFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
class
DepthwiseConvGradInputFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
public:
public:
void
operator
()(
const
T
*
outputGrad
,
void
operator
()(
const
T
*
outputGrad
,
const
T
*
filterData
,
const
T
*
filterData
,
int
batchSize
,
int
batchSize
,
int
outputChannels
,
int
outputChannels
,
int
outputHeight
,
int
outputHeight
,
int
outputWidth
,
int
outputWidth
,
int
inputChannels
,
int
inputChannels
,
int
inputHeight
,
int
inputHeight
,
int
inputWidth
,
int
inputWidth
,
int
filterMultiplier
,
int
filterMultiplier
,
int
filterHeight
,
int
filterHeight
,
int
filterWidth
,
int
filterWidth
,
int
strideH
,
int
strideH
,
int
strideW
,
int
strideW
,
int
paddingH
,
int
paddingH
,
int
paddingW
,
int
paddingW
,
T
*
inputGrad
)
{
T
*
inputGrad
)
{
int
inputSize
=
batchSize
*
inputChannels
*
inputHeight
*
inputWidth
;
int
inputSize
=
batchSize
*
inputChannels
*
inputHeight
*
inputWidth
;
size_t
blocks
=
(
inputSize
+
1024
-
1
)
/
1024
;
size_t
blocks
=
(
inputSize
+
1024
-
1
)
/
1024
;
size_t
blockX
=
512
;
size_t
blockX
=
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
dim3
threads
(
1024
,
1
);
dim3
threads
(
1024
,
1
);
dim3
grid
(
blockX
,
blockY
);
dim3
grid
(
blockX
,
blockY
);
ConvolutionDepthwiseInputBackward
<
T
>
ConvolutionDepthwiseInputBackward
<
T
>
// NOLINT_NEXT_LINE(whitespace/operators)
// NOLINT_NEXT_LINE(whitespace/operators)
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
inputSize
,
inputSize
,
outputGrad
,
outputGrad
,
filterData
,
filterData
,
batchSize
,
batchSize
,
outputChannels
,
outputChannels
,
outputHeight
,
outputHeight
,
outputWidth
,
outputWidth
,
inputChannels
,
inputChannels
,
inputHeight
,
inputHeight
,
inputWidth
,
inputWidth
,
filterMultiplier
,
filterMultiplier
,
filterHeight
,
filterHeight
,
filterWidth
,
filterWidth
,
strideH
,
strideH
,
strideW
,
strideW
,
paddingH
,
paddingH
,
paddingW
,
paddingW
,
inputGrad
);
inputGrad
);
}
}
};
};
template
<
class
T
>
template
<
class
T
>
class
DepthwiseConvGradFilterFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
class
DepthwiseConvGradFilterFunctor
<
DEVICE_TYPE_GPU
,
T
>
{
public:
public:
void
operator
()(
const
T
*
outputGrad
,
void
operator
()(
const
T
*
outputGrad
,
const
T
*
inputData
,
const
T
*
inputData
,
int
batchSize
,
int
batchSize
,
int
outputChannels
,
int
outputChannels
,
int
outputHeight
,
int
outputHeight
,
int
outputWidth
,
int
outputWidth
,
int
inputChannels
,
int
inputChannels
,
int
inputHeight
,
int
inputHeight
,
int
inputWidth
,
int
inputWidth
,
int
filterMultiplier
,
int
filterMultiplier
,
int
filterHeight
,
int
filterHeight
,
int
filterWidth
,
int
filterWidth
,
int
strideH
,
int
strideH
,
int
strideW
,
int
strideW
,
int
paddingH
,
int
paddingH
,
int
paddingW
,
int
paddingW
,
T
*
colData
,
T
*
colData
,
T
*
filterGrad
)
{
T
*
filterGrad
)
{
int
colDataSize
=
outputChannels
*
filterHeight
*
filterWidth
int
colDataSize
=
outputChannels
*
filterHeight
*
filterWidth
*
*
outputHeight
*
outputWidth
;
outputHeight
*
outputWidth
;
size_t
blocks
=
(
colDataSize
+
1024
-
1
)
/
1024
;
size_t
blocks
=
(
colDataSize
+
1024
-
1
)
/
1024
;
size_t
blockX
=
512
;
size_t
blockX
=
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
size_t
blockY
=
(
blocks
+
512
-
1
)
/
512
;
dim3
threads
(
1024
,
1
);
dim3
threads
(
1024
,
1
);
dim3
grid
(
blockX
,
blockY
);
dim3
grid
(
blockX
,
blockY
);
BaseMatrix
filterGradMatrix
(
outputChannels
*
filterHeight
*
filterWidth
,
BaseMatrix
filterGradMatrix
(
outputChannels
*
filterHeight
*
filterWidth
,
1
,
filterGrad
,
false
,
true
);
1
,
filterGrad
,
false
,
true
);
for
(
int
i
=
0
;
i
<
batchSize
;
i
++
)
{
for
(
int
i
=
0
;
i
<
batchSize
;
i
++
)
{
ConvolutionDepthwiseFilterBackward
<
T
>
ConvolutionDepthwiseFilterBackward
<
<<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
T
><<<
grid
,
threads
,
0
,
STREAM_DEFAULT
>>>
(
i
,
i
,
colDataSize
,
colDataSize
,
outputGrad
,
outputGrad
,
inputData
,
inputData
,
batchSize
,
batchSize
,
outputChannels
,
outputChannels
,
outputHeight
,
outputHeight
,
outputWidth
,
outputWidth
,
inputChannels
,
inputChannels
,
inputHeight
,
inputHeight
,
inputWidth
,
inputWidth
,
filterMultiplier
,
filterMultiplier
,
filterHeight
,
filterHeight
,
filterWidth
,
filterWidth
,
strideH
,
strideH
,
strideW
,
strideW
,
paddingH
,
paddingH
,
paddingW
,
paddingW
,
colData
);
colData
);
int
K
=
outputHeight
*
outputWidth
;
int
K
=
outputHeight
*
outputWidth
;
int
M
=
colDataSize
/
K
;
int
M
=
colDataSize
/
K
;
BaseMatrix
colMatrix
(
M
,
K
,
colData
,
false
,
true
);
BaseMatrix
colMatrix
(
M
,
K
,
colData
,
false
,
true
);
filterGradMatrix
.
sumRows
(
colMatrix
,
(
T
)
1.0
,
(
T
)
1.0
);
filterGradMatrix
.
sumRows
(
colMatrix
,
(
T
)
1.0
,
(
T
)
1.0
);
}
}
}
}
};
};
#ifdef PADDLE_TYPE_DOUBLE
#ifdef PADDLE_TYPE_DOUBLE
...
...
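Note on the launch configuration used by all three functors above: the flat element count is spread over a two-dimensional grid (blockX x blockY blocks of 1024 threads), and each thread recovers its logical position with (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x. The sketch below, which is not part of the commit and uses a hypothetical helper name, only illustrates how that flat index decomposes back into NCHW coordinates for the forward kernel:

// Host-side illustration of the index decomposition assumed by
// ConvolutionDepthwiseForward (NCHW layout, w fastest-varying).
#include <cstdio>

void decodeOutputIndex(int index, int outputChannels, int outputHeight, int outputWidth) {
  int w_out = index % outputWidth;
  int h_out = (index / outputWidth) % outputHeight;
  int c_out = (index / outputWidth / outputHeight) % outputChannels;
  int batch = index / outputWidth / outputHeight / outputChannels;
  std::printf("n=%d c=%d h=%d w=%d\n", batch, c_out, h_out, w_out);
}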
paddle/function/FunctionTest.cpp  View file @ 59a8ebc6
...
@@ -24,14 +24,14 @@ void FunctionApi(typename Tensor<real, DType>::Matrix& output,
 template <>
 void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 100);
-  EXPECT_EQ(output.getWidth(), 200);
+  EXPECT_EQ(output.getHeight(), 100U);
+  EXPECT_EQ(output.getWidth(), 200U);
 }

 template <>
 void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 10);
-  EXPECT_EQ(output.getWidth(), 20);
+  EXPECT_EQ(output.getHeight(), 10U);
+  EXPECT_EQ(output.getWidth(), 20U);
 }

 template <DeviceType DType>
...
@@ -85,14 +85,14 @@ void testBufferArgs(const BufferArgs& inputs,
 }

 void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
-  EXPECT_EQ(inputs.size(), 1);
+  EXPECT_EQ(inputs.size(), 1U);
   check(inputs[0]);
 }

 TEST(Arguments, Matrix) {
   MatrixPtr matrix = Matrix::create(100, 200);
   CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape().ndims(), 2U);
     EXPECT_EQ(arg.shape()[0], 100);
     EXPECT_EQ(arg.shape()[1], 200);
     EXPECT_EQ(arg.data(), matrix->getData());
...
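The only change in these tests is the unsigned suffix on the expected literals. EXPECT_EQ compares its two arguments with operator==, and because getHeight(), getWidth(), size() and ndims() return unsigned types, comparing them against a plain int literal trips -Wsign-compare on strict builds. A minimal standalone sketch of the pattern (not Paddle code, just an illustration):

#include <gtest/gtest.h>
#include <vector>

TEST(SignCompare, UnsignedLiteral) {
  std::vector<int> v(100);
  // v.size() is size_t; the U suffix keeps both operands unsigned,
  // so the comparison compiles cleanly when warnings are treated as errors.
  EXPECT_EQ(v.size(), 100U);
}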
paddle/function/Im2ColOpGpu.cu  View file @ 59a8ebc6
...
@@ -17,16 +17,21 @@ limitations under the License. */
namespace paddle {

template <class T>
__global__ void im2col(const T* data_im, int numOuts, int height, int width,
                       int blockH, int blockW, int strideH, int strideW,
                       int paddingH, int paddingW, int height_col,
                       int width_col, T* data_col) {
  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < numOuts) {
    int w_out = index % width_col;
    index /= width_col;
...
@@ -39,17 +44,17 @@ void im2col(const T* data_im, int numOuts, int height, int width,
    data_col += (channel_out * height_col + h_out) * width_col + w_out;
    for (int i = 0; i < blockH; ++i) {
      for (int j = 0; j < blockW; ++j) {
        int rIdx = int(h_in + i);
        int cIdx = int(w_in + j);
        if ((rIdx - (int)paddingH) >= (int)height ||
            (rIdx - (int)paddingH) < 0 ||
            (cIdx - (int)paddingW) >= (int)width ||
            (cIdx - (int)paddingW) < 0) {
          *data_col = 0;
        } else {
          rIdx = rIdx + channel_in * height - paddingH;
          cIdx = cIdx - paddingW;
          *data_col = data_im[rIdx * width + cIdx];
        }
        data_col += height_col * width_col;
      }
    }
...
@@ -82,60 +87,73 @@ public:
    int outputWidth = colShape[4];
    int numKernels = inputChannels * outputHeight * outputWidth;
    int blocks = (numKernels + 1024 - 1) / 1024;
    int blockX = 512;
    int blockY = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(blockX, blockY);
    im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
        strideHeight, strideWidth, paddingHeight, paddingWidth,
        outputHeight, outputWidth, colData);
    CHECK_SYNC("Im2ColFunctor GPU failed");
  }
};

template <class T>
__global__ void col2im(size_t n, const T* data_col, size_t height, size_t width,
                       size_t channels, size_t blockH, size_t blockW,
                       size_t strideH, size_t strideW, size_t paddingH,
                       size_t paddingW, size_t height_col, size_t width_col,
                       T* data_im) {
  size_t index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  if (index < n) {
    T val = 0;
    int w = int(index % width);
    int h = int((index / width) % height);
    int c = int(index / (width * height));
    if ((w - (int)paddingW) >= 0 &&
        (w - (int)paddingW) < (width - 2 * paddingW) &&
        (h - (int)paddingH) >= 0 &&
        (h - paddingH) < (height - 2 * paddingH)) {
      // compute the start and end of the output
      int w_col_start =
          (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
      int h_col_start =
          (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
      int h_col_end = min(int(h / strideH + 1), int(height_col));
      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          // the col location: [c * width * height + h_out, w_out]
          int c_col = int(c * blockH * blockW) +
                      (h - h_col * (int)strideH) * (int)blockW +
                      (w - w_col * (int)strideW);
          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
        }
      }
      h -= paddingH;
      w -= paddingW;
      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
              h * (width - 2 * paddingW) + w] += val;
    }
  }
}
...
@@ -164,32 +182,32 @@ public:
    int outputHeight = colShape[3];
    int outputWidth = colShape[4];
    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
                        (inputWidth + 2 * paddingWidth);
    size_t blocks = (numKernels + 1024 - 1) / 1024;
    size_t blockX = 512;
    size_t blockY = (blocks + 512 - 1) / 512;
    dim3 threads(1024, 1);
    dim3 grid(blockX, blockY);
    // To avoid involving atomic operations, we will launch one kernel per
    // bottom dimension, and then in the kernel add up the top dimensions.
    col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        numKernels, colData, inputHeight + 2 * paddingHeight,
        inputWidth + 2 * paddingWidth, inputChannels, filterHeight, filterWidth,
        strideHeight, strideWidth, paddingHeight, paddingWidth,
        outputHeight, outputWidth, imData);
    CHECK_SYNC("Col2ImFunctor GPU failed");
  }
};
...
@@ -199,31 +217,35 @@ template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;

template <class T>
__global__ void im2colOCF(const T* imData, T* colData, int inputChannels,
                          int inputHeight, int inputWidth, int filterHeight,
                          int filterWidth, int strideHeight, int strideWidth,
                          int paddingHeight, int paddingWidth,
                          int outputHeight, int outputWidth) {
  int swId = blockIdx.x;
  int shId = blockIdx.y;
  for (int channelId = threadIdx.z; channelId < inputChannels;
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
        int widthOffset = idx + swId * strideWidth - paddingWidth;
        int heightOffset = idy + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;
        int colOffset = idx + idy * filterWidth +
                        channelId * filterHeight * filterWidth +
                        (shId * outputWidth + swId) *
                            (inputChannels * filterHeight * filterWidth);
        if (heightOffset >= inputHeight || heightOffset < 0 ||
            widthOffset >= inputWidth || widthOffset < 0) {
...
@@ -279,39 +301,52 @@ public:
    int blockDimZ = 1024 / blockDimX / blockDimY;
    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
    dim3 grid(outputWidth, outputHeight);
    im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        imData, colData, inputChannels, inputHeight, inputWidth,
        filterHeight, filterWidth, strideHeight, strideWidth,
        paddingHeight, paddingWidth, outputHeight, outputWidth);
    CHECK_SYNC("Im2ColFunctor GPU failed");
  }
};

template <class T>
__global__ void col2imOCF(T* imData, const T* colData, int inputChannels,
                          int inputHeight, int inputWidth, int filterHeight,
                          int filterWidth, int strideHeight, int strideWidth,
                          int paddingHeight, int paddingWidth,
                          int outputHeight, int outputWidth) {
  int swId = blockIdx.x;
  int shId = blockIdx.y;
  for (int channelId = threadIdx.z; channelId < inputChannels;
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
        int widthOffset = idx + swId * strideWidth - paddingWidth;
        int heightOffset = idy + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;
        int colOffset = idx + idy * filterWidth +
                        channelId * filterHeight * filterWidth +
                        (shId * outputWidth + swId) *
                            (inputChannels * filterHeight * filterWidth);
        if (heightOffset >= 0 && heightOffset < inputHeight &&
            widthOffset >= 0 && widthOffset < inputWidth) {
...
@@ -365,10 +400,19 @@ public:
    int blockDimZ = 1024 / blockDimX / blockDimY;
    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
    dim3 grid(outputWidth, outputHeight);
    col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
        imData, colData, inputChannels, inputHeight, inputWidth,
        filterHeight, filterWidth, strideHeight, strideWidth,
        paddingHeight, paddingWidth, outputHeight, outputWidth);
    CHECK_SYNC("Col2ImFunctor GPU failed");
  }
};
...
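For readers following the index arithmetic above: im2col lays each filter-sized patch of the (padded) input out as one column so that the convolution becomes a matrix multiplication, and col2im scatters the columns back, summing overlapping contributions. A rough CPU reference of the idea, simplified to a single channel with stride 1 and no padding (written here purely for illustration, not taken from the patch):

// Minimal single-channel im2col sketch, stride 1, no padding.
// col is (filterH * filterW) rows by (outH * outW) columns.
void im2colRef(const float* im, int height, int width,
               int filterH, int filterW, float* col) {
  int outH = height - filterH + 1;
  int outW = width - filterW + 1;
  for (int kh = 0; kh < filterH; ++kh)
    for (int kw = 0; kw < filterW; ++kw)
      for (int h = 0; h < outH; ++h)
        for (int w = 0; w < outW; ++w)
          // row index picks the filter element, column index the output pixel
          col[((kh * filterW + kw) * outH + h) * outW + w] =
              im[(h + kh) * width + (w + kw)];
}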
paddle/function/MulOpGpu.cu  View file @ 59a8ebc6
...
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "MulOp.h"
+#include "hl_base.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
...
paddle/function/PadOpGpu.cu  View file @ 59a8ebc6
...
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "PadOp.h"
+#include "hl_base.h"

 namespace paddle {

__global__ void KePad(real* outputs, const real* inputs, int inC, int inH,
                      int inW, int padc, int padh, int padw, int outC,
                      int outH, int outW, int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
...
@@ -50,16 +58,33 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      outputs, inputs, inC, inH, inW, cstart, hstart, wstart,
      outC, outH, outW, nth);
  CHECK_SYNC("Pad");
}

__global__ void KePadDiff(real* inGrad, const real* outGrad, int inC, int inH,
                          int inW, int padc, int padh, int padw, int outC,
                          int outH, int outW, int nthreads) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < nthreads) {
    const int w = idx % inW;
...
@@ -89,9 +114,18 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
  int outC = inC + cstart + cend;
  int outH = inH + hstart + hend;
  int outW = inW + wstart + wend;
  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
      inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart,
      outC, outH, outW, nth);
  CHECK_SYNC("PadGrad");
}
...
paddle/function/RowConvOpGpu.cu  View file @ 59a8ebc6
...
@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "hl_base.h"
 #include "RowConvOp.h"
+#include "hl_base.h"

 namespace paddle {

template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConv(real* y, const real* x, const real* w,
                          const int* starts, const int height, const int width,
                          const int numSeq, const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -30,7 +34,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w,
  __shared__ real sw[BLOCK_H][BLOCK_W];

  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
  }

  __syncthreads();
...
@@ -56,9 +60,14 @@ __global__ void KeRowConv(real* y, const real* x, const real* w,
  }
}

__global__ void KeRowConv2(real* y, const real* x, const real* w,
                           const int* starts, const int height,
                           const int width, const int numSeq,
                           const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -84,8 +93,6 @@ __global__ void KeRowConv2(real* y, const real* x, const real* w,
  }
}

template <>
void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
                              const GpuMatrix& in,
...
@@ -105,21 +112,24 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);

  if (contextLength <= 32) {
    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        y, x, w, starts, height, width, numSeq, contextLength);
  } else {
    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        y, x, w, starts, height, width, numSeq, contextLength);
  }
  CHECK_SYNC("RowConv");
}

template <int BLOCK_H, int BLOCK_W, int CONTEXT>
__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
                                  const int* starts, const int height,
                                  const int width, const int numSeq,
                                  const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -138,21 +148,21 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
    const int start = starts[i];
    const int end = starts[i + 1];
    const int steps = end - start;
    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
    for (int j = tidy; j < size; j += BLOCK_H) {
      int xoff = gidx + tidx;
      int yoff = start + j;
      // transpose
      sh_x[tidx][tidy] =
          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
      sh_dy[tidx][tidy + context - 1] =
          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
      __syncthreads();
      if (tidy < (context - 1)) {
        yoff = yoff - context + 1;
        sh_dy[tidx][tidy] =
            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
      }
      __syncthreads();
...
@@ -179,11 +189,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
  }
}

template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
                                   const int* starts, const int height,
                                   const int width, const int numSeq,
                                   const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int gidx = blockIdx.x * blockDim.x;
...
@@ -196,19 +210,21 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
    const int end = starts[i + 1];
    const int steps = end - start;
    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
    for (int j = tidy; j < size; j += BLOCK_H) {
      int xoff = gidx + tidx;
      int yoff = start + j;
      // transpose
      sh_x[tidx][tidy] =
          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
      __syncthreads();

      for (int t = 0; t < context; t++) {
        sh_dy[tidx][tidy] =
            (xoff < width && (yoff - t) >= start && yoff - t < end)
                ? dy[(yoff - t) * width + xoff] : 0.0;
        __syncthreads();

        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
...
@@ -222,18 +238,22 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
        __syncthreads();

        if (tidx == 0 && (gidx + tidy) < width) {
          dw[t * width + gidx + tidy] += val;
        }
      }
    }
  }
}

template <int BLOCK_H, int BLOCK_W>
__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
                                const int* starts, const int height,
                                const int width, const int numSeq,
                                const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -242,7 +262,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
  __shared__ real sw[BLOCK_H][BLOCK_W];

  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
  }

  __syncthreads();
...
@@ -266,10 +286,14 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
  }
}

__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
                                 const int* starts, const int height,
                                 const int width, const int numSeq,
                                 const int context) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int blky = blockDim.y;
...
@@ -295,14 +319,13 @@ __global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
  }
}

template <>
void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
                                  const GpuMatrix& in,
                                  const GpuMatrix& filter,
                                  GpuMatrix& inG,
                                  GpuMatrix& filterG,
                                  const GpuIVector& seq) {
  const size_t numSeq = seq.getSize() - 1;
  const size_t contextLength = filter.getHeight();
  const size_t height = in.getHeight();
...
@@ -318,13 +341,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
  real* dw = filterG.getData();
  if (contextLength <= 32) {
    KeRowConvBwWeight<32, 32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        dw, x, dy, starts, height, width, numSeq, contextLength);
  } else {
    KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
        dw, x, dy, starts, height, width, numSeq, contextLength);
  }
}
...
@@ -333,13 +354,11 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
  dim3 dimBlock2(32, 32);
  dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
  if (contextLength <= 64) {
    KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
        dx, w, dy, starts, height, width, numSeq, contextLength);
  } else {
    KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
        dx, w, dy, starts, height, width, numSeq, contextLength);
  }
}
...
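The kernels above implement row (lookahead) convolution: within each sequence, output row i is a column-wise weighted sum of the current row and the following context - 1 rows, with the sequence boundaries given by starts[]. A plain CPU reference of that forward definition, given here only for orientation (the GPU kernels tile the same computation through shared memory):

// Reference row convolution: y[i][c] = sum over t of w[t][c] * x[i + t][c],
// applied independently inside each sequence delimited by starts[].
void rowConvRef(float* y, const float* x, const float* w, const int* starts,
                int numSeq, int width, int context) {
  for (int s = 0; s < numSeq; ++s) {
    for (int i = starts[s]; i < starts[s + 1]; ++i) {
      for (int c = 0; c < width; ++c) {
        float sum = 0;
        for (int t = 0; t < context && i + t < starts[s + 1]; ++t) {
          sum += w[t * width + c] * x[(i + t) * width + c];
        }
        y[i * width + c] = sum;
      }
    }
  }
}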
paddle/function/TensorShapeTest.cpp  View file @ 59a8ebc6
...
@@ -19,35 +19,35 @@ namespace paddle {
 TEST(TensorShape, Constructor) {
   TensorShape t1;
-  EXPECT_EQ(t1.ndims(), 0);
-  EXPECT_EQ(t1.getElements(), 0);
+  EXPECT_EQ(t1.ndims(), 0U);
+  EXPECT_EQ(t1.getElements(), 0U);

   TensorShape t2(3);
-  EXPECT_EQ(t2.ndims(), 3);
-  EXPECT_EQ(t2.getElements(), 1);
+  EXPECT_EQ(t2.ndims(), 3U);
+  EXPECT_EQ(t2.getElements(), 1U);

   TensorShape t3({8, 10});
-  EXPECT_EQ(t3.ndims(), 2);
-  EXPECT_EQ(t3.getElements(), 80);
+  EXPECT_EQ(t3.ndims(), 2U);
+  EXPECT_EQ(t3.getElements(), 80U);

   TensorShape t4(t3);
   EXPECT_EQ(t4.ndims(), t3.ndims());
   EXPECT_EQ(t4.getElements(), t3.getElements());

   TensorShape t5({1, 2, 3, 4, 5});
-  EXPECT_EQ(t5.ndims(), 5);
-  EXPECT_EQ(t5.getElements(), 120);
+  EXPECT_EQ(t5.ndims(), 5U);
+  EXPECT_EQ(t5.getElements(), 120U);
 }

 TEST(TensorShape, GetAndSet) {
   TensorShape t({1, 2, 3});
-  EXPECT_EQ(t.ndims(), 3);
-  EXPECT_EQ(t.getElements(), 6);
+  EXPECT_EQ(t.ndims(), 3U);
+  EXPECT_EQ(t.getElements(), 6U);
   EXPECT_EQ(t[1], 2);

   t.setDim(1, 100);
-  EXPECT_EQ(t.getElements(), 300);
-  EXPECT_EQ(t[1], 100);
+  EXPECT_EQ(t.getElements(), 300U);
+  EXPECT_EQ(t[1], 100U);
 }

 }  // namespace paddle
paddle/function/TensorTypeTest.cpp  View file @ 59a8ebc6
...
@@ -19,9 +19,9 @@ namespace paddle {
 TEST(TensorType, Matrix) {
   Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);
-  EXPECT_EQ(matrix.getHeight(), 100);
-  EXPECT_EQ(matrix.getWidth(), 200);
-  EXPECT_EQ(matrix.getElementCnt(), 100 * 200);
+  EXPECT_EQ(matrix.getHeight(), 100U);
+  EXPECT_EQ(matrix.getWidth(), 200U);
+  EXPECT_EQ(matrix.getElementCnt(), 100U * 200U);
   EXPECT_EQ(matrix.useGpu(), false);

   Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);
...
@@ -33,15 +33,15 @@ TEST(TensorType, Vector) {
   Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);
   EXPECT_EQ(cpuVector.useGpu(), false);
   EXPECT_EQ(gpuVector.useGpu(), true);
-  EXPECT_EQ(cpuVector.getSize(), 100);
-  EXPECT_EQ(gpuVector.getSize(), 100);
+  EXPECT_EQ(cpuVector.getSize(), 100U);
+  EXPECT_EQ(gpuVector.getSize(), 100U);

   Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);
   Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);
   EXPECT_EQ(cpuIVector.useGpu(), false);
   EXPECT_EQ(gpuIVector.useGpu(), true);
-  EXPECT_EQ(cpuIVector.getSize(), 100);
-  EXPECT_EQ(gpuIVector.getSize(), 100);
+  EXPECT_EQ(cpuIVector.getSize(), 100U);
+  EXPECT_EQ(gpuIVector.getSize(), 100U);
 }

 TEST(TensorType, EmptyMatrix) {
...
paddle/function/nnpack/NNPACKConvOp.cpp

@@ -49,9 +49,7 @@ class NNPACKConvFunction : public ConvFunctionBase {
 public:
   void init(const FuncConfig& config) override {
     ConvFunctionBase::init(config);
-    CHECK_EQ(groups_, (size_t)1);
     algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
-    // algorithm_ = nnp_convolution_algorithm_auto;
     transform_strategy_ = nnp_convolution_transform_strategy_compute;
     nnp_status status = nnp_initialize();
     CHECK_EQ(status, nnp_status_success);

@@ -67,8 +65,7 @@ public:
   }
 
-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs,
+             const BufferArgs& outputs) override {
     const TensorShape& input = inputs[0].shape();
     const TensorShape& filter = inputs[1].shape();
     const TensorShape& output = outputs[0].shape();

@@ -91,8 +88,8 @@ public:
     size_t filterHeight = getFilterHeight(filter);
     size_t filterWidth = getFilterWidth(filter);
     size_t outputChannels = output[1];
-    // size_t outputHeight = output[2];
-    // size_t outputWidth = output[3];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
     nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
     nnp_padding padding = {.top = (size_t)paddingH(),

@@ -171,49 +168,58 @@ public:
     }
 
+    size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth;
+    size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
     if (batchSize == 1) {
-      nnp_status status =
-          nnp_convolution_inference(algorithm_,
-                                    transform_strategy_,
-                                    inputChannels,
-                                    outputChannels,
-                                    inputSize,
-                                    padding,
-                                    kernelSize,
-                                    outputSubsampling,
-                                    inputData,
-                                    filterData,
-                                    nullptr, /* bias */
-                                    outputData,
-                                    bufferPtr,
-                                    sizePtr,
-                                    nnp_activation_identity,
-                                    nullptr,
-                                    threadpool_, /* threadpool */
-                                    nullptr);
-      CHECK_EQ(status, nnp_status_success);
+      for (size_t g = 0; g < groups_; g++) {
+        nnp_status status =
+            nnp_convolution_inference(algorithm_,
+                                      transform_strategy_,
+                                      inputChannels / groups_,
+                                      outputChannels / groups_,
+                                      inputSize,
+                                      padding,
+                                      kernelSize,
+                                      outputSubsampling,
+                                      inputData + inputOffset * g,
+                                      filterData + filterOffset * g,
+                                      nullptr, /* bias */
+                                      outputData + outputOffset * g,
+                                      bufferPtr,
+                                      sizePtr,
+                                      nnp_activation_identity,
+                                      nullptr,
+                                      threadpool_, /* threadpool */
+                                      nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      }
     } else {
-      // only supports stride = 1
-      CHECK_EQ(strideH(), 1);
-      CHECK_EQ(strideW(), 1);
-      nnp_status status = nnp_convolution_output(algorithm_,
-                                                 batchSize,
-                                                 inputChannels,
-                                                 outputChannels,
-                                                 inputSize,
-                                                 padding,
-                                                 kernelSize,
-                                                 inputData,
-                                                 filterData,
-                                                 nullptr, /* bias */
-                                                 outputData,
-                                                 bufferPtr,
-                                                 sizePtr,
-                                                 nnp_activation_identity,
-                                                 nullptr,
-                                                 threadpool_, /* threadpool */
-                                                 nullptr);
-      CHECK_EQ(status, nnp_status_success);
+      for (size_t g = 0; g < groups_; g++) {
+        // only supports stride = 1
+        CHECK_EQ(strideH(), 1);
+        CHECK_EQ(strideW(), 1);
+        nnp_status status =
+            nnp_convolution_output(algorithm_,
+                                   batchSize,
+                                   inputChannels / groups_,
+                                   outputChannels / groups_,
+                                   inputSize,
+                                   padding,
+                                   kernelSize,
+                                   inputData + inputOffset * g,
+                                   filterData + filterOffset * g,
+                                   nullptr, /* bias */
+                                   outputData + outputOffset * g,
+                                   bufferPtr,
+                                   sizePtr,
+                                   nnp_activation_identity,
+                                   nullptr,
+                                   threadpool_, /* threadpool */
+                                   nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      }
     }
   }
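The per-group offsets introduced above simply slice the input, filter, and output buffers along the channel dimension before each NNPACK call. A minimal standalone sketch of that pointer arithmetic (plain C++ with made-up sizes, not the NNPACK API itself):

```cpp
#include <cstddef>
#include <vector>

// Sketch: split channels into `groups` slices and locate slice g, mirroring
// inputOffset / filterOffset above. All sizes here are illustrative assumptions.
int main() {
  const size_t groups = 2, inputChannels = 4, outputChannels = 6;
  const size_t inputH = 8, inputW = 8, filterH = 3, filterW = 3;

  std::vector<float> input(inputChannels * inputH * inputW);
  std::vector<float> filter(outputChannels * (inputChannels / groups) * filterH * filterW);

  const size_t inputOffset = inputChannels / groups * inputH * inputW;
  const size_t filterOffset = filter.size() / groups;

  for (size_t g = 0; g < groups; ++g) {
    // channels [g * C/G, (g+1) * C/G) of the input, and the filters of group g
    const float* inputSlice = input.data() + inputOffset * g;
    const float* filterSlice = filter.data() + filterOffset * g;
    (void)inputSlice;
    (void)filterSlice;
    // each group would be convolved independently here
  }
  return 0;
}
```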
paddle/gserver/activations/ActivationFunction.cpp

@@ -186,7 +186,10 @@ Error __must_check forward(Argument& act) {
                          useGpu(act.deviceId));
   }
 
-  auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
+  auto starts =
+      act.hasSubseq()
+          ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId))
+          : act.sequenceStartPositions->getVector(useGpu(act.deviceId));
   act.value->sequenceSoftmax(*act.value, *starts);
   return Error();
 }

@@ -197,8 +200,9 @@ Error __must_check backward(Argument& act) {
         "Input width for each timestep of sequence softmax should be 1");
   }
 
-  size_t numSequences = act.getNumSequences();
-  const int* starts = act.sequenceStartPositions->getData(false);
+  size_t numSequences =
+      act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences();
+  const int* starts = act.getCpuStartPositions();
 
   for (size_t i = 0; i < numSequences; ++i) {
     // TODO(Dangqingqing) optimization for GPU
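For reference, `starts` partitions the activation rows into (sub)sequences, and the softmax is computed over each range independently. A minimal CPU sketch of that idea (hypothetical standalone code, not the Paddle kernel):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Softmax over each range [starts[i], starts[i+1]) of a flat value array,
// which is what sequenceSoftmax does per (sub)sequence.
void sequenceSoftmax(std::vector<float>& value, const std::vector<int>& starts) {
  for (size_t i = 0; i + 1 < starts.size(); ++i) {
    const int begin = starts[i], end = starts[i + 1];
    const float maxV = *std::max_element(value.begin() + begin, value.begin() + end);
    float sum = 0.f;
    for (int j = begin; j < end; ++j) {
      value[j] = std::exp(value[j] - maxV);  // subtract max for stability
      sum += value[j];
    }
    for (int j = begin; j < end; ++j) value[j] /= sum;
  }
}
```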
paddle/gserver/layers/ExpandConvLayer.cpp

@@ -57,8 +57,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
     convGradFilterType = "GemmConvGradFilter";
   }
 
-  if (FLAGS_use_nnpack) {
-    CHECK_EQ(isDeconv_, false);
+  if (FLAGS_use_nnpack && !isDeconv_) {
     createFunction(forward_,
                    "NNPACKConv",
                    FuncConfig()
paddle/gserver/layers/GruCompute.cu

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "GruCompute.h"
 #include "hl_recurrent_apply.cuh"

@@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) {
 }
 
 template <>
-void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad,
-                             int frameSize, int batchSize) {
+void GruCompute::backward<1>(hl_gru_value value,
+                             hl_gru_grad grad,
+                             int frameSize,
+                             int batchSize) {
   hl_gpu_gru_backward(hppl::backward::gru_stateGrad(),
                       hppl::backward::gru_resetGrad(),
                       value,
paddle/gserver/layers/KmaxSeqScoreLayer.cpp

@@ -97,13 +97,19 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
     scores_ = inputScore;
   }
 
-  int seqNum =
-      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences();
-  Matrix::resizeOrCreate(output_.value, seqNum, beamSize_, false, false);
+  Matrix::resizeOrCreate(
+      output_.value,
+      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
+      beamSize_,
+      false,
+      false);
   output_.value->one();
   output_.value->mulScalar(-1.);
 
-  kmaxScorePerSeq(scores_->getData(), output_.value->getData(), seqNum);
+  kmaxScorePerSeq(scores_->getData(),
+                  output_.value->getData(),
+                  input.hasSubseq() ? input.subSequenceStartPositions
+                                    : input.sequenceStartPositions);
 }
 
 void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
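The forward pass keeps the `beamSize_` largest scores of every (sub)sequence, with the output pre-filled to -1 (the `one()` / `mulScalar(-1.)` calls above) as padding for short sequences. A rough standalone sketch of that selection (a hypothetical helper, not the layer's code):

```cpp
#include <algorithm>
#include <vector>

// For each range [starts[i], starts[i+1]) pick the indices of the k largest
// scores, padding with -1 when a sequence is shorter than k.
std::vector<int> kmaxPerSeq(const std::vector<float>& scores,
                            const std::vector<int>& starts, int k) {
  std::vector<int> result;
  for (size_t i = 0; i + 1 < starts.size(); ++i) {
    std::vector<int> idx;
    for (int j = starts[i]; j < starts[i + 1]; ++j) idx.push_back(j);
    std::sort(idx.begin(), idx.end(),
              [&](int a, int b) { return scores[a] > scores[b]; });
    for (int j = 0; j < k; ++j)
      result.push_back(j < static_cast<int>(idx.size()) ? idx[j] : -1);
  }
  return result;
}
```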
paddle/gserver/layers/LstmCompute.cu

@@ -12,41 +12,62 @@ — the change here is a formatting-only re-wrap (one argument per line); the resulting code:

See the License for the specific language governing permissions and
limitations under the License. */

#include "LstmCompute.h"
#include "hl_recurrent_apply.cuh"

namespace paddle {

template <>
void LstmCompute::forwardBatch<1>(hl_lstm_value value,
                                  int frameSize,
                                  int batchSize) {
  hl_gpu_lstm_forward(hppl::forward::lstm(),
                      value,
                      frameSize,
                      batchSize,
                      activeNode_,
                      activeGate_,
                      activeState_);
}

template <>
void LstmCompute::backwardBatch<1>(hl_lstm_value value,
                                   hl_lstm_grad grad,
                                   int frameSize,
                                   int batchSize) {
  hl_gpu_lstm_backward(hppl::backward::lstm(),
                       value,
                       grad,
                       frameSize,
                       batchSize,
                       activeNode_,
                       activeGate_,
                       activeState_);
}

template <>
void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) {
  hl_gpu_lstm_forward(hppl::forward::lstm(),
                      value,
                      frameSize,
                      /* batchSize */ 1,
                      activeNode_,
                      activeGate_,
                      activeState_);
}

template <>
void LstmCompute::backwardOneSequence<1>(hl_lstm_value value,
                                         hl_lstm_grad grad,
                                         int frameSize) {
  hl_gpu_lstm_backward(hppl::backward::lstm(),
                       value,
                       grad,
                       frameSize,
                       /* batchSize */ 1,
                       activeNode_,
                       activeGate_,
                       activeState_);
}

}  // namespace paddle
paddle/gserver/layers/PrintLayer.cpp

@@ -29,7 +29,7 @@ public:
     vals.push_back(s.str());
   }
   size_t pos = 0;
-  int i = 0;
+  size_t i = 0;
   std::ostringstream s;
   const std::string& format = config_.user_arg();
   while (true) {
paddle/gserver/tests/CMakeLists.txt

 # gserver pacakge unittests
+file(GLOB_RECURSE GSERVER_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
+file(GLOB_RECURSE GSERVER_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cpp")
+add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
+add_style_check_target(paddle_gserver ${GSERVER_HEADER})
 
 ################### test_ProtoDataProvider ############
 add_unittest_without_exec(test_ProtoDataProvider
                           test_ProtoDataProvider.cpp)
paddle/gserver/tests/test_ActivationGrad.cpp

@@ -57,6 +57,39 @@ TEST(Activation, activation) {
   }
 }
 
+void testSequenceSoftmaxAct(bool hasSubseq) {
+  LOG(INFO) << "test activation: sequence softmax";
+
+  const size_t size = 1;
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("addto");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sequence_softmax");
+  config.inputDefs.push_back(
+      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+       "layer_0",
+       1,
+       0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sequence_softmax",
+                  100,
+                  /* trans= */ false,
+                  useGpu,
+                  /* useWeight */ true);
+  }
+}
+
+TEST(SequenceSoftmaxActivation, activation) {
+  for (auto hasSubseq : {false, true}) {
+    LOG(INFO) << "hasSubseq = " << hasSubseq;
+    testSequenceSoftmaxAct(hasSubseq);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
paddle/math/BaseMatrix.cu

The changes in this file appear to be formatting only (the includes are reordered and long argument lists are re-wrapped); the affected code reads:

@@ -12,21 +12,21 @@
See the License for the specific language governing permissions and
limitations under the License. */

#include <paddle/utils/Logging.h>
#include <string.h>
#include <cmath>
#include "BaseMatrix.h"
#include "MathFunctions.h"
#include "SIMDFunctions.h"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_base.cuh"
#include "hl_matrix_ops.cuh"

namespace paddle {

const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported.";

template <class T>
template <class Op>
int BaseMatrixT<T>::applyUnary(Op op) {
  MatrixOffset offset(0, 0);

@@ -34,9 +34,11 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyUnary(Op op,
                               int numRows,
                               int numCols,
                               MatrixOffset& offset) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  int dimM = numRows;

@@ -56,7 +58,7 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
  CHECK(height_ == b.height_ && width_ == b.width_)

@@ -67,18 +69,23 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyBinary(
    Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) {
  applyBinary(op, b, numRows, numCols, offset, false_type(), false_type());
  return 0;
}

template <class T>
template <class Op, class bAsRowVector, class bAsColVector>
int BaseMatrixT<T>::applyBinary(Op op,
                                BaseMatrixT& b,
                                int numRows,
                                int numCols,
                                MatrixOffset& offset,
                                bAsRowVector,
                                bAsColVector) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch";

@@ -91,8 +98,8 @@
  T* A = data_;
  T* B = b.data_;
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);
  if (!bAsRowVector::value && !bAsColVector::value) {

@@ -115,7 +122,7 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
  CHECK_EQ(height_, b.height_);

@@ -129,21 +136,29 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyTernary(Op op,
                                 BaseMatrixT& b,
                                 BaseMatrixT& c,
                                 int numRows,
                                 int numCols,
                                 MatrixOffset& offset) {
  applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type());
  return 0;
}
template <class T>
template <class Op, class cAsRowVector, class cAsColVector>
int BaseMatrixT<T>::applyTernary(Op op,
                                 BaseMatrixT& b,
                                 BaseMatrixT& c,
                                 int numRows,
                                 int numCols,
                                 MatrixOffset& offset,
                                 cAsRowVector,
                                 cAsColVector) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;

@@ -160,10 +175,10 @@
  T* B = b.data_;
  T* C = c.data_;
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
  CAL_MATRIX_START_ADDRESS(
      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);

@@ -180,21 +195,21 @@
  }

  if (true == useGpu_) {
    hl_gpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
        op, A, B, C, dimM, dimN, lda, ldb, ldc);
  } else {
    hl_cpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
        op, A, B, C, dimM, dimN, lda, ldb, ldc);
  }

  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
                                    BaseMatrixT& d) {
  CHECK_EQ(height_, b.height_);
  CHECK_EQ(width_, b.width_);

@@ -209,10 +224,14 @@
  return 0;
}

template <class T>
template <class Op>
int BaseMatrixT<T>::applyQuaternary(Op op,
                                    BaseMatrixT& b,
                                    BaseMatrixT& c,
                                    BaseMatrixT& d,
                                    int numRows,
                                    int numCols,
                                    MatrixOffset& offset) {
  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;

@@ -234,12 +253,12 @@
  T* C = c.data_;
  T* D = d.data_;
  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
  CAL_MATRIX_START_ADDRESS(
      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
  CAL_MATRIX_START_ADDRESS(
      D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_);
  CHECK_LE(dimM + offset.aRow_, this->height_);
  CHECK_LE(dimN + offset.aCol_, this->width_);

@@ -250,22 +269,29 @@
  CHECK_LE(dimM + offset.dRow_, d.height_);
  CHECK_LE(dimN + offset.dCol_, d.width_);
  if (true == useGpu_) {
    hl_gpu_apply_quaternary_op(
        op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
  } else {
    hl_cpu_apply_quaternary_op(
        op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
  }

  return 0;
}
template <class T>
template <class Agg, class Op, class Saver, class aAsRowVector,
          class aAsColVector>
int BaseMatrixT<T>::aggregate(Agg agg,
                              Op op,
                              Saver sv,
                              BaseMatrixT& b,
                              int numRows,
                              int numCols,
                              MatrixOffset& offset,
                              aAsRowVector,
                              aAsColVector) {
  CHECK_EQ(useGpu_, b.useGpu_);

  int ld = stride_;

@@ -273,10 +299,10 @@
  T* dst = data_;
  T* B = b.data_;
  CAL_MATRIX_START_ADDRESS(
      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);

  if (aAsRowVector::value && !aAsColVector::value) {
    if (useGpu_) {

@@ -297,12 +323,21 @@
  return 0;
}

template <class T>
template <class Agg, class Op, class Saver, class aAsRowVector,
          class aAsColVector>
int BaseMatrixT<T>::aggregate(Agg agg,
                              Op op,
                              Saver sv,
                              BaseMatrixT& b,
                              BaseMatrixT& c,
                              int numRows,
                              int numCols,
                              MatrixOffset& offset,
                              aAsRowVector,
                              aAsColVector) {
  CHECK_EQ(useGpu_, b.useGpu_);
  CHECK_EQ(useGpu_, c.useGpu_);

@@ -314,28 +349,28 @@
  T* dst = data_;
  T* B = b.data_;
  T* C = c.data_;
  CAL_MATRIX_START_ADDRESS(
      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
  CAL_MATRIX_START_ADDRESS(
      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
  CAL_MATRIX_START_ADDRESS(
      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);

  if (aAsRowVector::value && !aAsColVector::value) {
    if (useGpu_) {
      hl_gpu_matrix_column_op(
          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
    } else {
      hl_cpu_matrix_column_op(
          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
    }
  } else if (!aAsRowVector::value && aAsColVector::value) {
    if (useGpu_) {
      hl_gpu_matrix_row_op(
          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
    } else {
      hl_cpu_matrix_row_op(
          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
    }
  } else {
    LOG(FATAL) << "not supported";
@@ -350,15 +385,19 @@
 */
DEFINE_MATRIX_UNARY_OP(Neg, a = -a);
template <class T>
void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); }

DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
template <>
void BaseMatrixT<real>::exp2() { applyUnary(unary::Exp<real>()); }

DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
template <>
void BaseMatrixT<real>::log2() {
  if (useGpu_) {
    applyUnary(unary::Log<real>());

@@ -368,30 +407,42 @@
}

DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
template <>
void BaseMatrixT<real>::sqrt2() { applyUnary(unary::Sqrt<real>()); }

DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
template <class T>
void BaseMatrixT<T>::square2() { applyUnary(unary::Square<T>()); }

DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
template <class T>
void BaseMatrixT<T>::reciprocal2() { applyUnary(unary::Reciprocal<T>()); }

DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
template <class T>
void BaseMatrixT<T>::abs2() { applyUnary(unary::Abs<T>()); }

DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
template <class T>
void BaseMatrixT<T>::sign2() { applyUnary(unary::Sign<T>()); }

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
template <class T>
void BaseMatrixT<T>::zero() { applyUnary(unary::Zero<T>()); }

template <class T>
void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
  int numRows = height_;
  int numCols = numColumns;

@@ -400,11 +451,13 @@
}

DEFINE_MATRIX_UNARY_OP(One, a = 1);
template <class T>
void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
template <>
void BaseMatrixT<real>::pow2(real p) {
  if (useGpu_) {
    applyUnary(unary::Pow<real>(p));
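Each DEFINE_MATRIX_UNARY_OP above expands into a small functor whose operator() rewrites one element, and applyUnary sweeps it over the matrix (on CPU or GPU). A stripped-down CPU-only illustration of that pattern (an assumed simplification, not the actual Paddle macros):

```cpp
#include <cmath>
#include <cstddef>

// Functors in the spirit of unary::Neg / unary::Pow: one element, in place.
struct Neg {
  void operator()(float& a) const { a = -a; }
};
struct Pow {
  float p;
  explicit Pow(float p_) : p(p_) {}
  void operator()(float& a) const { a = std::pow(a, p); }
};

// CPU analogue of BaseMatrixT::applyUnary over a dense row-major block
// with leading dimension ld.
template <class Op>
void applyUnary(Op op, float* data, size_t rows, size_t cols, size_t ld) {
  for (size_t r = 0; r < rows; ++r)
    for (size_t c = 0; c < cols; ++c) op(data[r * ld + c]);
}
```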
@@ -414,51 +467,67 @@
}

DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p);
template <class T>
void BaseMatrixT<T>::subScalar(T p) { applyUnary(unary::SubScalar<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p);
template <class T>
void BaseMatrixT<T>::mulScalar(T p) { applyUnary(unary::MulScalar<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p);
template <class T>
void BaseMatrixT<T>::divScalar(T p) { applyUnary(unary::DivScalar<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p);
template <class T>
void BaseMatrixT<T>::assign(T p) { applyUnary(unary::Assign<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p);
template <class T>
void BaseMatrixT<T>::add(T p) { applyUnary(unary::Add<T>(p)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2);
template <class T>
void BaseMatrixT<T>::add(T p1, T p2) { applyUnary(unary::Add2<T>(p1, p2)); }

DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER,
                                 a = a < p1 ? p1 : (a > p2 ? p2 : a));
template <class T>
void BaseMatrixT<T>::clip(T p1, T p2) { applyUnary(unary::Clip<T>(p1, p2)); }

DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER,
                                  a = b < p1 ? 0 : (b > p2 ? 0 : 1));
template <class T>
void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::ClipDerivative<T>(p1, p2), b);
}

DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER,
                                 a = a > p ? 1.0f : 0.0f);
template <class T>
void BaseMatrixT<T>::biggerThanScalar(T p) {
  applyUnary(unary::BiggerThanScalar<T>(p));
}

DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p);
template <class T>
void BaseMatrixT<T>::downClip(T p) {
  applyUnary(unary::DownClip<T>(p));
}
@@ -469,12 +538,12 @@
 */
DEFINE_MATRIX_BINARY_OP(Add, a += b);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b) { applyBinary(binary::Add<T>(), b); }

template <>
void BaseMatrixT<real>::add(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Add<real>(), b);

@@ -485,7 +554,7 @@
  }
}

template <class T>
void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
  if (columnOffset + b.width_ <= width_) {
    int numRows = height_;

@@ -504,43 +573,53 @@
  }
}

template <class T>
void BaseMatrixT<T>::addP2P(BaseMatrixT& b) {
  T* A = data_;
  T* B = b.data_;
  int dimM = height_;
  int dimN = width_;

  hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0>(
      binary::Add<T>(), A, B, dimM, dimN, dimN, dimN);
}

template <class T>
void BaseMatrixT<T>::addColVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::Add<T>(),
              b,
              numRows,
              numCols,
              offset,
              false_type(),
              true_type() /* bAsColVector */);
}

template <class T>
void BaseMatrixT<T>::addRowVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::Add<T>(),
              b,
              numRows,
              numCols,
              offset,
              true_type() /* bAsRowVector */,
              false_type());
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
  applyBinary(binary::Add1<T>(p), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
template <>
void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
  if (useGpu_) {
    applyBinary(binary::Pow<real>(p), b);

@@ -550,36 +629,45 @@
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::Add2<T>(p1, p2), b);
}

template <class T>
void BaseMatrixT<T>::addBias(BaseMatrixT& b, T scale) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::Add1<T>(scale),
              b,
              numRows,
              numCols,
              offset,
              true_type() /* bAsRowVector */,
              false_type());
}

DEFINE_MATRIX_BINARY_OP(Sub, a -= b);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b) { applyBinary(binary::Sub<T>(), b); }

DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, T p) {
  applyBinary(binary::Sub1<T>(p), b);
}

DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f);
template <class T>
void BaseMatrixT<T>::relu(BaseMatrixT& b) { applyBinary(binary::Relu<T>(), b); }

DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
template <class T>
void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
  applyBinary(binary::ReluDerivative<T>(), b);
}
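The bAsRowVector / bAsColVector tags used by addRowVector, addColVector, and addBias above select a broadcast pattern: b is read either as a single row reused for every row of a, or as a single column reused for every column. A compact CPU sketch of the two cases (illustrative only, not the Paddle kernels):

```cpp
#include <cstddef>

// Row broadcast: a(r, c) += b(0, c)   -- the addRowVector / addBias pattern.
void addRowVector(float* a, const float* b, size_t rows, size_t cols, size_t lda) {
  for (size_t r = 0; r < rows; ++r)
    for (size_t c = 0; c < cols; ++c) a[r * lda + c] += b[c];
}

// Column broadcast: a(r, c) += b(r, 0) -- the addColVector pattern.
void addColVector(float* a, const float* b, size_t rows, size_t cols, size_t lda) {
  for (size_t r = 0; r < rows; ++r)
    for (size_t c = 0; c < cols; ++c) a[r * lda + c] += b[r];
}
```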
@@ -589,7 +677,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
                            ? THRESHOLD
                            : ((a < -THRESHOLD) ? (-THRESHOLD) : a))));
template <>
void BaseMatrixT<real>::softrelu(BaseMatrixT& b) {
  applyBinary(binary::Softrelu<real>(), b);
}

@@ -599,97 +687,100 @@
DEFINE_MATRIX_BINARY_OP(SoftreluDerivative,
                        a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
                                                    ? THRESHOLD
                                                    : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
template <>
void BaseMatrixT<real>::softreluDerivative(BaseMatrixT& b) {
  applyBinary(binary::SoftreluDerivative<real>(), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1;
                                  b = b < p2 ? b : p2);
template <class T>
void BaseMatrixT<T>::brelu(BaseMatrixT& b) {
  int p1 = 0, p2 = 24;  //! TODO(yuyang18): Make p1,p2 configuable.
  applyBinary(binary::Brelu<T>(p1, p2), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER,
                                  a *= (b > p1 && b < p2) ? 1.0 : 0.0);
template <class T>
void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
  int p1 = 0, p2 = 24;
  applyBinary(binary::BreluDerivative<T>(p1, p2), b);
}

DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
template <class T>
void BaseMatrixT<T>::square2(BaseMatrixT& b) {
  applyBinary(binary::Square<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b);
template <class T>
void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
  applyBinary(binary::SquareDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a;
                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
template <>
void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
  applyBinary(binary::Tanh<real>(), b);
}

DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b);
template <class T>
void BaseMatrixT<T>::tanhDerivative(BaseMatrixT& b) {
  applyBinary(binary::TanhDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(
    ScaledTanh, TWO_PARAMETER,
    b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0));
template <>
void BaseMatrixT<real>::scaledTanh(BaseMatrixT& b, real p1, real p2) {
  applyBinary(binary::ScaledTanh<real>(p1, p2), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER,
                                  a *= p2 * (p1 - b * b));
template <class T>
void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::ScaledTanhDerivative<T>(p1 * p1, p2 / p1), b);
}

DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
template <class T>
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
  applyBinary(binary::Reciprocal<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b);
template <class T>
void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
  applyBinary(binary::ReciprocalDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
template <class T>
void BaseMatrixT<T>::abs2(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }

DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
template <class T>
void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) {
  applyBinary(binary::AbsDerivative<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(
    Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0;
    T tmp = (a < THRESHOLD_MIN)
                ? THRESHOLD_MIN
                : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
    b = 1.0f / (1.0f + exp(-tmp)));
template <>
void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Sigmoid<real>(), b);
...
@@ -723,31 +814,31 @@ void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
...
@@ -723,31 +814,31 @@ void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
}
}
DEFINE_MATRIX_BINARY_OP
(
SigmoidDerivative
,
a
*=
b
*
(
1
-
b
));
DEFINE_MATRIX_BINARY_OP
(
SigmoidDerivative
,
a
*=
b
*
(
1
-
b
));
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
sigmoidDerivative
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
sigmoidDerivative
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
SigmoidDerivative
<
T
>
(),
b
);
applyBinary
(
binary
::
SigmoidDerivative
<
T
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
ExpDerivative
,
a
*=
b
);
DEFINE_MATRIX_BINARY_OP
(
ExpDerivative
,
a
*=
b
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
expDerivative
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
expDerivative
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
ExpDerivative
<
T
>
(),
b
);
applyBinary
(
binary
::
ExpDerivative
<
T
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
Sign
,
b
=
a
>
0.0
f
?
1.0
f
:
-
1.0
f
);
DEFINE_MATRIX_BINARY_OP
(
Sign
,
b
=
a
>
0.0
f
?
1.0
f
:
-
1.0
f
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
sign2
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
T
>::
sign2
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Sign
<
T
>
(),
b
);
applyBinary
(
binary
::
Sign
<
T
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
Exp
,
a
=
exp
(
b
));
DEFINE_MATRIX_BINARY_OP
(
Exp
,
a
=
exp
(
b
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
exp2
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
exp2
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Exp
<
real
>
(),
b
);
applyBinary
(
binary
::
Exp
<
real
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
Log
,
a
=
log
(
b
));
DEFINE_MATRIX_BINARY_OP
(
Log
,
a
=
log
(
b
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
log2
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
log2
(
BaseMatrixT
&
b
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
applyBinary
(
binary
::
Log
<
real
>
(),
b
);
applyBinary
(
binary
::
Log
<
real
>
(),
b
);
...
@@ -757,13 +848,13 @@ void BaseMatrixT<real>::log2(BaseMatrixT& b) {
...
@@ -757,13 +848,13 @@ void BaseMatrixT<real>::log2(BaseMatrixT& b) {
}
}
DEFINE_MATRIX_BINARY_OP
(
Sqrt
,
a
=
sqrt
(
b
));
DEFINE_MATRIX_BINARY_OP
(
Sqrt
,
a
=
sqrt
(
b
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
sqrt2
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
sqrt2
(
BaseMatrixT
&
b
)
{
applyBinary
(
binary
::
Sqrt
<
real
>
(),
b
);
applyBinary
(
binary
::
Sqrt
<
real
>
(),
b
);
}
}
DEFINE_MATRIX_BINARY_OP
(
InvSqrt
,
a
=
1.0
f
/
sqrt
(
b
));
DEFINE_MATRIX_BINARY_OP
(
InvSqrt
,
a
=
1.0
f
/
sqrt
(
b
));
template
<
>
template
<
>
void
BaseMatrixT
<
real
>::
invSqrt
(
BaseMatrixT
&
b
)
{
void
BaseMatrixT
<
real
>::
invSqrt
(
BaseMatrixT
&
b
)
{
if
(
useGpu_
)
{
if
(
useGpu_
)
{
applyBinary
(
binary
::
InvSqrt
<
real
>
(),
b
);
applyBinary
(
binary
::
InvSqrt
<
real
>
(),
b
);
...
@@ -775,37 +866,37 @@ void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
...
@@ -775,37 +866,37 @@ void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
IsEqual
,
ONE_PARAMETER
,
a
=
(
b
==
p
));
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
IsEqual
,
ONE_PARAMETER
,
a
=
(
b
==
p
));
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
isEqualTo
(
BaseMatrixT
&
b
,
T
value
)
{
void
BaseMatrixT
<
T
>::
isEqualTo
(
BaseMatrixT
&
b
,
T
value
)
{
applyBinary
(
binary
::
IsEqual
<
T
>
(
value
),
b
);
applyBinary
(
binary
::
IsEqual
<
T
>
(
value
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
AddScalar
,
ONE_PARAMETER
,
a
=
b
+
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
AddScalar
,
ONE_PARAMETER
,
a
=
b
+
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
addScalar
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
addScalar
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
AddScalar
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
AddScalar
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
SubScalar
,
ONE_PARAMETER
,
a
=
b
-
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
SubScalar
,
ONE_PARAMETER
,
a
=
b
-
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
subScalar
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
subScalar
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
SubScalar
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
SubScalar
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
MulScalar
,
ONE_PARAMETER
,
a
=
b
*
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
MulScalar
,
ONE_PARAMETER
,
a
=
b
*
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
mulScalar
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
mulScalar
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
MulScalar
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
MulScalar
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
DivScalar
,
ONE_PARAMETER
,
a
=
b
/
p
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
DivScalar
,
ONE_PARAMETER
,
a
=
b
/
p
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
divScalar
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
divScalar
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
DivScalar
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
DivScalar
<
T
>
(
p
),
b
);
}
}
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
ScalarDiv
,
ONE_PARAMETER
,
a
=
p
/
b
);
DEFINE_MATRIX_BINARY_PARAMETER_OP
(
ScalarDiv
,
ONE_PARAMETER
,
a
=
p
/
b
);
template
<
class
T
>
template
<
class
T
>
void
BaseMatrixT
<
T
>::
scalarDiv
(
BaseMatrixT
&
b
,
T
p
)
{
void
BaseMatrixT
<
T
>::
scalarDiv
(
BaseMatrixT
&
b
,
T
p
)
{
applyBinary
(
binary
::
ScalarDiv
<
T
>
(
p
),
b
);
applyBinary
(
binary
::
ScalarDiv
<
T
>
(
p
),
b
);
}
}
...
@@ -817,20 +908,20 @@ void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy,
                         a = -c * log(b) - (1 - c) * log(1 - b));
template <>
void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::SoftCrossEntropy<real>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b)));
template <class T>
void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy,
                         a = c > 0.5 ? -log(b) : -log(1.0 - b));
template <>
void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
                                                BaseMatrixT& c) {
  if (useGpu_) {
...
@@ -858,70 +949,73 @@ void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp,
                         a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b));
template <class T>
void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(Add, a = b + c);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Add<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c);
template <class T>
void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
  applyTernary(ternary::Add1<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Sub<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c);
template <class T>
void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
  applyTernary(ternary::Sub1<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c);
template <class T>
void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Add2<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER,
                                   a = p1 * a + p2 * b + p3 * c);
template <class T>
void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
  applyTernary(ternary::Add3<T>(p1, p2, p3), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER,
                                   c = p2 * c - p1 * (b + p3 * a);
                                   a = a + c);
template <class T>
void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad
                               BaseMatrixT& c,  // mom
                               T p1,            // learningRate,
                               T p2,            // momentum,
                               T p3) {          // decayRate
  applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c);
}

DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER,
                                      c = p2 * c - p1 * d * (b + p3 * a);
                                      a += c);
template <class T>
void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad,
                               BaseMatrixT& c,  // mom,
                               BaseMatrixT& d,  // lr,
                               T p1,            // learningRate,
                               T p2,            // momentum,
                               T p3) {          // decayRate
  applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d);
}
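In optimizer terms, the two SgdUpdate operators above are momentum SGD with weight decay. Writing the value matrix as w (operand a), the gradient as g (b), the momentum buffer as v (c), and p1/p2/p3 as the learning rate, momentum and decay rate, the ternary form computes

  v \leftarrow \mu v - \eta\,(g + \lambda w), \qquad w \leftarrow w + v

and the quaternary form additionally scales the step by a per-element learning-rate matrix d:

  v \leftarrow \mu v - \eta\, d \odot (g + \lambda w), \qquad w \leftarrow w + v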
...
@@ -929,19 +1023,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b;
                                  a = (a > lambda)
                                          ? (a - lambda)
                                          : (a < -lambda) ? (a + lambda) : 0);
template <class T>
void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) {
  applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr);
}

template <>
void BaseMatrixT<real>::applyL1(BaseMatrixT& lr,
                                real learningRate,
                                real decayRate) {
  if (useGpu_) {
    applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr);
  } else {
    simd::decayL1(this->data_,
                  this->data_,
                  lr.data_,
                  learningRate * decayRate,
                  height_ * width_);
  }
}

...
@@ -950,24 +1047,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p;
                                 a = (a > lambda)
                                         ? (a - lambda)
                                         : (a < -lambda) ? (a + lambda) : 0);
template <class T>
void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) {
  applyUnary(unary::ApplyL1<T>(learningRate * decayRate));
}

template <>
void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) {
  if (useGpu_) {
    applyUnary(unary::ApplyL1<real>(learningRate * decayRate));
  } else {
    simd::decayL1(this->data_,
                  this->data_,
                  learningRate * decayRate,
                  height_ * width_);
  }
}
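The ApplyL1 operators are the per-element soft-threshold used for L1 weight decay: with lambda = learningRate * decayRate (additionally scaled by the per-element learning rate in the overload that takes an lr matrix), each entry is shrunk toward zero,

  a \leftarrow \operatorname{sign}(a)\,\max(|a| - \lambda,\ 0),

which is exactly the three-way conditional in the macro; the CPU branch delegates the same computation to simd::decayL1.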
DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER,
                                  a *= (1.0f / (1.0f + p * b)));
template <class T>
void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
  if (useGpu_) {
    applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr);
...
@@ -980,32 +1078,33 @@ void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
  }
}

template <class T>
void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) {
  BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate));
}

DEFINE_MATRIX_BINARY_OP(DotMul, a *= b);
template <class T>
void BaseMatrixT<T>::dotMul(BaseMatrixT& b) {
  applyBinary(binary::DotMul<T>(), b);
}

DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c);
template <class T>
void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotMul<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c);
template <class T>
void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotDiv<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER,
                                   a = (b + p1) / (c + p2));
template <class T>
void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c);
}
...
@@ -1015,7 +1114,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c;
                                ? THRESHOLD
                                : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
                            a = log(1 + exp(a)) - a * d);
template <>
void BaseMatrixT<real>::rankLoss(BaseMatrixT& b,
                                 BaseMatrixT& c,
                                 BaseMatrixT& d) {
...
@@ -1026,8 +1125,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c;
                            a = (a > THRESHOLD)
                                ? THRESHOLD
                                : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
                            a = exp(a);
                            a = (a / (1 + a) - d));
template <>
void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b,
                                   BaseMatrixT& c,
                                   BaseMatrixT& d) {
...
@@ -1040,7 +1140,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0;
                             ? -THRESHOLD
                             : b;
                         a = log(1 + exp(x)) - c * x);
template <>
void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::LogisticRegressionLoss<real>(), b, c);
}
...
@@ -1050,22 +1150,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0;
                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
                                                                 ? -THRESHOLD
                                                                 : b;
                         x = exp(x);
                         a = x / (1 + x) - c);
template <>
void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b,
                                                 BaseMatrixT& c) {
  applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c);
}
DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f);
template <class T>
void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::BiggerThan<T>(), b, c);
}

DEFINE_MATRIX_QUATERNARY_OP(
    BiggerThan,
    a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
template <class T>
void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
                                BaseMatrixT& c,
                                BaseMatrixT& d) {
...
@@ -1073,25 +1174,34 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
}

DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
template <class T>
void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::Max<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER,
                                   c += ((a > p) == (b > p)) ? 0.0f : 1.0f);
template <class T>
void BaseMatrixT<T>::binaryClassificationError2(size_t destCol,
                                                BaseMatrixT& b,
                                                BaseMatrixT& c,
                                                T p) {
  CHECK(!useGpu_) << "do not support gpu";
  MatrixOffset offset(0, 0, 0, 0, destCol, 0);
  int numRows = b.height_;
  int numCols = b.width_;
  b.applyTernary(ternary::BinaryClassificationError<T>(p),
                 c,
                 *this,
                 numRows,
                 numCols,
                 offset,
                 false_type(),
                 true_type() /*cAsColVector*/);
}

template <>
void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
                                                  BaseMatrixT& b,
                                                  BaseMatrixT& c,
...
@@ -1099,127 +1209,148 @@ void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
  aggregate(aggregate::sum(),
            base::binary::classificationError(p),
            base::binary::add(),
            b,
            c,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
}
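A reading of this aggregate call (a sketch inferred from the operands, not verified against the classificationError functor): with the aAsColVector flag set, row i of b and c is combined elementwise by base::binary::classificationError(p) and summed, and the result lands in column destCol of this matrix, roughly

  \text{this}(i, \text{destCol}) = \sum_j \mathbf{1}\big[(b_{ij} > p) \ne (c_{ij} > p)\big],

mirroring the CPU-only binaryClassificationError2 path above, which counts the positions where the output and label fall on opposite sides of the threshold p.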
DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER,
                                      a = p1 * b + p2 * c + p3 * d);
template <class T>
void BaseMatrixT<T>::add3(BaseMatrixT& b,
                          BaseMatrixT& c,
                          BaseMatrixT& d,
                          T p1,
                          T p2,
                          T p3) {
  applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d);
}

DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c);
template <class T>
void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotMulSquare<T>(), b, c);
}

DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c);
template <class T>
void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) {
  applyTernary(ternary::DotSquareSquare<T>(), b, c);
}

DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b);
template <class T>
void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) {
  applyBinary(binary::DotMulSquare<T>(), b);
}

DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b);
template <class T>
void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) {
  applyBinary(binary::DotSquareMul<T>(), b);
}

DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER,
                                      T tmp = p1 * b + p2 * c + p3 * d;
                                      a += tmp * tmp);
template <class T>
void BaseMatrixT<T>::addSquareSum(
    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) {
  applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b);
template <class T>
void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) {
  applyBinary(binary::AddSquare<T>(p), b);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER,
                                  a = p1 * a + p2 * b * b);
template <class T>
void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::DecayAddSquare<T>(p1, p2), b);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER,
                                   a = p1 * a + p2 * b * b * c * c);
template <class T>
void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b,
                                       BaseMatrixT& c,
                                       T p1,
                                       T p2) {
  applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER,
                                   a = 1 / (p1 * b + p2 * c + p3));
template <class T>
void BaseMatrixT<T>::reciprocalSum(
    BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
  applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c);
}

DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER,
                                  a = 1 / (p1 * b + p2));
template <class T>
void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
  applyBinary(binary::Reciprocal2<T>(p1, p2), b);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER,
                                   T tmp = p1 * b + p2 * c;
                                   a *= tmp * tmp);
template <class T>
void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b,
                                     BaseMatrixT& c,
                                     T p1,
                                     T p2) {
  applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER,
                                   T tmp = p1 * b + p2 * c;
                                   a = tmp * tmp);
template <class T>
void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER,
                                   a *= p1 * b + p2 * c);
template <class T>
void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::DotMulSum<T>(p1, p2), b, c);
}

DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0);
template <class T>
void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) {
  applyBinary(binary::CopyAndClear<T>(), b);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER,
                                   a = p1 * a + p2 * b * c);
template <class T>
void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
  applyTernary(ternary::AddDotMul<T>(p1, p2), b, c);
}

DEFINE_MATRIX_BINARY_OP(Assign, a = b;);
template <class T>
void BaseMatrixT<T>::assign(BaseMatrixT& b) {
  if (useGpu_) {
    applyBinary(binary::Assign<T>(), b);
...
@@ -1230,7 +1361,7 @@ void BaseMatrixT<T>::assign(BaseMatrixT& b) {
  }
}

template <class T>
void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
  if (columnOffset + b.width_ <= width_) {
    int numRows = height_;
...
@@ -1250,24 +1381,31 @@ void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
}

DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
template <class T>
void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
  applyBinary(binary::DeepSwap<T>(), b);
}

template <>
void BaseMatrixT<real>::rowDotMul(size_t destCol,
                                  BaseMatrixT& b,
                                  BaseMatrixT& c) {
  int numRows = b.height_;
  int numCols = b.width_;
  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
  aggregate(aggregate::sum(),
            base::binary::mul(),
            base::binary::add(),
            b,
            c,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
}

template <class T>
void BaseMatrixT<T>::rowDotMul2(size_t destCol,
                                BaseMatrixT& b,
                                BaseMatrixT& c) {
...
@@ -1290,17 +1428,24 @@ void BaseMatrixT<T>::rowDotMul2(size_t destCol,
  }
}

template <>
void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = b.height_;
  int numCols = b.width_;
  aggregate(aggregate::sum(),
            base::binary::mul(),
            base::binary::add(),
            b,
            c,
            numRows,
            numCols,
            offset,
            true_type() /*aAsRowVector*/,
            false_type());
}

template <class T>
void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
  CHECK(!useGpu_) << "do not support gpu";
...
@@ -1321,16 +1466,22 @@ void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
}

DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c);
template <class T>
void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::addDotMulMMV<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               true_type() /*cAsRowVector*/,
               false_type());
}

template <class T>
void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
  CHECK(!useGpu_) << "do not support gpu";
...
@@ -1350,16 +1501,22 @@ void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
  }
}

template <class T>
void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::DotMul<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               false_type(),
               true_type() /*cAsColVector*/);
}

template <class T>
void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  CHECK(!useGpu_) << "do not support gpu";
...
@@ -1379,52 +1536,82 @@ void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  }
}

template <class T>
void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::DotMul<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               true_type() /* cAsRowVector */,
               false_type() /* cAsColVector */);
}

template <class T>
void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::addDotMulMMV<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               true_type() /* cAsRowVector */,
               false_type() /* cAsColVector */);
}

template <class T>
void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::addDotMulMMV<T>(),
               b,
               c,
               numRows,
               numCols,
               offset,
               false_type(),
               true_type() /*cAsColVector*/);
}

DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c);
template <class T>
void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) {
  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
  int numRows = height_;
  int numCols = width_;
  applyTernary(ternary::RowAdd<T>(p),
               b,
               c,
               numRows,
               numCols,
               offset,
               false_type(),
               true_type() /*cAsColVector*/);
}
DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c));
template <>
void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  if (useGpu_) {
    MatrixOffset offset(0, 0, 0, 0, cCol, 0);
    int numRows = height_;
    int numCols = width_;
    applyTernary(ternary::RowPow<real>(),
                 b,
                 c,
                 numRows,
                 numCols,
                 offset,
                 false_type(),
                 true_type() /*cAsColVector*/);
  } else {
    size_t height = this->height_;
    size_t width = this->width_;
...
@@ -1441,44 +1628,64 @@ void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
  }
}

template <class T>
void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotMul<T>(),
              b,
              numRows,
              numCols,
              offset,
              true_type() /* bAsRowVector */,
              false_type());
}

DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b);
template <class T>
void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotDiv<T>(),
              b,
              numRows,
              numCols,
              offset,
              true_type() /* bAsRowVector */,
              false_type());
}

template <class T>
void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotMul<T>(),
              b,
              numRows,
              numCols,
              offset,
              false_type(),
              true_type() /* bAsColVector */);
}

template <class T>
void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0);
  int numRows = height_;
  int numCols = width_;
  applyBinary(binary::DotDiv<T>(),
              b,
              numRows,
              numCols,
              offset,
              false_type(),
              true_type() /* bAsColVector */);
}
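These four helpers broadcast a vector over the matrix: with bAsRowVector the argument is read as a 1 x width row applied to every row, and with bAsColVector as a height x 1 column applied across each row. A small sketch (names and shapes are illustrative only):

  // A is h x w, rowV is 1 x w, colV is h x 1 (all BaseMatrixT<real>).
  A.mulRowVector(rowV);  // A[i][j] *= rowV[0][j]
  A.divRowVector(rowV);  // A[i][j] /= rowV[0][j]
  A.mulColVector(colV);  // A[i][j] *= colV[i][0]
  A.divColVector(colV);  // A[i][j] /= colV[i][0]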
template <>
template <class Agg>
int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1486,13 +1693,20 @@ int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(height_, numRows);
  CHECK_EQ(width_, 1UL);
  aggregate(agg,
            base::unary::identity(),
            base::binary::second(),
            b,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
  return 0;
}

template <>
template <class Agg, class Saver>
int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1500,16 +1714,25 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(height_, numRows);
  CHECK_EQ(width_, 1UL);
  aggregate(agg,
            base::unary::identity(),
            sv,
            b,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
  return 0;
}
template <>
template <class Agg>
int BaseMatrixT<real>::applyRow(
    Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
  if (scaleDest != 0) {
    applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
  } else {
...
@@ -1521,10 +1744,10 @@ int BaseMatrixT<real>::applyRow(
  return 0;
}

template <>
template <class Agg, class Op, class Saver>
int BaseMatrixT<real>::applyRow(
    Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
  size_t numRows = b.height_;
  size_t numCols = b.width_;
...
@@ -1532,16 +1755,27 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
  CHECK_EQ(width_, 1UL);
  CHECK_EQ(c.height_, numRows);
  CHECK_EQ(c.width_, numCols);
  aggregate(agg,
            op,
            sv,
            b,
            c,
            numRows,
            numCols,
            offset,
            false_type(),
            true_type() /*aAsColVector*/);
  return 0;
}
template <>
template <class Agg, class Op>
int BaseMatrixT<real>::applyRow(Agg agg,
                                Op op,
                                real scaleDest,
                                real scaleAgg,
                                BaseMatrixT& b,
                                BaseMatrixT& c) {
  if (scaleDest != 0) {
    applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
  } else {
...
@@ -1553,7 +1787,7 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
  return 0;
}

template <>
template <class Agg>
int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1561,13 +1795,20 @@ int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(width_, numCols);
  CHECK_EQ(height_, 1UL);
  aggregate(agg,
            base::unary::identity(),
            base::binary::second(),
            b,
            numRows,
            numCols,
            offset,
            true_type() /*aAsRowVector*/,
            false_type());
  return 0;
}
template <>
template <class Agg, class Saver>
int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
  MatrixOffset offset(0, 0, 0, 0, 0, 0);
...
@@ -1575,16 +1816,25 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
  size_t numCols = b.width_;
  CHECK_EQ(width_, numCols);
  CHECK_EQ(height_, 1UL);
  aggregate(agg,
            base::unary::identity(),
            sv,
            b,
            numRows,
            numCols,
            offset,
            true_type() /*aAsRowVector*/,
            false_type());
  return 0;
}
template <>
template <class Agg>
int BaseMatrixT<real>::applyCol(
    Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
  if (scaleDest != 0) {
    applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
  } else {
...
@@ -1596,48 +1846,51 @@ int BaseMatrixT<real>::applyCol(
  return 0;
}

template <>
void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
  applyRow(aggregate::sum(), scaleDest, scaleSum, b);
}

template <>
void BaseMatrixT<real>::maxRows(BaseMatrixT& b) {
  applyRow(aggregate::max(), b);
}

template <>
void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
  applyRow(aggregate::min(), b);
}

template <>
void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
  applyCol(aggregate::max(), b);
}

template <>
void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
  applyCol(aggregate::min(), b);
}

template <>
void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
  applyCol(aggregate::sum(), scaleDest, scaleSum, b);
}

template <>
void BaseMatrixT<real>::sumOfSquaredDiffs(BaseMatrixT& b,
                                          BaseMatrixT& c,
                                          real scaleSum,
                                          real scaleDest) {
  applyRow(
      aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c);
}

template <>
void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
                                      BaseMatrixT& c,
                                      real scaleSum,
                                      real scaleDest) {
  applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c);
}

template class BaseMatrixT<real>;
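Shape convention implied by the applyRow/applyCol checks above: the destination of a row aggregation must be a height x 1 column vector, and the destination of a column aggregation a 1 x width row vector. As a sketch of the semantics that follow from the code (not a separate specification), sumRows(b, scaleSum, scaleDest) computes

  a_i \leftarrow \text{scaleDest}\cdot a_i + \text{scaleSum}\cdot \sum_j b_{ij},

with maxRows/minRows replacing the sum by a max/min (and no rescaling), and sumCols/maxCols/minCols being the transposed, per-column versions.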
...
paddle/math/TrainingAlgorithmOp.cu
View file @ 59a8ebc6
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "BaseMatrix.h"
#include "TrainingAlgorithmOp.h"
#include "paddle/utils/Logging.h"

#if __cplusplus > 199711L
...
@@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value,
                         real tau,
                         real learningRate) {
  auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
  auto expr2 =
      momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad);
  auto expr3 = value.lazyAssign(
      (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV);
  AssignEvaluate(expr1, expr2, expr3);
}
...
@@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value,
                   real momentum,
                   real decayRate) {
  auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
  auto expr2 =
      lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt());
  auto expr3 = accum_update.lazyAssign(
      rou * accum_update + ((real)1 - rou) * (grad * lr).square());
  auto expr4 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr5 = value.lazyAssign(value + mom);
  AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
...
@@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value,
                  real momentum,
                  real decayRate) {
  auto expr1 = accum.lazyAssign(accum + grad.square());
  auto expr2 =
      lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal());
  auto expr3 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr4 = value.lazyAssign(value + mom);
  AssignEvaluate(expr1, expr2, expr3, expr4);
...
@@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value,
                  bool firstTime) {
  auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
  auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
  auto expr4 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr5 = value.lazyAssign(value + mom);
  if (firstTime) {
...
@@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value,
    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
  } else {
    auto expr1 = g.lazyAssign(
        accumulatedRou * g + ((real)1 - rou) * grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
  }
...
@@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value,
                         real decayRate,
                         bool firstTime) {
  auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
  auto expr3 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr4 = value.lazyAssign(value + mom);
  if (firstTime) {
...
@@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value,
    AssignEvaluate(expr1, expr2, expr3, expr4);
  } else {
    auto expr1 = accum.lazyAssign(
        accumulatedRou * accum + ((real)1 - rou) * grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4);
  }
...
@@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value,
               real beta2_power,
               real epsilon,
               real learningRate) {
  real alpha =
      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
  auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
  auto expr3 = value.lazyAssign(
      value - (mom * alpha) / (v.sqrt() + epsilon));
  AssignEvaluate(expr1, expr2, expr3);
}
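For reference, this is the standard Adam step written out; the alpha computed above folds the bias corrections into the learning rate:

  m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g_t
  v_t = \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2
  \theta_t = \theta_{t-1} - \eta\,\frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t}\cdot\frac{m_t}{\sqrt{v_t}+\epsilon}

with beta1_power and beta2_power playing the roles of \beta_1^t and \beta_2^t.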
...
@@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value,
                 int64_t step,
                 real alpha) {
  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
  auto expr2 = u.lazyAssign(
      (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
  auto expr3 = value.lazyAssign(
      value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
  AssignEvaluate(expr1, expr2, expr3);
}
...
@@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value,
               real beta2_power,
               real epsilon,
               real learningRate) {
  real alpha =
      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);

  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
  mom = beta1 * mom + ((real)1 - beta1) * grad;
...
@@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value,
  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
  v = beta2 * v + ((real)1 - beta2) * grad.square();

  value -= (mom * alpha) / (v.sqrt() + epsilon);
}

void adamaxApply(BaseMatrix& value,
...
...
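Spelled out, the Adam step that both the lazyAssign and the plain BaseMatrix hunks above implement is, writing m for mom, v for v, g for grad, \theta for value, \eta for learningRate, and beta1_power, beta2_power for \beta_1^t, \beta_2^t (this is only a restatement of the code above):

    m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
    v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
    \alpha_t = \eta \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)
    \theta_t = \theta_{t-1} - \alpha_t m_t / (\sqrt{v_t} + \epsilon)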
paddle/math/tests/test_Tensor.cu
View file @
59a8ebc6
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "TensorCheck.h"
#include "paddle/math/Matrix.h"

using paddle::Matrix;
using paddle::CpuMatrix;
...
@@ -26,25 +26,25 @@ using paddle::GpuIVector;
using autotest::TensorCheckEqual;
using autotest::TensorCheckErr;

#define INIT_UNARY(A1, A2)  \
  Tensor A1(height, width); \
  Tensor A2(height, width); \
  A1.randomizeUniform();    \
  A2.copyFrom(A1)
#define INIT_BINARY(A1, A2, B) \
  INIT_UNARY(A1, A2);          \
  Tensor B(height, width);     \
  B.randomizeUniform()
#define INIT_TERNARY(A1, A2, B, C) \
  INIT_BINARY(A1, A2, B);          \
  Tensor C(height, width);         \
  C.randomizeUniform()
#define INIT_QUATERNARY(A1, A2, B, C, D) \
  INIT_TERNARY(A1, A2, B, C);            \
  Tensor D(height, width);               \
  D.randomizeUniform()

template <typename Tensor>
struct TestUnaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
...
@@ -59,7 +59,7 @@ struct TestUnaryMatrix {
  }
};

template <typename Tensor>
struct TestBinaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
...
@@ -74,10 +74,10 @@ struct TestBinaryMatrix {
  }
};

template <typename Tensor>
struct TestTernaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
      TernaryFunc;

  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
...
@@ -90,10 +90,11 @@ struct TestTernaryMatrix {
  }
};

template <typename Tensor>
struct TestQuaternaryMatrix {
  typedef std::function<void(
      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
      QuaternaryFunc;

  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
...
@@ -106,7 +107,7 @@ struct TestQuaternaryMatrix {
  }
};

template <typename Tensor, class T>
struct TestUnaryVectorT {
  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
...
@@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) {
  }
}

template <typename Tensor>
void testTensorAddScalar(Tensor& A1, Tensor& A2) {
  real p1 = 2.5;
  real p2 = 3.0;
  A1.add(p1);  // a += p
  A2 += p1;
  TensorCheckEqual(A1, A2);
...
@@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSubScalar(Tensor& A1, Tensor& A2) {
  real p = 2.5;
  A1.subScalar(p);  // a -= p
...
@@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMulScalar(Tensor& A1, Tensor& A2) {
  real p = 2.5;
  A1.mulScalar(p);  // a *= p
...
@@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorDivScalar(Tensor& A1, Tensor& A2) {
  real p = 2.5;
  A1.divScalar(p);  // a /= p
...
@@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorNeg(Tensor& A1, Tensor& A2) {
  A1.neg();  // a = -a
  A2 = -A2;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2) {
  A1.abs2();  // a = a > 0 ? a : -a
  A2 = A2.abs();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2) {
  A1.square2();  // a = a * a
  A2 = A2.square();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2) {
  A1.reciprocal2();  // a = 1.0f / a
  A2 = A2.reciprocal();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2) {
  A1.sign2();  // a = (a > 0) - (a < 0)
  A2 = A2.sign();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2) {
  A1.assign(1.5);  // a = p
  A2 = A2.constant(1.5);
  TensorCheckEqual(A1, A2);
...
@@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
  testTensorAddScalar(A1, A2);
  testTensorSubScalar(A1, A2);
...
@@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
  testTensorAssign(A1, A2);
}

template <typename Tensor>
void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
  A1.add(2);  // a += p
  A2 += 2;
  TensorCheckEqual(A1, A2);
...
@@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
TEST(Unary, BaseOp) {
  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
  TestUnaryVectorT<CpuIVector, int> testCpuIVector(
      testUnaryBaseOpInt<CpuIVector>);

#ifndef PADDLE_ONLY_CPU
  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
  TestUnaryVectorT<GpuIVector, int> testGpuIVector(
      testUnaryBaseOpInt<GpuIVector>);
#endif
}

template <typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2) {
  A1.exp2();  // a = exp(a)
  A2 = A2.exp();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2) {
  A1.log2();  // a = log(a)
  A2 = A2.log();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2) {
  A1.sqrt2();  // a = sqrt(a)
  A2 = A2.sqrt();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2) {
  A1.pow2(3.2);  // a = pow(a, p)
  A2 = A2.pow(3.2);
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testUnayrMathOp(Tensor& A1, Tensor& A2) {
  testTensorExp(A1, A2);
  testTensorLog(A1, A2);
...
@@ -321,7 +322,7 @@ TEST(Unary, MathOp) {
#endif
}

template <typename Tensor>
void testTensorClip(Tensor& A1, Tensor& A2) {
  real p1 = 0.003f;
  real p2 = 0.877f;
...
@@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
  real p = 0.5f;
  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
...
@@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2) {
  /**
   * T lambda = p;
...
@@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) {
  real learningRate = 0.7f;
  real decayRate = 0.6f;
  A1.applyL1(learningRate, decayRate);
  A2 = (A2 > (learningRate * decayRate))
           .condition(
               (A2 - (learningRate * decayRate)),
               (A2 < -(learningRate * decayRate))
                   .condition((A2 + (learningRate * decayRate)), (real)0.0));
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
  testTensorClip(A1, A2);
  testTensorBiggerThanScalar(A1, A2);
...
@@ -377,7 +379,7 @@ TEST(Unary, CompareOp) {
#endif
}

template <typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.2;
...
@@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 2.5;
  A1.sub(B);  // a -= b
...
@@ -422,7 +424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 2.5;
  A1.mulScalar(B, p);  // a = b * p
...
@@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 2.5;
  A1.divScalar(B, p);  // a = b / p
...
@@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.assign(B);  // a = b
  A2 = B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
  B.square2(A1);  // b = a * a
  A2 = B.square();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.squareDerivative(B);  // a *= 2.0 * b
  A2 = A2 * (real)2.0 * B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
  B.reciprocal2(A1);  // b = 1.0f / a
  A2 = B.reciprocal();
...
@@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
  real learningRate = 0.7f;
  real decayRate = 1.2f;
  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
  A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
            .reciprocal();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.reciprocalDerivative(B);  // a *= -b * b
  A2 *= (-B) * B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
  A2 = B.sign();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
  B.abs2(A1);  // b = a > 0.0f ? a : -a
  A2 = B.abs();
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
  testTensorAdd(A1, A2, B);
  testTensorSub(A1, A2, B);
...
@@ -539,7 +541,7 @@ TEST(Binary, BaseOp) {
#endif
}

template <typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = exp(b)
  A1.exp2(B);
...
@@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.expDerivative(B);  // a *= b
  A2 *= B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = log(b)
  A1.log2(B);
...
@@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = sqrt(b)
  A1.sqrt2(B);
...
@@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = 1.0f / sqrt(b)
  A1.invSqrt(B);
...
@@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.pow2(B, 2.5f);  // a = pow(b, p)
  A2 = B.pow(2.5f);
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * const T THRESHOLD = 40.0;
...
@@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  real THRESHOLD = 40.0;
  A2 = (B.constant(1.0f) +
        (B > THRESHOLD)
            .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
            .exp())
           .log();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * const T THRESHOLD = 40.0;
...
@@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   */
  A1.softreluDerivative(B);
  real THRESHOLD = 40.0;
  A2 = A2 *
       (B.constant(1.0f) -
        (B.constant(-1.0f) *
         (B > THRESHOLD)
             .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
            .exp());
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
  const T THRESHOLD_MIN = -40.0;
...
@@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
  const real THRESHOLD_MIN = -40.0;
  const real THRESHOLD_MAX = 13.0;
  auto tmp = (B < THRESHOLD_MIN)
                 .condition(THRESHOLD_MIN,
                            (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
  A2 *= B * (B.constant(1.0f) - B);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.tanhDerivative(B);  // a *= 1 - b * b
  A2 *= B.constant(1.0f) - B * B;
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.1;
  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
  B.scaledTanh(A1, p1, p2);
  A2 = B.constant(p1) *
       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
        (real)1.0);
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.1;
...
@@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
  testTensorTanhDerivative(A1, A2, B);
  testTensorScaledTanhDerivative(A1, A2, B);
...
@@ -708,21 +715,21 @@ TEST(Binary, MathOp) {
#endif
}

template <typename Tensor>
void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * b = a > p1 ? a : p1
...
@@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  SetTensorValue(B, 32.0f);
  /*
...
@@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
  A2 = (B > (real)0.0f)
           .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 0.613;
  SetTensorValue(B, p);
...
@@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
  /**
   * T lambda = p * b;
...
@@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
  real decayRate = 0.6f;
  A1.applyL1(B, learningRate, decayRate);
  auto lambda = B.constant(learningRate * decayRate) * B;
  A2 = (A2 > lambda)
           .condition((A2 - lambda),
                      (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
  B.subScalar(0.5f);
  SetTensorValue(B, 0.0f);
...
@@ -807,7 +815,7 @@ TEST(Binary, CompareOp) {
#endif
}

template <typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.add(B, C);  // a = b + c
  A2 = B + C;
...
@@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.sub(B, C);  // a = b - c
  A2 = B - C;
...
@@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.dotMul(B, C);  // a = b * c
  A2 = B * C;
...
@@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
  A2 = (B == (real)0.0).condition((real)0.0, B / C);
...
@@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  real p1 = 1.5;
  real p2 = 2.5;
...
@@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorSoftCrossEntropyBp(Tensor& A1,
                                  Tensor& A2,
                                  Tensor& B,
...
@@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1,
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  testTensorAdd(A1, A2, B, C);
  testTensorSub(A1, A2, B, C);
...
@@ -952,30 +960,30 @@ TEST(Ternary, BaseOp) {
#endif
}

template <typename Tensor>
void testTensorBinaryLabelCrossEntropy(Tensor& A1,
                                       Tensor& A2,
                                       Tensor& B,
                                       Tensor& C) {
  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
  A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log()));
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
                                         Tensor& A2,
                                         Tensor& B,
                                         Tensor& C) {
  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
  A1.binaryLabelCrossEntropyBp(B, C);
  A2 += (C > (real)0.5)
            .condition((B.constant(-1.0f) / B),
                       (B.constant(1.0f) - B).reciprocal());
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorLogisticRegressionLoss(Tensor& A1,
                                      Tensor& A2,
                                      Tensor& B,
...
@@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1,
   */
  A1.logisticRegressionLoss(B, C);
  real THRESHOLD = 40.0;
  auto tmp =
      (B > THRESHOLD)
          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorLogisticRegressionLossBp(Tensor& A1,
                                        Tensor& A2,
                                        Tensor& B,
...
@@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1,
   */
  A1.logisticRegressionLossBp(B, C);
  real THRESHOLD = 40.0;
  auto tmp =
      (B > THRESHOLD)
          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
  auto tmp2 = tmp.exp();
  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
  A2 = (B > C).condition((real)1.0f, (real)0.0f);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.max2(B, C);  // a = (b > c) ? b : c
  A2 = (B > C).condition(B, C);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
...
@@ -1053,12 +1063,9 @@ TEST(Ternary, CompareOp) {
#endif
}

template <typename Tensor>
void testQuaternaryAdd(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
  // TensorCheckEqual(A1, A2);
...
@@ -1084,25 +1091,19 @@ TEST(Quaternary, BaseOp) {
#endif
}

template <typename Tensor>
void testTensorBiggerThan(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
  A1.biggerThan(B, C, D);
  A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
           .condition((real)1.0, (real)0.0);
  TensorCheckEqual(A1, A2);
}

template <typename Tensor>
void testTensorRankLoss(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  /**
   * const T THRESHOLD = 40.0; a = b - c;
   * a = (a > THRESHOLD)
...
@@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1,
  real THRESHOLD = 40.0;
  auto tmp = B - C;
  auto tmp2 =
      (tmp > THRESHOLD)
          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;

  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testTensorRankLossBp(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  /**
   * const T THRESHOLD = 40.0; a = b - c;
   * a = (a > THRESHOLD)
...
@@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1,
  A1.rankLossBp(B, C, D);
  real THRESHOLD = 40.0;
  auto tmp = B - C;
  auto tmp2 =
      (tmp > THRESHOLD)
          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
  auto tmp3 = tmp2.exp();
  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
  TensorCheckErr(A1, A2);
}

template <typename Tensor>
void testQuaternaryCompareOp(
    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
  testTensorBiggerThan(A1, A2, B, C, D);
  testTensorRankLoss(A1, A2, B, C, D);
  testTensorRankLossBp(A1, A2, B, C, D);
...
paddle/math/tests/test_lazyAssign.cu
View file @
59a8ebc6
...
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "PerfUtils.h"
#include "TensorCheck.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/TensorAssign.h"

using paddle::BaseMatrix;
using paddle::CpuMatrix;
...
@@ -27,14 +27,28 @@ using autotest::TensorCheckErr;
typedef std::function<void(int height, int width)> testMatrixFunc;

void testMatrixCase(testMatrixFunc matrixFunc) {
  for (auto height : {1}) {
    for (auto width : {1,       32,      64,      128,     512,     1024,
                       4096,    32768,   65536,   131072,  262144,  524288,
                       1048576, 2097152, 4194304, 8388608}) {
      matrixFunc(height, width);
    }
  }
}

template <typename Tensor>
void testLazyAssign(int height, int width) {
  Tensor A1(height, width);
  Tensor A2(height, width);
...
@@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) {
  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);

  EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
                         auto expr2 = A2.lazyAssign(A2 * D);
                         AssignEvaluate(expr1, expr2););

  TensorCheckErr(A1, A2);
}

TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }

#ifndef PADDLE_ONLY_CPU
TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
#endif

template <typename Tensor>
void sgdUpdateTensor(
    Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
  C = C * p2 - D * (B + A * p3) * p1;
  A += C;
}

void sgdUpdateLazyAssign(BaseMatrix& A,
                         BaseMatrix& B,
                         BaseMatrix& C,
                         BaseMatrix& D,
                         real p1,
                         real p2,
                         real p3) {
  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
  auto expr2 = A.lazyAssign(A + C);
  AssignEvaluate(expr1, expr2);
}

template <typename Tensor>
void testSgdUpdate(int height, int width) {
  Tensor A1(height, width);
  Tensor A2(height, width);
...
@@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) {
   * a = a + c;
   */
  // BaseMatrix API
  EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););

  // Tensor expression
  EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));

  // lazyAssign
  EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));

  TensorCheckErr(A1, A2);
  TensorCheckErr(A1, A3);
...
@@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) {
  TensorCheckErr(C1, C3);
}

TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }

#ifndef PADDLE_ONLY_CPU
TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
#endif
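A minimal sketch of the pattern this test benchmarks, using only the CpuMatrix, lazyAssign and AssignEvaluate calls that already appear above (the function name and the sizes are arbitrary, chosen for illustration):

#include "paddle/math/Matrix.h"
#include "paddle/math/TensorAssign.h"

void lazyAssignSketch() {
  const int height = 16, width = 64;  // arbitrary illustration sizes
  paddle::CpuMatrix A(height, width), B(height, width), C(height, width), D(height, width);
  B.randomizeUniform();
  C.randomizeUniform();
  D.randomizeUniform();

  // Eager tensor expressions: each assignment is evaluated on its own.
  A = B + C;
  A = A * D;

  // Lazy form: build both expressions first, then evaluate them together.
  auto expr1 = A.lazyAssign(B + C);
  auto expr2 = A.lazyAssign(A * D);
  AssignEvaluate(expr1, expr2);
}

Judging from the EXPRESSION_PERFORMANCE comparison above, the intent of the lazy form is that AssignEvaluate evaluates the queued expressions together rather than one assignment at a time.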
paddle/math/tests/test_matrixCompare.cpp
View file @
59a8ebc6
...
@@ -1146,7 +1146,7 @@ void testBatch2seqPadding(int batchSize, int inputDim) {
  IVectorPtr cpuSequence;
  generateSequenceStartPositions(batchSize, cpuSequence);
-  for (int i = 0; i < cpuSequence->getSize(); ++i) {
+  for (int i = 0; i < int(cpuSequence->getSize()); ++i) {
    (cpuSequence->getData())[i] += 1;  // so no way that maxSeqLen is 0;
  }
...
paddle/operators/.clang-format
0 → 100644
View file @
59a8ebc6
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
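This is a Google-based clang-format configuration for C++11; running `clang-format -i -style=file <file>` from a directory under paddle/operators picks it up automatically, which is presumably what drives the re-wrapping seen in the test files above.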
paddle/operators/CMakeLists.txt
View file @
59a8ebc6
...
@@ -63,5 +63,6 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
op_library(fc_op
    SRCS fc_op.cc
    DEPS mul_op rowwise_add_op sigmoid_op softmax_op net_op)
-op_library(recurrent_op SRCS recurrent_op.cc DEPS op_desc tensor op_registry operator net_op)
+op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
+           DEPS op_desc tensor op_registry operator net_op)
cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
paddle/operators/add_op.cc
View file @
59a8ebc6
...
@@ -18,10 +18,10 @@ namespace paddle {
namespace operators {

class AddOp : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
-    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of AddOp must be two");
-    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
+    PADDLE_ENFORCE_EQ(ctx.InputSize(), 2);
+    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1);
    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr,
                   "Inputs of AddOp must all be set");
    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
...
@@ -33,7 +33,7 @@ protected:
};

class AddOpMaker : public OpProtoAndCheckerMaker {
 public:
  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of add op");
...
@@ -48,7 +48,7 @@ The equation is: Out = X + Y
};

class AddOpGrad : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {}
};
...
paddle/operators/add_op.h
View file @
59a8ebc6
...
@@ -20,7 +20,7 @@ namespace operators {
template <typename Place, typename T>
class AddKernel : public OpKernel {
 public:
  void Compute(const ExecutionContext &context) const override {
    auto input0 = context.Input<Tensor>(0);
    auto input1 = context.Input<Tensor>(1);
...
paddle/operators/cross_entropy_op.cc
@@ -18,7 +18,7 @@ namespace paddle {
 namespace operators {

 class OnehotCrossEntropyOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 2,
                    "Input size of OnehotCrossEntropyOp must be two");
@@ -36,8 +36,19 @@ protected:
   }
 };

+class OnehotCrossEntropyGradientOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto X = ctx.Input<Tensor>("X");
+    // TODO(superjom) add enforce here after helper functions ready
+    X_grad->Resize(X->dims());
+  }
+};
+
 class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker {
 public:
   OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of OnehotCrossEntropyOp");
@@ -54,8 +65,11 @@ OnehotCrossEntropy Operator.
 }  // namespace operators
 }  // namespace paddle
 REGISTER_OP(onehot_cross_entropy,
             ops::OnehotCrossEntropyOp,
             ops::OnehotCrossEntropyOpMaker);
 REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
                        ops::OnehotCrossEntropyOpKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    onehot_cross_entropy_grad,
+    ops::OnehotCrossEntropyGradientOpKernel<ops::CPUPlace, float>);
paddle/operators/cross_entropy_op.h
@@ -18,28 +18,53 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+static const float kCrossEntropyLogThreshold{1e-20};
+
 template <typename Place, typename T>
 class OnehotCrossEntropyOpKernel : public OpKernel {
 public:
-  constexpr T LOG_THRESHOLD() const { return static_cast<T>(1e-20); }
-
   void Compute(const ExecutionContext &ctx) const override {
-    auto X = ctx.Input<Tensor>(0);
-    const T *X_data = X->data<T>();
+    auto X = ctx.Input<Tensor>("X");
+    const T *Xdata = X->data<T>();
     const int *label_data = ctx.Input<Tensor>(1)->data<int>();
-    auto Y = ctx.Output<Tensor>(0);
+    auto Y = ctx.Output<Tensor>("Y");

     Y->mutable_data<T>(ctx.GetPlace());
-    T *Y_data = Y->data<T>();
+    T *Ydata = Y->data<T>();

     int batch_size = X->dims()[0];
     int class_num = X->dims()[1];

     // Y[i] = -log(X[i][j])
     for (int i = 0; i < batch_size; ++i) {
-      Y_data[i] = -std::log(std::max(X_data[i * class_num + label_data[i]],
-                                     LOG_THRESHOLD()));
+      Ydata[i] = -std::log(std::max(Xdata[i * class_num + label_data[i]],
+                                    kCrossEntropyLogThreshold));
     }
   }
 };

+template <typename Place, typename T>
+class OnehotCrossEntropyGradientOpKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext &ctx) const override {
+    auto X = ctx.Input<Tensor>("X");
+    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto label = ctx.Input<Tensor>("label");
+
+    auto *dXdata = dX->template mutable_data<T>(ctx.GetPlace());
+    auto *dYdata = dY->template data<T>();
+    auto *Xdata = X->template data<T>();
+    auto *label_data = label->data<int>();
+
+    const int batch_size = X->dims()[0];
+    const int class_num = X->dims()[1];
+
+    for (int i = 0; i < batch_size; ++i) {
+      dXdata[i * class_num + label_data[i]] =
+          -dYdata[i] / std::max(Xdata[i * class_num + label_data[i]],
+                                kCrossEntropyLogThreshold);
+    }
+  }
+};
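For readers checking the two kernels above, the relation they implement is the standard one-hot cross entropy; the following is only a worked restatement of the code (notation is ours, not the patch's), where l_i is the label of sample i and eps = kCrossEntropyLogThreshold:

    Y_i = -\log\big(\max(X_{i,\,l_i},\ \varepsilon)\big)

    \frac{\partial L}{\partial X_{i,j}} =
      \begin{cases}
        -\,dY_i \,/\, \max(X_{i,\,l_i},\ \varepsilon) & j = l_i \\
        0 & \text{otherwise}
      \end{cases}

The gradient kernel writes only the labelled column of dX, which matches the second case being zero.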
paddle/operators/fc_op.cc
@@ -18,31 +18,29 @@ namespace paddle {
 namespace operators {

 class FullyConnectedOp : public NetOp {
 public:
   void Init() override {
     AddOp(OpRegistry::CreateOp("mul",
                                {
                                    Input("X"), Input("W"),
                                },
                                {Output("before_act")},
                                {}));
     auto b = Input("b");
     if (b != framework::kEmptyVarName) {
       AddOp(OpRegistry::CreateOp("rowwise_add",
                                  {Output("before_act"), Input("b")},
                                  {Output("before_act")},
                                  {}));
     }

     auto activation = GetAttr<std::string>("activation");
     AddOp(OpRegistry::CreateOp(activation,
                                {Output("before_act")},
                                {Output("Y")},
                                {}));
     CompleteAddOp(false);
   }
 };

 class FullyConnectedOpMaker : public OpProtoAndCheckerMaker {
 public:
   FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "the input of fc operator");
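FullyConnectedOp above has no kernel of its own; it is a NetOp that chains three existing operators. As a worked restatement (sigma here just stands for whatever operator the "activation" attribute names; the notation is ours, not the patch's):

    \text{before\_act} = X W
    \text{before\_act} \mathrel{{+}{=}} \mathbf{1}\, b^{\top} \quad \text{(only when input "b" is set)}
    Y = \sigma(\text{before\_act})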
paddle/operators/fill_zeros_like_op.cc
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {

 class FillZerosLikeOp : public framework::OperatorWithKernel {
 protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 1UL,
                    "Input size of FillZerosLikeOp must be one.");
@@ -36,7 +36,7 @@ protected:
 };

 class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
   FillZerosLikeOpMaker(framework::OpProto *proto,
                        framework::OpAttrChecker *op_checker)
       : framework::OpProtoAndCheckerMaker(proto, op_checker) {
@@ -52,8 +52,7 @@ The output will have the same size with input.
 }  // namespace operators
 }  // namespace paddle

 REGISTER_OP(fill_zeros_like,
             paddle::operators::FillZerosLikeOp,
             paddle::operators::FillZerosLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
     fill_zeros_like,
paddle/operators/fill_zeros_like_op.h
@@ -22,7 +22,7 @@ namespace operators {
 template <typename Place, typename T>
 class FillZerosLikeKernel : public framework::OpKernel {
 public:
   void Compute(const framework::ExecutionContext &context) const override {
     auto *output = context.Output<framework::Tensor>(0);
     output->mutable_data<T>(context.GetPlace());
paddle/operators/mean_op.cc
@@ -18,7 +18,7 @@ namespace paddle {
 namespace operators {

 class MeanOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 1, "Input size of AddOp must be one");
     PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
@@ -29,7 +29,7 @@ protected:
 };

 class MeanOpMaker : public OpProtoAndCheckerMaker {
 public:
   MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
@@ -39,7 +39,7 @@ public:
 };

 class MeanGradOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     ctx.Output<Tensor>("X" + framework::kGradVarSuffix)
         ->Resize(ctx.Input<Tensor>("X")->dims());
paddle/operators/mean_op.h
@@ -20,7 +20,7 @@ namespace operators {
 template <typename Place, typename T>
 class MeanKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext &context) const override {
     auto input = context.Input<Tensor>(0);
     auto output = context.Output<Tensor>(0);
@@ -37,7 +37,7 @@ public:
 template <typename Place, typename T>
 class MeanGradKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext &context) const override {
     auto OG = context.Input<Tensor>("Out" + framework::kGradVarSuffix);
     PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
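The PADDLE_ENFORCE in MeanGradKernel above (the product of OG's dims must be 1) matches the math of the operator: mean reduces X to a single scalar, so its incoming gradient is also a single scalar. As a worked check, not part of the patch:

    \text{Out} = \frac{1}{N}\sum_{i=1}^{N} X_i
    \quad\Longrightarrow\quad
    \frac{\partial \text{Out}}{\partial X_i} = \frac{1}{N},
    \qquad dX_i = \frac{d\text{Out}}{N} \ \text{for every element } i,

where N is the number of elements of X.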
paddle/operators/mul_op.cc
@@ -18,23 +18,27 @@ namespace paddle {
 namespace operators {

 class MulOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs");
     auto dim0 = ctx.Input<Tensor>(0)->dims();
     auto dim1 = ctx.Input<Tensor>(1)->dims();
-    PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2,
-                   "The input of mul op must be matrix");
-    PADDLE_ENFORCE(dim0[1] == dim1[0],
-                   "First matrix's width must be equal with second matrix's height.");
-    PADDLE_ENFORCE(ctx.OutputSize() == 1, "The mul op must take one output");
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(dim0[1], dim1[0],
+                      "First matrix's width must be equal with second matrix's height.");
+    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "The mul op takes only one output");
     ctx.Output<Tensor>(0)->Resize({dim0[0], dim1[1]});
   }
 };

 class MulOpMaker : public OpProtoAndCheckerMaker {
 public:
   MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The first input of mul op");
@@ -49,7 +53,7 @@ The equation is: Out = X * Y
 };

 class MulOpGrad : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {}
   std::string DebugString() const override {
     LOG(INFO) << "MulGrad";
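The PADDLE_ENFORCE_EQ checks added to MulOp::InferShape above are exactly the shape algebra of a matrix product; restated in our notation (a check of the code, not new behaviour): X must be m×k, Y must be k×n (both rank 2), the inner sizes dim0[1] and dim1[0] must agree, and the output is resized to {dim0[0], dim1[1]}, i.e.

    (m \times k)\,(k \times n) \;\rightarrow\; (m \times n),
    \qquad\text{e.g. } (32 \times 64)\,(64 \times 10) \;\rightarrow\; (32 \times 10).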
paddle/operators/mul_op.h
@@ -21,7 +21,7 @@ namespace operators {
 template <typename Place, typename T>
 class MulKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext &context) const override {
     Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
         {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
paddle/operators/net_op.h
@@ -40,7 +40,7 @@ namespace operators {
  * it defines.
  */
 class NetOp : public framework::OperatorBase {
 public:
   /**
    * Infer all the operators' input and output variables' shapes, will be called
    * before every mini-batch
@@ -90,7 +90,7 @@ public:
   std::vector<std::shared_ptr<OperatorBase>> ops_;

 private:
   bool add_op_done_{false};

   template <typename T, typename KeyType>
paddle/operators/net_op_test.cc
@@ -12,7 +12,7 @@ static int infer_shape_cnt = 0;
 static int run_cnt = 0;

 class TestOp : public OperatorBase {
 public:
   void InferShape(const framework::Scope& scope) const override {
     ++infer_shape_cnt;
   }
@@ -23,7 +23,7 @@ public:
 };

 class EmptyOp : public OperatorBase {
 public:
   void InferShape(const Scope& scope) const override {}
   void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const override {}
paddle/operators/recurrent_op.cc
@@ -25,214 +25,75 @@
 namespace paddle {
 namespace operators {

-[removed: the rnn:: helper functions SegmentInputs(), ConcatOutputs(),
- LinkMemories() and InitArgument() that were defined inline here; they move,
- with small changes, into the new file paddle/operators/rnn/recurrent_op_utils.cc
- shown below]
-
 void RecurrentAlgorithm::InferShape(const Scope& scope) const {
   seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
                  ->GetMutable<Tensor>()
                  ->dims()[0];
   CreateScopes(scope);
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
                      true /*infer_shape_mode*/);
   InitMemories(step_scopes[0], true /*infer_shape_mode*/);
   Variable* net = scope.FindVar(arg_->step_net);
   PADDLE_ENFORCE(net != nullptr, "failed to get step net");
   for (size_t i = 0; i < seq_len_; i++) {
     if (i > 0) {
       rnn::LinkMemories(step_scopes, arg_->memories, i, -1,
                         true /*infer_shape_mode*/);
     }
     net->GetMutable<NetOp>()->InferShape(*step_scopes[i]);
   }
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
                      true /*infer_shape_mode*/);
 }

 void RecurrentAlgorithm::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
                      false /*infer_shape_mode*/);
   InitMemories(step_scopes[0], false /*infer_shape_mode*/);
   Variable* net = scope.FindVar(arg_->step_net);
   for (size_t step_id = 0; step_id < seq_len_; step_id++) {
     if (step_id > 0) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1,
                         false /*infer_shape_mode*/);
     }
     net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
   }
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
                      false /*infer_shape_mode*/);
 }

 void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
-  // TODO(xxx) Only two scopes are needed for inference, this case will be
+  // TODO(superjom) Only two scopes are needed for inference, this case will be
   // supported later.
-  auto step_scopes =
-      scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
+  auto step_scopes_var = scope.FindVar(arg_->step_scopes);
+  PADDLE_ENFORCE(step_scopes_var != nullptr, "");
+  auto step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
+
+  // Now all variables in scope must be created outside of op.
+  auto net_var = scope.FindVar(arg_->step_net);
+  PADDLE_ENFORCE(net_var != nullptr, "no stepnet called %s in scope",
+                 arg_->step_net);
+  auto net_op = net_var->GetMutable<NetOp>();
+  PADDLE_ENFORCE(!net_op->outputs_.empty(), "net_op has no outputs");

   if (seq_len_ > step_scopes->size()) {
     for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
       auto& step_scope = scope.NewScope();
-      // Now all variables in scope must be created outside of op.
-      auto net_op = scope.FindVar(arg_->step_net)->GetMutable<NetOp>();
+      // create step net's temp inputs
       for (auto& input : net_op->inputs_) {
         // the weight are located in parent scope
-        if (!step_scope.FindVar(input)) step_scope.NewVar(input);
+        if (!step_scope.FindVar(input))
+          step_scope.NewVar(input)->GetMutable<Tensor>();
       }
-      for (auto& output : net_op->outputs_) {
+      // create stepnet's outputs
+      for (const auto& output : net_op->outputs_) {
         step_scope.NewVar(output);
       }
       step_scopes->emplace_back(&step_scope);
@@ -245,37 +106,27 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope,
   for (auto& attr : arg_->memories) {
     Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<Tensor>();
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "memory [%s]'s boot variable [%s] not exists",
                    attr.var,
                    attr.boot_var);
     Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable<Tensor>();
     if (infer_shape_mode) {
       pre_mem->Resize(boot_mem->dims());
+      PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
     } else {
       pre_mem->ShareDataWith<float>(*boot_mem);
     }
   }
 }

 const rnn::ArgumentName RecurrentOp::kArgName{
     "step_net",     "step_scopes",   "inlinks",      "outlinks",
     "inlink_alias", "outlink_alias", "memories",     "pre_memories",
     "boot_memories"};

 const rnn::ArgumentName RecurrentGradientOp::kArgName{
     "step_net",     "step_scopes",   "outlink@grad", "inlink@grad",
     "inlink_alias", "outlink_alias", "memories",     "pre_memories",
     "boot_memories@grad"};

 void RecurrentOp::Init() {
   OperatorBase::Init();
@@ -285,7 +136,7 @@ void RecurrentOp::Init() {
 }

 class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
   RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto,
                                          OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
@@ -316,31 +167,29 @@ public:
 void RecurrentGradientAlgorithm::Run(
     const Scope& scope, const platform::DeviceContext& dev_ctx) const {
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
                      false /*infer_shape_mode*/);
   Variable* net = scope.FindVar(arg_->step_net);
   PADDLE_ENFORCE(net != nullptr, "failed to get step net");
   for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
     if (static_cast<size_t>(step_id) != seq_len_ - 1) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
                         false /*infer_shape_mode*/);
     }
     net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
   }
   LinkBootMemoryGradients(step_scopes[0], false);
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
                      false /*infer_shape_mode*/);
 }

 void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
     Scope* step_scope, bool infer_shape_mode) const {
   for (auto& attr : arg_->memories) {
     PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
                    "memory variable [%s] does not exists", attr.var);
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "boot variable [%s] does not exists", attr.boot_var);
     Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable<Tensor>();
     Tensor* boot_mem_grad =
         step_scope->NewVar(attr.boot_var)->GetMutable<Tensor>();
@@ -357,19 +206,19 @@ void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
                  ->GetMutable<Tensor>()
                  ->dims()[0];
   auto step_scopes = GetStepScopes(scope);
   rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
                      true /*infer_shape_mode*/);
   Variable* net = scope.FindVar(arg_->step_net);
   PADDLE_ENFORCE(net != nullptr, "failed to get step net");
   for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
     if (static_cast<size_t>(step_id) != seq_len_ - 1) {
       rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
                         true /*infer_shape_mode*/);
     }
     net->GetMutable<NetOp>()->InferShape(*step_scopes[step_id]);
   }
   rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
                      true /*infer_shape_mode*/);
   LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/);
 }
@@ -383,6 +232,5 @@ void RecurrentGradientOp::Init() {
 }  // namespace operators
 }  // namespace paddle

 REGISTER_OP(recurrent_op,
             paddle::operators::RecurrentOp,
             paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker);
paddle/operators/recurrent_op.h
@@ -15,82 +15,11 @@
 #pragma once

 #include "paddle/framework/operator.h"
+#include "paddle/operators/rnn/recurrent_op_utils.h"

 namespace paddle {
 namespace operators {

-[removed: the declarations of rnn::MemoryAttr, rnn::Link, rnn::Argument,
- rnn::ArgumentName and of SegmentInputs(), ConcatOutputs(), LinkMemories() and
- InitArgument(); they now live in the new header
- paddle/operators/rnn/recurrent_op_utils.h shown below]
-
 // The sequence format in RecurrentOp is Tensor<seq_len, batch_size, dim> now.
 // TODO(Yan Chunwei):
 // 1. No-padding computing for sequences with indifinite length in one batch.
@@ -100,7 +29,7 @@
 // Refer to: https://arxiv.org/pdf/1502.02367.pdf

 class RecurrentAlgorithm {
 public:
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const;
@@ -111,7 +40,7 @@ public:
   */
   void InferShape(const framework::Scope& scope) const;

 protected:
   /*
    * The step scopes will be stored in the father scope as a variable.
    *
@@ -128,7 +57,7 @@ protected:
   void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const;

 private:
   std::unique_ptr<rnn::Argument> arg_;
   mutable size_t seq_len_;
 };
@@ -144,7 +73,7 @@ class RecurrentGradientAlgorithm {
  * lot, and the latter is a wrapper acts like an dapter for it to make RNN an
  * operator.
  */
 public:
   void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }

   void Run(const framework::Scope& scope,
@@ -158,20 +87,20 @@ public:
   */
   void InferShape(const framework::Scope& scope) const;

 protected:
   inline const std::vector<framework::Scope*>& GetStepScopes(
       const framework::Scope& scope) const {
     return *scope.FindVar(arg_->step_scopes)
                 ->GetMutable<std::vector<framework::Scope*>>();
   }

 private:
   std::unique_ptr<rnn::Argument> arg_;
   mutable size_t seq_len_;
 };

 class RecurrentOp final : public framework::OperatorBase {
 public:
   void Init() override;

   /**
@@ -188,12 +117,12 @@ public:
   static const rnn::ArgumentName kArgName;

 private:
   RecurrentAlgorithm alg_;
 };

 class RecurrentGradientOp final : public framework::OperatorBase {
 public:
   void Init() override;

   /**
@@ -210,7 +139,7 @@ public:
   static const rnn::ArgumentName kArgName;

 private:
   RecurrentGradientAlgorithm alg_;
 };
paddle/operators/recurrent_op_test.cc
@@ -29,7 +29,7 @@ using framework::make_ddim;
 using framework::DDim;

 class RecurrentOpTest : public ::testing::Test {
 protected:
   virtual void SetUp() override {
     CreateGlobalVariables();
     CreateStepNet();
@@ -174,7 +174,7 @@ TEST_F(RecurrentOpTest, Run) {
 }

 class RecurrentGradientAlgorithmTest : public ::testing::Test {
 protected:
   virtual void SetUp() override {
     CreateGlobalVariables();
     CreateStepScopes();
@@ -277,13 +277,11 @@ protected:
     LOG(INFO) << "create variable step_net";
     Variable* var = scope_.NewVar("step_net");
     auto net = var->GetMutable<NetOp>();
     net->AddOp(OpRegistry::CreateOp("mul",
                                     {"rnn/h_pre", "rnn/w", "rnn/s_grad"},
                                     {"rnn/h_pre_grad", "rnn/w_grad"}, {}));
     net->AddOp(OpRegistry::CreateOp("add_two", {"rnn/h_grad"},
                                     {"rnn/x_grad", "rnn/s_grad"}, {}));
     net->CompleteAddOp();
   }
@@ -297,9 +295,7 @@ protected:
     inlink.internal = "rnn/x";
     auto step_scopes =
         scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
     rnn::SegmentInputs(*step_scopes, std::vector<rnn::Link>{inlink}, 10,
                        true /*infer_shape_mode*/);
   }
@@ -314,8 +310,8 @@ protected:
     auto step_scopes =
         scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
     for (int i = 1; i < 10; ++i) {
       rnn::LinkMemories(*step_scopes, memories, i, -1,
                         true /*infer_shape_mode*/);
     }
   }
@@ -395,3 +391,4 @@ TEST(RecurrentOp, LinkMemories) {
 USE_OP(add_two);
 USE_OP(mul);
+USE_OP_WITHOUT_KERNEL(recurrent_op);
paddle/operators/rnn/recurrent_op_utils.cc (new file, 0 → 100644)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/rnn/recurrent_op_utils.h"

namespace paddle {
namespace operators {
namespace rnn {

namespace fmw = paddle::framework;

void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& inlinks,
                   const size_t seq_len,
                   bool infer_shape_mode) {
  PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
  for (size_t i = 0; i < inlinks.size(); ++i) {
    auto input_var = step_scopes[0]->FindVar(inlinks[i].external);
    PADDLE_ENFORCE(input_var != nullptr, "input link [%s] is not in scope.",
                   inlinks[i].external);
    Tensor* input = input_var->GetMutable<Tensor>();
    fmw::DDim dims = input->dims();
    PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
                   "all the inlinks must have same length");
    fmw::DDim step_dims = slice_ddim(dims, 1, dims.size());
    for (size_t j = 0; j < seq_len; j++) {
      Tensor* step_input =
          step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
      if (!infer_shape_mode) {
        *step_input = input->Slice<float>(j, j + 1);
      }
      step_input->Resize(step_dims);
    }
  }
}

void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& outlinks,
                   const size_t seq_len,
                   bool infer_shape_mode) {
  for (size_t i = 0; i < outlinks.size(); i++) {
    auto output_var = step_scopes[0]->FindVar(outlinks[i].external);
    PADDLE_ENFORCE(output_var != nullptr, "output link [%s] is not in scope.",
                   outlinks[i].external);
    Tensor* output = output_var->GetMutable<Tensor>();
    if (infer_shape_mode) {
      auto step_scope_var = step_scopes[0]->FindVar(outlinks[i].internal);
      PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope",
                     outlinks[i].internal);
      fmw::DDim step_dims =
          step_scope_var->template GetMutable<Tensor>()->dims();
      std::vector<int> dims_vec = vectorize(step_dims);
      dims_vec.insert(dims_vec.begin(), seq_len);
      output->Resize(fmw::make_ddim(dims_vec));
    } else {
      output->mutable_data<float>(platform::CPUPlace());
      for (size_t j = 0; j < seq_len; j++) {
        Tensor* step_output =
            step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable<Tensor>();
        // TODO(luotao02) data type and platform::DeviceContext() should set
        // correctly
        (output->Slice<float>(j, j + 1))
            .CopyFrom<float>(*step_output, platform::CPUPlace());
      }
    }
  }
}

void LinkMemories(const std::vector<Scope*>& scopes,
                  const std::vector<rnn::MemoryAttr>& memories,
                  const size_t step_id,
                  const int offset,
                  bool infer_shape_mode) {
  PADDLE_ENFORCE_LT(step_id, scopes.size(),
                    "step [%d] is out of range of step scopes' size [%d]",
                    step_id, scopes.size());
  PADDLE_ENFORCE_GE(static_cast<int>(step_id) + offset, 0,
                    "offset [%d] must be large than -[%d]", offset, step_id);
  PADDLE_ENFORCE_LT(step_id + offset, scopes.size(),
                    "offset [%d] is out of range, it must be less than (%d - %d)",
                    offset, scopes.size(), step_id);
  auto scope = scopes[step_id];
  auto linked_scope = scopes[step_id + offset];
  for (auto& attr : memories) {
    auto mem = scope->FindVar(attr.pre_var)->GetMutable<Tensor>();
    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<Tensor>();
    if (infer_shape_mode) {
      mem->Resize(linked_mem->dims());
    } else {
      mem->ShareDataWith<float>(*linked_mem);
    }
  }
}

void InitArgument(const ArgumentName& name, Argument* arg,
                  const OperatorBase& op) {
  arg->step_net = op.Input(name.step_net);
  arg->step_scopes = op.Output(name.step_scopes);

  auto inlinks = op.Inputs(name.inlinks);
  auto inlink_alias = op.GetAttr<std::vector<std::string>>(name.inlink_alias);
  PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(),
                 "the size of inlinks and inlink_alias don't match:%d,%d",
                 inlinks.size(), inlink_alias.size());
  for (size_t i = 0; i < inlinks.size(); ++i) {
    rnn::Link link;
    link.external = inlinks[i];
    link.internal = inlink_alias[i];
    (arg->inlinks).push_back(link);
  }

  auto outlinks = op.Outputs(name.outlinks);
  auto outlink_alias = op.GetAttr<std::vector<std::string>>(name.outlink_alias);
  PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(),
                 "the size of outlinks and outlink_alias don't match:%d,%d",
                 outlinks.size(), outlink_alias.size());
  for (size_t i = 0; i < outlinks.size(); ++i) {
    rnn::Link link;
    link.external = outlinks[i];
    link.internal = outlink_alias[i];
    (arg->outlinks).push_back(link);
  }

  auto boot_memories = op.Inputs(name.boot_memories);

  // attributes
  auto memories = op.GetAttr<std::vector<std::string>>(name.memories);
  auto pre_memories = op.GetAttr<std::vector<std::string>>(name.pre_memories);

  PADDLE_ENFORCE(memories.size() == boot_memories.size(),
                 "the size of memories, boot_memories don't match:%d,%d",
                 memories.size(), boot_memories.size());
  PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
                 "the size of pre_memories, boot_memories don't match:%d,%d",
                 pre_memories.size(), boot_memories.size());
  PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set");

  for (size_t i = 0; i < memories.size(); ++i) {
    rnn::MemoryAttr mem_attr;
    mem_attr.var = memories[i];
    mem_attr.pre_var = pre_memories[i];
    mem_attr.boot_var = boot_memories[i];
    (arg->memories).push_back(mem_attr);
  }
}

}  // namespace rnn
}  // namespace operators
}  // namespace paddle
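To see what SegmentInputs and ConcatOutputs do to the sequence layout, here is a minimal standalone sketch in plain C++ (no Paddle types; the sizes and the trivial "step net" are made up purely for illustration): it cuts a [seq_len, batch, dim] buffer into per-step blocks, runs a step computation on each block, and stitches the per-step outputs back into one [seq_len, batch, dim] buffer.

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

int main() {
  const int seq_len = 3, batch = 2, dim = 4;  // made-up sizes
  std::vector<float> input(seq_len * batch * dim);
  for (size_t i = 0; i < input.size(); ++i) input[i] = static_cast<float>(i);

  // "SegmentInputs": step j sees the contiguous [batch, dim] block of step j.
  std::vector<const float*> step_inputs(seq_len);
  for (int j = 0; j < seq_len; ++j) step_inputs[j] = &input[j * batch * dim];

  // A trivial "step net": copy the step input to the step output.
  std::vector<std::vector<float>> step_outputs(
      seq_len, std::vector<float>(batch * dim));
  for (int j = 0; j < seq_len; ++j)
    std::copy(step_inputs[j], step_inputs[j] + batch * dim,
              step_outputs[j].begin());

  // "ConcatOutputs": stack per-step outputs back into [seq_len, batch, dim].
  std::vector<float> output;
  for (int j = 0; j < seq_len; ++j)
    output.insert(output.end(), step_outputs[j].begin(), step_outputs[j].end());

  assert(output == input);  // identity step net, so the round trip is exact
  std::cout << "concatenated " << output.size() << " elements\n";
  return 0;
}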
paddle/operators/rnn/recurrent_op_utils.h (new file, 0 → 100644)

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0; same header as the .cc file above. */

#pragma once

#include <string>

#include "paddle/framework/operator.h"
#include "paddle/operators/type_alias.h"

namespace paddle {
namespace operators {
namespace rnn {

/**
 * Memory of a RNN (same as the role of `Momory` in PaddlePaddle).
 *
 * Memory attributes cached by this op, dims will be infered from
 * boot memories in father scope. Other attributes are copied from Op's proto
 * attributes.
 */
struct MemoryAttr {
  // name of current state variable
  std::string var;
  // name of previous step's state variable
  std::string pre_var;
  // name of the variables to init this memory (same role of `boot_layer` in
  // PaddlePaddle), which is store in father's scope.
  std::string boot_var;
};

struct Link {
  // input or output links name.
  std::string internal;
  // alias to avoid duplicate keys in scopes.
  std::string external;
};

struct Argument {
  std::string step_net;
  std::string step_scopes;
  std::vector<Link> inlinks;
  std::vector<Link> outlinks;
  std::vector<rnn::MemoryAttr> memories;
};

struct ArgumentName {
  std::string step_net;
  std::string step_scopes;
  std::string inlinks;
  std::string outlinks;
  std::string inlink_alias;   // the alias of inlinks in step net.
  std::string outlink_alias;  // the alias of outlinks in step net.
  std::string memories;       // the memory name
  std::string pre_memories;   // the previous memory name
  std::string boot_memories;  // the boot memory name
};

/**
 * Prepare inputs for each step net.
 */
void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& inlinks,
                   const size_t seq_len,
                   bool infer_shape_mode);

/**
 * Process outputs of step nets and merge to variables.
 */
void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& outlinks,
                   const size_t seq_len,
                   bool infer_shape_mode);

void LinkMemories(const std::vector<Scope*>& step_scopes,
                  const std::vector<MemoryAttr>& memories,
                  const size_t step_id,
                  const int offset,
                  bool infer_shape_mode);

void InitArgument(const ArgumentName& name, Argument* arg,
                  const OperatorBase& op);

}  // namespace rnn
}  // namespace operators
}  // namespace paddle
paddle/operators/rowwise_add_op.cc
@@ -17,7 +17,7 @@ namespace paddle {
 namespace operators {

 class RowWiseAddOp : public OperatorWithKernel {
 protected:
   void InferShape(const InferShapeContext &ctx) const override {
     PADDLE_ENFORCE(ctx.InputSize() == 2UL,
                    "Two inputs is needed by rowwise add");
@@ -33,7 +33,7 @@ protected:
 };

 class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
 public:
   RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left input of row-wise add op, must be matrix");
paddle/operators/rowwise_add_op.h
@@ -20,7 +20,7 @@ namespace operators {
 template <typename Place, typename T>
 class RowWiseAddKernel : public OpKernel {
 public:
   void Compute(const ExecutionContext &context) const override {
     auto out = context.Output<Tensor>(0);
     out->mutable_data<T>(context.GetPlace());
paddle/operators/sgd_op.cc

@@ -18,7 +18,7 @@ namespace paddle {
namespace operators {

class SGDOp : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of SGDOp must be two");
    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of SGDOp must be one");
...
@@ -32,7 +32,7 @@ protected:
};

class SGDOpMaker : public OpProtoAndCheckerMaker {
 public:
  SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("param", "input parameter");
...
paddle/operators/sgd_op.h

@@ -20,7 +20,7 @@ namespace operators {
template <typename Place, typename T>
class SGDOpKernel : public OpKernel {
 public:
  void Compute(const ExecutionContext &ctx) const override {
    auto param = ctx.Input<Tensor>("param");
    auto grad = ctx.Input<Tensor>("grad");
...
paddle/operators/sigmoid_op.cc

@@ -17,7 +17,7 @@ namespace paddle {
namespace operators {

class SigmoidOp : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input");
    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output");
...
@@ -26,7 +26,7 @@ protected:
};

class SigmoidOpMaker : public OpProtoAndCheckerMaker {
 public:
  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "sigmoid input");
...
@@ -36,11 +36,9 @@ public:
 };

 class SigmoidOpGrad : public OperatorWithKernel {
  protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
-  std::string DebugString() const override {
-    LOG(INFO) << "SigmoidGrad";
-    return "";
+  void InferShape(const InferShapeContext &ctx) const override {
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
   }
 };
...
@@ -51,3 +49,5 @@ REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker);
 REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad);
 REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sigmoid_grad,
+                       ops::SigmoidGradKernel<ops::CPUPlace, float>);
paddle/operators/sigmoid_op.cu

@@ -16,3 +16,5 @@
 #include "paddle/operators/sigmoid_op.h"

 REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(sigmoid_grad,
+                       ops::SigmoidGradKernel<ops::GPUPlace, float>);
paddle/operators/sigmoid_op.h

@@ -21,12 +21,13 @@ namespace operators {
 template <typename Place, typename T>
 class SigmoidKernel : public OpKernel {
  public:
   void Compute(const ExecutionContext &context) const override {
     auto input = context.Input<Tensor>(0);
     auto output = context.Output<Tensor>(0);
     output->mutable_data<T>(context.GetPlace());

+    // The clipping is used in Paddle's raw implenmention
     auto X = EigenVector<T>::Flatten(*input);
     auto Y = EigenVector<T>::Flatten(*output);
     auto place = context.GetEigenDevice<Place>();
...
@@ -34,5 +35,23 @@ public:
     Y.device(place) = 1.0 / (1.0 + (-1.0 * X).exp());
   }
 };
+
+template <typename Place, typename T>
+class SigmoidGradKernel : public OpKernel {
+ public:
+  void Compute(const ExecutionContext &context) const override {
+    auto Y_t = context.Input<Tensor>("Y");
+    auto dY_t = context.Input<Tensor>(framework::GradVarName("Y"));
+    auto dX_t = context.Output<Tensor>(framework::GradVarName("X"));
+
+    dX_t->mutable_data<T>(context.GetPlace());
+
+    auto dX = EigenVector<T>::Flatten(*dX_t);
+    auto Y = EigenVector<T>::Flatten(*Y_t);
+    auto dY = EigenVector<T>::Flatten(*dY_t);
+    dX.device(context.GetEigenDevice<Place>()) = dY * Y * (1. - Y);
+  }
+};

 }  // namespace operators
 }  // namespace paddle
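For reference, the forward and backward expressions in these kernels are the standard sigmoid identities; this is just the math behind the two Eigen expressions above, not new behaviour:

y = \sigma(x) = \frac{1}{1 + e^{-x}}, \qquad
\frac{dy}{dx} = y\,(1 - y), \qquad
\frac{\partial L}{\partial x} = \frac{\partial L}{\partial y}\; y\,(1 - y)

which corresponds to `Y = 1.0 / (1.0 + (-1.0 * X).exp())` in SigmoidKernel and `dX = dY * Y * (1. - Y)` in SigmoidGradKernel.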
paddle/operators/softmax_op.cc

@@ -18,7 +18,7 @@ namespace paddle {
namespace operators {

class SoftmaxOp : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 1UL,
                   "Only one input is need for softmax");
...
@@ -31,7 +31,7 @@ protected:
};

class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
 public:
  SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "input of softmax");
...
@@ -41,7 +41,7 @@ public:
};

class SoftmaxOpGrad : public OperatorWithKernel {
 protected:
  void InferShape(const InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 3UL,
                   "Input of SoftmaxOpGrad should be 3, X, Y, YG");
...
paddle/operators/softmax_op.h

@@ -24,7 +24,7 @@ namespace operators {
template <typename Place, typename T>
class SoftmaxKernel : public OpKernel {
 public:
  void Compute(const ExecutionContext &context) const override {
    auto input = context.Input<Tensor>("X");
    auto output = context.Output<Tensor>("Y");
...
@@ -63,7 +63,7 @@ public:
template <typename Place, typename T>
class SoftmaxGradKernel : public OpKernel {
 public:
  void Compute(const ExecutionContext &context) const override {
    std::shared_ptr<Tensor> scale_ = std::make_shared<Tensor>();
...
paddle/operators/type_alias.h

@@ -26,21 +26,16 @@ using OperatorBase = framework::OperatorBase;
using InferShapeContext = framework::InferShapeContext;
using ExecutionContext = framework::ExecutionContext;
using Variable = framework::Variable;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
using Tensor = framework::Tensor;
...
paddle/scripts/docker/build.sh

@@ -39,6 +39,10 @@ Configuring cmake in /paddle/build ...
     -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
 ========================================
 EOF
+
+# Disable UNITTEST_USE_VIRTUALENV in docker because
+# docker environment is fully controlled by this script.
+# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
 cmake .. \
     -DCMAKE_BUILD_TYPE=Release \
     -DWITH_DOC=OFF \
...
@@ -52,39 +56,43 @@ cmake .. \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=ON

 cat <<EOF
 ========================================
 Building in /paddle/build ...
    Build unit tests: ${WITH_TESTING:-OFF}
 ========================================
 EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-    pip uninstall -y py-paddle paddle || true
-    ctest --output-on-failure
-fi
-
-cat <<EOF
-========================================
-Installing ...
-========================================
-EOF
-make install -j `nproc`
-pip install /usr/local/opt/paddle/share/wheels/*.whl
-paddle version
+    cat <<EOF
+    ========================================
+    Running unit tests ...
+    ========================================
+EOF
+    # make install should also be test when unittest
+    make install -j `nproc`
+    pip install /usr/local/opt/paddle/share/wheels/*.whl
+    paddle version
+    ctest --output-on-failure
+fi

 # To build documentation, we need to run cmake again after installing
 # PaddlePaddle. This awkwardness is due to
 # https://github.com/PaddlePaddle/Paddle/issues/1854. It also
 # describes a solution.
-if [[ ${WITH_DOC} == "ON" ]]; then
+if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
     cat <<EOF
     ========================================
     Building documentation ...
     In /paddle/build_doc
     ========================================
 EOF
+    # build documentation need install Paddle before
+    make install -j `nproc`
+    pip install /usr/local/opt/paddle/share/wheels/*.whl
+    paddle version
     mkdir -p /paddle/build_doc
     pushd /paddle/build_doc
     cmake .. \
...
@@ -117,13 +125,22 @@ fi
 # generate deb package for current build
 # FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-cat <<EOF
+if [[ ${WITH_DEB:-OFF} == "ON" ]]; then
+  cat <<EOF
 ========================================
 Generating .deb package ...
 ========================================
 EOF
-cpack -D CPACK_GENERATOR='DEB' -j `nproc` ..
+  set +e
+  cpack -D CPACK_GENERATOR='DEB' -j `nproc` ..
+  err_code=$?
+  if [ ${err_code} -ne 0 ]; then
+    # cat error logs if cpack failed.
+    cat /paddle/build/_CPack_Packages/Linux/DEB/PreinstallOutput.log
+    exit ${err_code}
+  fi
+  set -e
+fi

 cat <<EOF
 ========================================
...
paddle/scripts/run_python_tests.sh (deleted, 100755 → 0)

#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

pushd `dirname $0` > /dev/null
SCRIPTPATH=$PWD
popd > /dev/null

USE_VIRTUALENV_FOR_TEST=$1; shift
PYTHON=$1; shift

if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
   rm -rf .test_env
   virtualenv .test_env
   unset PYTHONHOME
   unset PYTHONPATH
   source .test_env/bin/activate
   PYTHON=python
fi

$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl

if [ "X${PADDLE_PACKAGE_DIR}" != "X" ]; then
   $PYTHON -m pip install ${PADDLE_PACKAGE_DIR}/*.whl
else
   export PYTHONPATH=$SCRIPTPATH/../../python/
fi

$PYTHON -m pip install ipython==5.3

for fn in "$@"
do
  echo "test $fn"
  $PYTHON $fn
  if [ $? -ne 0 ]; then
    exit 1
  fi
done

if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
   deactivate
   rm -rf .test_env
fi
paddle/setup.py.in

@@ -22,7 +22,9 @@ setup(name="py_paddle",
       package_data={'py_paddle':['*.py','_swig_paddle.so']},
       install_requires = [
         'nltk>=3.2.2',
-        'numpy>=1.8.0',      # The numpy is required.
+        # We use `numpy.flip` in `test_image.py`.
+        # `numpy.flip` is introduced in `1.12.0`
+        'numpy>=1.12.0',     # The numpy is required.
         'protobuf==${PROTOBUF_VERSION}' # The paddle protobuf version
       ],
       url='http://www.paddlepaddle.org/',
...
paddle/trainer/tests/compare_sparse_data (new file, 0 → 100644)

File added (binary/data file, no diff shown).
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto → paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data

File moved.
paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist

-./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto
+./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto_data
paddle/trainer/tests/sample_trainer_config_compare_sparse.conf (new file, 0 → 100644)

#edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.

# Note: when making change to this file, please make sure
# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
# for comparing these two nets can pass (test_CompareTwoNets)

default_initial_std(0.1)
default_device(0)

word_dim = 999
l1 = 0
l2 = 0

model_type("nn")

sparse_update = get_config_arg("sparse_update", bool, False)

TrainData(ProtoData(
    type = "proto_sequence",
    files = ('trainer/tests/train_sparse.list'),
))

Settings(
    algorithm='sgd',
    batch_size=100,
    learning_rate=0.0001,
    learning_rate_decay_a=4e-08,
    learning_rate_decay_b=0.0,
    learning_rate_schedule='poly',
)

wordvec_dim = 32
layer2_dim = 16
layer3_dim = 16
hidden_dim = 32

slot_names = ["qb", "qw", "tb", "tw"]

def ltr_network(network_name,
                word_dim=word_dim,
                wordvec_dim=wordvec_dim,
                layer2_dim=layer2_dim,
                layer3_dim=layer3_dim,
                hidden_dim=hidden_dim,
                slot_names=slot_names,
                l1=l1,
                l2=l2):

    slotnum = len(slot_names)
    for i in xrange(slotnum):
        Inputs(slot_names[i] + network_name)
    for i in xrange(slotnum):
        Layer(
            name = slot_names[i] + network_name,
            type = "data",
            size = word_dim,
            device = -1,
        )
        Layer(
            name = slot_names[i] + "_embedding_" + network_name,
            type = "mixed",
            size = wordvec_dim,
            bias = False,
            device = -1,
            inputs = TableProjection(slot_names[i] + network_name,
                                     parameter_name = "embedding.w0",
                                     decay_rate_l1 = l1,
                                     sparse_remote_update = True,
                                     sparse_update = sparse_update,
                                     ),
        )
        Layer(
            name = slot_names[i] + "_rnn1_" + network_name,
            type = "recurrent",
            active_type = "tanh",
            bias = Bias(initial_std = 0, parameter_name = "rnn1.bias"),
            inputs = Input(slot_names[i] + "_embedding_" + network_name,
                           parameter_name = "rnn1.w0")
        )
        Layer(
            name = slot_names[i] + "_rnnlast_" + network_name,
            type = "seqlastins",
            inputs = [
                slot_names[i] + "_rnn1_" + network_name,
            ],
        )

    Layer(
        name = "layer2_" + network_name,
        type = "fc",
        active_type = "tanh",
        size = layer2_dim,
        bias = Bias(parameter_name = "layer2.bias"),
        inputs = [
            Input(slot_name + "_rnnlast_" + network_name,
                  parameter_name = "_layer2_" + slot_name + ".w",
                  decay_rate = l2,
                  initial_smart = True) for slot_name in slot_names
        ]
    )
    Layer(
        name = "layer3_" + network_name,
        type = "fc",
        active_type = "tanh",
        size = layer3_dim,
        bias = Bias(parameter_name = "layer3.bias"),
        inputs = [
            Input("layer2_" + network_name,
                  parameter_name = "_layer3.w",
                  decay_rate = l2,
                  initial_smart = True),
        ]
    )
    Layer(
        name = "output_" + network_name,
        type = "fc",
        size = 1,
        bias = False,
        inputs = [
            Input("layer3_" + network_name, parameter_name = "_layerO.w"),
        ],
    )


ltr_network("left")
ltr_network("right")
Inputs("label")
Layer(
    name = "label",
    type = "data",
    size = 1,
)
Outputs("cost", "qb_rnnlast_left")
Layer(
    name = "cost",
    type = "rank-cost",
    inputs = ["output_left", "output_right", "label"],
)
浏览文件 @
59a8ebc6
...
@@ -23,7 +23,7 @@ using namespace paddle; // NOLINT
...
@@ -23,7 +23,7 @@ using namespace paddle; // NOLINT
using
namespace
std
;
// NOLINT
using
namespace
std
;
// NOLINT
static
const
string
&
configFile1
=
static
const
string
&
configFile1
=
"trainer/tests/sample_trainer_config_
qb_rnn
.conf"
;
"trainer/tests/sample_trainer_config_
compare_sparse
.conf"
;
DECLARE_bool
(
use_gpu
);
DECLARE_bool
(
use_gpu
);
DECLARE_string
(
config
);
DECLARE_string
(
config
);
...
...
paddle/trainer/tests/train_sparse.list (new file, 0 → 100644)

trainer/tests/compare_sparse_data
proto/DataConfig.proto

@@ -15,14 +15,13 @@ syntax = "proto2";
package paddle;

message FileGroupConf {
  optional uint32 queue_capacity = 1 [default = 1];
  // how many files to load for a load file thread
  optional int32 load_file_count = 2 [default = 1];
  // how many threads to load files
  // Setting to be 5~10 is appropriate when loading files by hadoop vfs
  optional int32 load_thread_num = 3 [default = 1];
};

message DataConfig {
...
@@ -32,26 +31,28 @@ message DataConfig {
  // name of a text file which contains a list of file names at each line
  optional string files = 3;

  optional int32 feat_dim = 4;     // feature dimension of one frame
  repeated int32 slot_dims = 5;    // feature slot dims
  optional int32 context_len = 6;  // max neibour frame numbers
  optional uint64 buffer_capacity = 7;  // the number of samples

  // part of data used in training
  // if not -1, part of train data is used in training
  optional int64 train_sample_num = 8 [default = -1];

  // The number of documents processed once
  optional int32 file_load_num = 9 [default = -1];
  optional bool async_load_data = 12 [default = false];
  /// Note the field number 10, 11 and 13 have been deprecated.
  optional bool for_test = 14 [default = false];  // whether this data is for test
  optional FileGroupConf file_group_conf = 15;
  repeated int32 float_slot_dims = 16;

  /// Note the field number 17, 18 and 19 have been deprecated.

  // a list of values which will be used to create additional one dimensional
  // float
  // values slots. These one dimensional slots can be used as the weight input
  // for cost layers.
  // Currently this is only supported by ProtoDataProvider.
...
@@ -65,21 +66,21 @@ message DataConfig {
  // for MultiDataProvider
  repeated DataConfig sub_data_configs = 24;  // sub dataproviders
  /*
   * the ratio of each sub dataproviders:
   * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100,
   * then each mini-batch is combined by 10 instance from A and 90 instances
   * from B.
   */
  optional int32 data_ratio = 25;

  /*
   * if one of the sub dataproviders is running out of data, then
   * (1) it is "main data", then finish current pass.
   * (2) it is not "main data", then reset it, and try getNextBatch again.
   */
  optional bool is_main_data = 26 [default = true];

  // the usage ratio of instances. Setting to 1.0 means the use of all
  // instances.
  optional double usage_ratio = 27 [default = 1.0];
};
proto/DataFormat.proto

@@ -17,27 +17,32 @@ package paddle;
/*
 If values is not empty and ids is empty, this is a dense vector.
 If values is not empty and ids is not empty, this is a sparse vector. The
 position of each value is specified by ids.
 If values is empty and ids is not empty, this is a sparse vector whose non-zero
 values are 1. The position of each 1 is specified by ids.
*/
message VectorSlot {
  repeated float values = 1 [packed = true];
  repeated uint32 ids = 2 [packed = true];
  /* For multidimensional data, for example "image width height depth" */
  repeated uint32 dims = 3 [packed = true];
  repeated string strs = 4;
};

/*
 SubseqSlot use to record whether VectorSlot or any other slot in future has
 subseq.
 If not all VectorSlot have subseq, we only store the one who has subseq, and
 use *slot_id* to record it.
 One vector_slots has one sequence, and it may have N subseq, thus the number of
 *lens* will be N too.
*/
message SubseqSlot {
  required uint32 slot_id = 1;  // the id of slot who has subseq
  repeated uint32 lens = 2;     // lengths of sub-sequence in the slot
};

message SlotDef {
...
@@ -45,13 +50,14 @@ message SlotDef {
    VECTOR_DENSE = 0;
    VECTOR_SPARSE_NON_VALUE = 1;
    VECTOR_SPARSE_VALUE = 2;
    INDEX = 3;  // This can be used as label, or word id, etc.
    VAR_MDIM_DENSE = 4;
    VAR_MDIM_INDEX = 5;
    STRING = 6;
  }
  required SlotType type = 1;
  required uint32 dim = 2;  // For INDEX slots, this means the maximal index plus 1.
};

message DataHeader {
...
@@ -60,11 +66,11 @@ message DataHeader {
};

message DataSample {
  optional bool is_beginning = 1 [default = true];  // is the beginning of a sequence
  repeated VectorSlot vector_slots = 2;
  repeated uint32 id_slots = 3 [packed = true];
  /* use ids of VectorSlot */
  repeated VectorSlot var_id_slots = 4;
  repeated SubseqSlot subseq_slots = 5;
};
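To make the three VectorSlot encodings described in the comment concrete, here is a small sketch using the C++ classes protoc would generate for this file; the include path and the standard proto2 accessor names (`add_values`, `add_ids`) are assumptions about the generated code, not part of this commit:

#include "DataFormat.pb.h"  // assumed: generated by protoc from proto/DataFormat.proto

int main() {
  // Dense vector: values filled, ids left empty.
  paddle::VectorSlot dense;
  dense.add_values(0.1f);
  dense.add_values(0.2f);
  dense.add_values(0.3f);

  // Sparse vector with explicit values: x[2] = 0.5 and x[7] = 1.25.
  paddle::VectorSlot sparse;
  sparse.add_ids(2);
  sparse.add_values(0.5f);
  sparse.add_ids(7);
  sparse.add_values(1.25f);

  // Sparse 0/1 vector: only the positions of the ones are stored.
  paddle::VectorSlot sparse_binary;
  sparse_binary.add_ids(3);
  sparse_binary.add_ids(9);
  return 0;
}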
proto/ModelConfig.proto

@@ -21,7 +21,6 @@ package paddle;
 * Various structs for the configuration of a neural network
 */

message ExternalConfig {
  repeated string layer_names = 1;
  repeated string input_layer_names = 2;
...
@@ -68,7 +67,7 @@ message ConvConfig {
  required uint32 img_size = 8;

  // caffe mode for output size coherence
  required bool caffe_mode = 9 [default = true];

  // if filter_size_y is set , this convolutional layer will use
  // filters of size filter_size * filter_size_y pixels.
...
@@ -99,7 +98,7 @@ message PoolConfig {
  optional uint32 start = 4;

  // Defines the stride size between successive pooling squares.
  required uint32 stride = 5 [default = 1];

  // The size of output feature map.
  required uint32 output_x = 6;
...
@@ -109,7 +108,7 @@ message PoolConfig {
  // padding = 4, instructs the net to implicitly
  // pad the images with a 4-pixel border of zeros.
  optional uint32 padding = 8 [default = 0];

  // if not set, use size_x
  optional uint32 size_y = 9;
...
@@ -194,9 +193,7 @@ message MaxOutConfig {
  required uint32 groups = 2;
}

message RowConvConfig {
  required uint32 context_length = 1;
}

message SliceConfig {
  required uint32 start = 1;
...
@@ -212,14 +209,14 @@ message ProjectionConfig {
  // For ShiftProjection
  optional int32 context_start = 5;
  optional int32 context_length = 6;
  optional bool trainable_padding = 7 [default = false];

  // For convolution
  optional ConvConfig conv_conf = 8;
  optional int32 num_filters = 9;

  // For IdentityOffsetProjection
  optional uint64 offset = 11 [default = 0];

  // For pool
  optional PoolConfig pool_conf = 12;
...
@@ -236,7 +233,7 @@ message OperatorConfig {
  required uint64 output_size = 4;

  // For DotMulOperator
  optional double dotmul_scale = 5 [default = 1.0];

  // For ConvOperator
  optional ConvConfig conv_conf = 6;
...
@@ -282,8 +279,8 @@ message MultiBoxLossConfig {
  required float neg_overlap = 4;
  required uint32 background_id = 5;
  required uint32 input_num = 6;
  optional uint32 height = 7 [default = 1];
  optional uint32 width = 8 [default = 1];
}

message DetectionOutputConfig {
...
@@ -294,8 +291,8 @@ message DetectionOutputConfig {
  required uint32 input_num = 5;
  required uint32 keep_top_k = 6;
  required float confidence_threshold = 7;
  optional uint32 height = 8 [default = 1];
  optional uint32 width = 9 [default = 1];
}

message ClipConfig {
...
@@ -331,7 +328,7 @@ message LayerConfig {
  required string name = 1;
  required string type = 2;
  optional uint64 size = 3;
  // optional ActivationConfig activation = 4;
  optional string active_type = 4;
  repeated LayerInputConfig inputs = 5;
  optional string bias_parameter_name = 6;
...
@@ -344,7 +341,7 @@ message LayerConfig {
  // (which is how convnets are usually trained). Setting this to
  // false will untie the biases, yielding a separate bias for
  // every location at which the filter is applied.
  optional bool shared_biases = 8 [default = false];

  // Valid values are ones that divide the area of the output
  // grid in this convolutional layer. For example if this layer
...
@@ -362,33 +359,35 @@ message LayerConfig {
  // the gpu device which the Layer's data in.
  // Only used by ParallelNeuralNetork. Ignored otherwise.
  optional int32 device = 12 [default = -1];

  // for recurrent layer. If true, the recurrence runs from the end to the
  // beginning.
  optional bool reversed = 13 [default = false];

  // for lstmemory layer. Different types of nodes have different activation
  // type.
  optional string active_gate_type = 14;
  optional string active_state_type = 15;

  // For NCELayer
  // The number of random negative labels for each sample
  optional int32 num_neg_samples = 16 [default = 10];

  // For NCELayer
  // The distribution for generating the random negative labels.
  // A uniform distribution will be used if not provided
  repeated double neg_sampling_dist = 17 [packed = true];

  // For MaxLayer
  // default: output VALUE of MaxLayer. set this flag to true for output INDEX
  // INDEX will be put in Argument::value as double values.
  optional bool output_max_index = 19 [default = false];

  /// The filed number 20 have been deprecated.

  // For self-normalized estimation
  optional double softmax_selfnorm_alpha = 21 [default = 0.1];

  /// The filed numbers 22 and 23 have been deprecated.
...
@@ -399,14 +398,14 @@ message LayerConfig {
  optional bool norm_by_times = 25;

  // for CostLayers
  optional double coeff = 26 [default = 1.0];

  // for AverageLayer
  // can be set to: 'average', 'sum' or 'squarerootn'
  optional string average_strategy = 27;

  // for error clipping
  optional double error_clipping_threshold = 28 [default = 0.0];

  // for operators used by mixed layer
  repeated OperatorConfig operator_confs = 29;
...
@@ -434,43 +433,44 @@ message LayerConfig {
  optional uint32 beam_size = 39;

  // for seqlastins layer, whether select first instead last
  optional bool select_first = 40 [default = false];

  // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer
  // can be set to: 'non-seq','seq'
  optional string trans_type = 41 [default = 'non-seq'];

  // to indicate whether selective_fc layer
  // is used in sequence generation or not
  optional bool selective_fc_pass_generation = 42 [default = false];

  // to indicate whether selective_fc layer take its last input to
  // selected several columns and only compute the multiplications
  // between the input matrices and the selected columns of
  // the parameter matrices of this layer.
  // if set false, selective_fc degrades into fc.
  optional bool has_selected_colums = 43 [default = true];

  // this parameter is for speed consideration.
  // if number of the selected columns is less than
  // sample number * selective_fc output size * selective_fc_mull_mull_ratio
  // sparse multiplication is used, otherwise, using full multiplication.
  optional double selective_fc_full_mul_ratio = 44 [default = 0.02];

  // to indicate how many threads selective_fc use to to accelate
  // the plain_mul period
  // leave empty or set to 0 to disable multi-thread accleleration
  optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 [default = 0];

  // for batch normalization layer
  // if set use_global_stats true, will use the loaded mean and variance.
  optional bool use_global_stats = 46;

  // use to compute moving mean and variance.
  optional double moving_average_fraction = 47 [default = 0.9];

  // bias size
  optional uint32 bias_size = 48 [default = 0];

  // this parameter can be used as a user-defined parameter when necessary,
  // without changing the proto file.
...
@@ -485,18 +485,17 @@ message LayerConfig {
  optional uint64 width = 51;

  // blank label used in ctc loss
  optional uint32 blank = 52 [default = 0];

  // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which
  // controls the scope of pooling operation. can be set > 0.
  // leave empty or set to -1 to disable this stride pooling.
  optional int32 seq_pool_stride = 53 [default = -1];

  // for crop layer
  optional int32 axis = 54 [default = 2];
  repeated uint32 offset = 55;
  repeated uint32 shape = 56;
}

message EvaluatorConfig {
...
@@ -512,9 +511,9 @@ message EvaluatorConfig {
  // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
  // For multi binary labels: true if output > classification_threshold
  optional double classification_threshold = 6 [default = 0.5];

  // The positive label. -1 means average precision and recall
  optional int32 positive_label = 7 [default = -1];

  // load dict from this file
  optional string dict_file = 8;
...
@@ -523,10 +522,10 @@ message EvaluatorConfig {
  optional string result_file = 9;

  // top # results for max id printer
  optional int32 num_results = 10 [default = 1];

  // whether to delimit the sequence in the seq_text_printer
  optional bool delimited = 11 [default = true];

  // Used by ChunkEvaluator
  // chunk of these types are not counted
...
@@ -534,23 +533,23 @@ message EvaluatorConfig {
  // Used by ClassificationErrorEvaluator
  // top # classification error
  optional int32 top_k = 13 [default = 1];

  // Used by DetectionMAPEvaluator
  optional double overlap_threshold = 14 [default = 0.5];

  optional int32 background_id = 15 [default = 0];

  optional bool evaluate_difficult = 16 [default = false];

  optional string ap_type = 17 [default = "11point"];
}

message LinkConfig {
  required string layer_name = 1;
  required string link_name = 2;

  // If true, this link has sub-sequence
  optional bool has_subseq = 3 [default = false];
}

message MemoryConfig {
...
@@ -563,18 +562,18 @@ message MemoryConfig {
  optional uint32 boot_with_const_id = 7;

  // memory is a sequence, initailized by a sequence boot layer
  optional bool is_sequence = 6 [default = false];
}

message GeneratorConfig {
  required uint32 max_num_frames = 1;
  required string eos_layer_name = 2;
  optional int32 num_results_per_sample = 3 [default = 1];

  // for beam search
  optional int32 beam_size = 4 [default = 1];

  optional bool log_prob = 5 [default = true];
}

message SubModelConfig {
...
@@ -584,10 +583,10 @@ message SubModelConfig {
  repeated string output_layer_names = 4;
  repeated string evaluator_names = 5;

  optional bool is_recurrent_layer_group = 6 [default = false];

  // If true, the recurrence runs from the end to the beginning.
  optional bool reversed = 7 [default = false];

  // name and link name of memory
  repeated MemoryConfig memories = 8;
...
@@ -601,14 +600,15 @@ message SubModelConfig {
  optional GeneratorConfig generator = 11;

  // the id of inlink which share info with outlinks, used in recurrent layer
  // group
  optional int32 target_inlinkid = 12;
}

message ModelConfig {
  // type of the model.
  // Currently, "nn", "recurrent_nn" and "recursive_nn" are supported
  required string type = 1 [default = "nn"];

  // layers should be ordered in such a way that the forward propagation
  // can be correctly executed by going from the first layer to the last layer
...
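The comment on `selective_fc_full_mul_ratio` above is easier to read as a formula; per the stated rule, sparse multiplication is chosen exactly when

n_{\text{selected columns}} < n_{\text{samples}} \times d_{\text{selective\_fc output}} \times \text{selective\_fc\_full\_mul\_ratio}

and full multiplication is used otherwise.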
proto/OptimizerConfig.proto
浏览文件 @
59a8ebc6
syntax
=
"proto2"
;
syntax
=
"proto2"
;
option
optimize_for
=
LITE_RUNTIME
;
option
optimize_for
=
LITE_RUNTIME
;
package
paddle
;
package
paddle
;
...
@@ -9,13 +9,11 @@ message SGDConfig {
...
@@ -9,13 +9,11 @@ message SGDConfig {
// momentum: float >= 0. Parameter updates momentum.
// momentum: float >= 0. Parameter updates momentum.
// decay: float >= 0. Learning rate decay over each update.
// decay: float >= 0. Learning rate decay over each update.
// nesterov: boolean. Whether to apply Nesterov momentum.
// nesterov: boolean. Whether to apply Nesterov momentum.
optional
double
momentum
=
21
[
default
=
0.0
];
optional
double
momentum
=
21
[
default
=
0.0
];
optional
double
decay
=
23
[
default
=
0.0
];
optional
double
decay
=
23
[
default
=
0.0
];
optional
bool
nesterov
=
24
[
default
=
false
];
optional
bool
nesterov
=
24
[
default
=
false
];
}
}
message
AdadeltaConfig
{
message
AdadeltaConfig
{
// Adadelta
// Adadelta
// It is recommended to leave it at the default value.
// It is recommended to leave it at the default value.
...
@@ -23,21 +21,23 @@ message AdadeltaConfig {
...
@@ -23,21 +21,23 @@ message AdadeltaConfig {
// epsilon: float >= 0. Fuzz factor.
// epsilon: float >= 0. Fuzz factor.
// decay: float >= 0. Learning rate decay over each update.
// decay: float >= 0. Learning rate decay over each update.
// reference : [Adadelta - an adaptive learning rate
method](http://arxiv.org/abs/1212.5701)
// reference : [Adadelta - an adaptive learning rate
optional
double
rho
=
33
[
default
=
0.90
];
// method](http://arxiv.org/abs/1212.5701)
optional
double
epsilon
=
31
[
default
=
1e-5
];
optional
double
rho
=
33
[
default
=
0.90
];
optional
double
decay
=
32
[
default
=
0.0
];
optional
double
epsilon
=
31
[
default
=
1e-5
];
optional
double
decay
=
32
[
default
=
0.0
];
}
}
message
AdagradConfig
{
message
AdagradConfig
{
// Adagrad
// Adagrad
// epsilon: float >= 0.
// epsilon: float >= 0.
// decay: float >= 0. Learning rate decay over each update.
// decay: float >= 0. Learning rate decay over each update.
// reference : [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
// reference : [Adaptive Subgradient Methods for Online Learning and
optional
double
epsilon
=
41
[
default
=
1e-5
];
// Stochastic
optional
double
decay
=
42
[
default
=
0.0
];
// Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
optional
double
epsilon
=
41
[
default
=
1e-5
];
optional
double
decay
=
42
[
default
=
0.0
];
}
}
message
message AdamConfig {
...
@@ -46,7 +46,8 @@ message AdamConfig {
  // beta_2: float, 0 < beta < 1. Generally close to 1.
  // epsilon: float >= 0. Fuzz factor.
  // decay: float >= 0. Learning rate decay over each update.
- // reference : [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
+ // reference : [Adam - A Method for Stochastic
+ // Optimization](http://arxiv.org/abs/1412.6980v8)
  optional double beta_1 = 41;
  optional double beta_2 = 42;
  optional double epsilon = 43;
...
@@ -55,32 +56,32 @@ message AdamConfig {
message ConstLrConfig {
  // learninRate Policy
  optional double learning_rate = 1 [ default = 1.0 ];
}

message LinearLrConfig {
  // learninRate Policy
  optional double learning_rate = 1 [ default = 1.0 ];
  optional double lr_decay_a = 2;
  optional double lr_decay_b = 3;
}

message TensorProto {
  enum DataType {
    PADDLE_ELEMENT_TYPE_INT32 = 0;
    PADDLE_ELEMENT_TYPE_UINT32 = 1;
    PADDLE_ELEMENT_TYPE_INT64 = 2;
    PADDLE_ELEMENT_TYPE_UINT64 = 3;
    PADDLE_ELEMENT_TYPE_FLOAT32 = 4;
    PADDLE_ELEMENT_TYPE_FLOAT64 = 5;
  }
  optional DataType data_type = 1;
  repeated bytes content = 2;
}

message LrPolicyState {
  // learninRate Policy
  optional double learning_rate = 1 [ default = 1.0 ];
  optional double lr_decay_a = 2;
  optional double lr_decay_b = 3;
}
...
@@ -104,7 +105,6 @@ message AdadeltaOptimizerState {
  optional TensorProto update_delta = 4;
}

message AdagradOptimizerState {
  optional LrPolicyState lr_state = 101;
  optional double num_sample_passed = 104;
...
@@ -124,10 +124,10 @@ message AdamOptimizerState {
message OptimizerConfig {
  enum Optimizer {
    SGD = 1;
    Adadelta = 2;
    Adagrad = 3;
    Adam = 4;
  }
  optional Optimizer optimizer = 1;
  optional SGDConfig sgd = 3;
...
@@ -136,8 +136,8 @@ message OptimizerConfig {
  optional AdamConfig adam = 6;
  enum LrPolicy {
    Const = 0;
    Linear = 1;
  }
  optional LrPolicy lr_policy = 11;
  optional ConstLrConfig const_lr = 12;
...
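The OptimizerConfig schema above can be exercised from Python once the .proto files are compiled. Below is a minimal sketch, assuming the generated module is importable as OptimizerConfig_pb2 (the module name follows protoc's filename convention and is not part of this diff); the hyper-parameter values are illustrative.

# Minimal sketch: select Adam with a constant learning-rate policy.
# OptimizerConfig_pb2 is an assumed module name, not shown in this diff.
import OptimizerConfig_pb2 as opt_pb2

config = opt_pb2.OptimizerConfig()
config.optimizer = opt_pb2.OptimizerConfig.Adam
config.adam.beta_1 = 0.9        # generally close to 1
config.adam.beta_2 = 0.999
config.adam.epsilon = 1e-8      # fuzz factor, >= 0
config.lr_policy = opt_pb2.OptimizerConfig.Const
config.const_lr.learning_rate = 0.01

# Round-trip the message the way a trainer/pserver pair would exchange it.
restored = opt_pb2.OptimizerConfig.FromString(config.SerializeToString())
assert restored.adam.beta_1 == 0.9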
proto/ParameterConfig.proto
...
@@ -27,56 +27,57 @@ enum ParameterInitStrategy {
message ParameterUpdaterHookConfig {
  // hook type such as 'pruning'
  required string type = 1;
  // this represents the ratio of zero element to be set by the Parameter
  optional double sparsity_ratio = 2 [ default = 0.6 ];
}

message ParameterConfig {
  required string name = 1;
  required uint64 size = 2;
  optional double learning_rate = 3 [ default = 1.0 ];
  optional double momentum = 4 [ default = 0.0 ];
  optional double initial_mean = 5 [ default = 0.0 ];
  optional double initial_std = 6 [ default = 0.01 ];
  // use L2-regularization if decay_rate set and decay_rate_l1 not set
  optional double decay_rate = 7 [ default = 0.0 ];
  // use L1-regularization if decay_rate_l1 set
  optional double decay_rate_l1 = 8 [ default = 0.0 ];
  // dims of Parameter, e.g. dims[0] as height, dims[1] as width..
  repeated uint64 dims = 9;
  // the gpu device which the parameter in.
  // Only used by ParallelNeuralNetork. Ignored otherwise.
  optional int32 device = 10 [ default = -1 ];
  // how to init the parameter: 0 -> normal, 1 -> uniform
  // 0: treat initial_mean as mean, intial_std as standard deviation
  // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std)
  optional int32 initial_strategy = 11 [ default = 0 ];
  // define the variance when init the parameter, by height of the Matrix
  optional bool initial_smart = 12 [ default = false ];
  // apply regularization every # batches
  optional int32 num_batches_regularization = 13 [ default = 1 ];
  // if is_sparse is true, para is sparse, else para is dense
  optional bool is_sparse = 14 [ default = false ];
- // if para is sparse, format should be "csc" or "csr", empty means is not sparse
+ // if para is sparse, format should be "csc" or "csr", empty means is not
+ // sparse
  optional string format = 15 [ default = "" ];
  // sparse remote update or not
  optional bool sparse_remote_update = 16 [ default = false ];
  // gradient clipping threshold, no clipping by default
  optional double gradient_clipping_threshold = 17 [ default = 0.0 ];
  // static parameters are fixed when training
  optional bool is_static = 18 [ default = false ];
  // para_id should NOT be set by config_parser. It is for
  // internal use.
  optional uint64 para_id = 19;
  repeated ParameterUpdaterHookConfig update_hooks = 20;
  // setup load mat -> csr
  optional bool need_compact = 21 [ default = false ];
  // whether to do sparse update for this parameter
  optional bool sparse_update = 22 [ default = false ];
  // whether this parameter is shared or not.
  optional bool is_shared = 23 [ default = false ];
  // parameter block size
  optional uint64 parameter_block_size = 24 [ default = 0 ];
}
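A minimal sketch of filling in one dense ParameterConfig from Python, assuming the generated module is importable as ParameterConfig_pb2 (that module name and the parameter name used here are illustrative assumptions, not part of this diff):

# Sketch only: describe one dense parameter with L2 decay and gradient clipping.
import ParameterConfig_pb2 as param_pb2

p = param_pb2.ParameterConfig()
p.name = "fc.w0"                  # hypothetical parameter name
p.size = 32 * 84
p.dims.extend([32, 84])           # dims[0] height, dims[1] width
p.learning_rate = 1.0
p.decay_rate = 1e-4               # L2, because decay_rate_l1 is left unset
p.gradient_clipping_threshold = 5.0
p.initial_strategy = 0            # 0 -> normal(initial_mean, initial_std)
p.initial_std = 0.01

print(p.IsInitialized())          # True: both required fields (name, size) are set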
proto/ParameterServerConfig.proto
...
@@ -15,13 +15,10 @@ syntax = "proto2";
package paddle;

/**
 * Configuration structure for ParameterClient2.
 */
-message ParameterClientConfig {
-  required int32 trainer_id = 1;
-}
+message ParameterClientConfig { required int32 trainer_id = 1; }

/**
 * Configuration structure for ParameterServer2.
...
@@ -30,24 +27,24 @@ message ParameterServerConfig {
  // Number of ports for sending dense parameter,
  // following ports on parameter server will be visited
  // for sending dense parameter: [port, port+ports_num-1]
  required int32 ports_num = 1 [ default = 1 ];
  // Number of ports for sending sparse parameter,
  // following ports on parameter server will be visited
  // for sending sparse parameter:
  // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
  required int32 ports_num_for_sparse = 2 [ default = 0 ];
  // network device name for pservers
  required string nics = 3 [ default = "xgbe0,xgbe1" ];
  required string rdma_tcp = 4 [ default = "tcp" ];
  // Listening port for pserver
  required int32 port = 5 [ default = 20134 ];
  // number of gradient servers
  required int32 num_gradient_servers = 6 [ default = 1 ];
  // number of threads for sync op exec
  required int32 pserver_num_threads = 7 [ default = 1 ];
  // control config_.async_lagged_grad_discard_ratio() min value
  required double async_lagged_ratio_min = 8 [ default = 1.0 ];
  // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
  // use it as defalut value
  required double async_lagged_ratio_default = 9 [ default = 1.5 ];
}
\ No newline at end of file
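The comments above spell out how the port block is laid out on a parameter server. A small Python check of that arithmetic follows; only port's default of 20134 comes from the config, while the counts 2 and 3 are chosen purely for the example.

# Worked example of the port ranges described in the comments above.
port, ports_num, ports_num_for_sparse = 20134, 2, 3

dense_ports = list(range(port, port + ports_num))
sparse_ports = list(range(port + ports_num,
                          port + ports_num + ports_num_for_sparse))

print(dense_ports)   # [20134, 20135]
print(sparse_ports)  # [20136, 20137, 20138]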
proto/ParameterService.proto
...
@@ -23,8 +23,8 @@ package paddle;
 */
enum ParameterUpdateMode {
  // Set parameter
  PSERVER_UPDATE_MODE_SET_PARAM = 0;      // use local param
  PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param
  // Update parameter once a gradient is received
  PSERVER_UPDATE_MODE_ASYNC_SGD = 2;
...
@@ -37,7 +37,7 @@ enum ParameterUpdateMode {
  // No update. Only get parameters back.
  PSERVER_UPDATE_MODE_GET_PARAM = 5;
  PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows
};

message ParameterBlock {
...
@@ -80,42 +80,34 @@ message SendParameterRequest {
  optional int32 trainer_id = 7;
  // send back parameter type on pserver, PARAMETER_VALUE by default
  optional int32 send_back_parameter_type = 8 [ default = 0 ];
  // forwardbackward time in usec
  optional uint64 forwardbackward_time = 9;
}

-message WaitPassStartRequest {
-}
+message WaitPassStartRequest {}

-message WaitPassStartResponse {
-}
+message WaitPassStartResponse {}

-message WaitPassFinishRequest {
-}
+message WaitPassFinishRequest {}

-message WaitPassFinishResponse {
-}
+message WaitPassFinishResponse {}

enum SyncObject {
  SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_
  SYNC_DATA = 1;    // wait for the synchronizeDataBarrier_
}

message SynchronizeRequest {
  required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ];
  optional int32 trainer_id = 2;
}

-message SynchronizeResponse {
-}
+message SynchronizeResponse {}

-message SendParameterResponse {
-  repeated ParameterBlock blocks = 1;
-}
+message SendParameterResponse { repeated ParameterBlock blocks = 1; }

message SetConfigRequest {
  repeated ParameterConfig param_configs = 1;
...
@@ -125,26 +117,18 @@ message SetConfigRequest {
  required bool is_sparse_server = 6;
}

-message SetConfigResponse {
-}
+message SetConfigResponse {}

-message GetStatusRequest {
-}
+message GetStatusRequest {}

-message GetStatusResponse {
-  required PServerStatus status = 1;
-}
+message GetStatusResponse { required PServerStatus status = 1; }

-message SetStatusRequest {
-  required PServerStatus status = 1;
-}
+message SetStatusRequest { required PServerStatus status = 1; }

-message SetStatusResponse {
-}
+message SetStatusResponse {}

// create a column vector. The size is the dimension of parameter
-message CreateVectorRequest {
-}
+message CreateVectorRequest {}

message CreateVectorResponse {
  // error message. Empty if success
...
@@ -153,9 +137,7 @@ message CreateVectorResponse {
  required int64 handle = 2;
}

-message ReleaseVectorRequest {
-  required int64 handle = 1;
-}
+message ReleaseVectorRequest { required int64 handle = 1; }

message ReleaseVectorResponse {
  // error message. Empty if success
...
@@ -164,9 +146,7 @@ message ReleaseVectorResponse {
// Create a column major matrix. The number of rows is the dimension
// of parameter. The number of columns is specifed by num_cols
-message CreateMatrixRequest {
-  required int32 num_cols = 1;
-}
+message CreateMatrixRequest { required int32 num_cols = 1; }

message CreateMatrixResponse {
  // error message. Empty if success
...
@@ -175,16 +155,13 @@ message CreateMatrixResponse {
  required int64 handle = 2;
}

-message ReleaseMatrixRequest {
-  required int64 handle = 1;
-}
+message ReleaseMatrixRequest { required int64 handle = 1; }

message ReleaseMatrixResponse {
  // error message. Empty if success
  optional string return_message = 1;
}

/**
 * The operations are defined using the variables commented at Operation
 * and OperationResult
...
@@ -245,36 +222,36 @@ enum MatrixVectorOperation {
message ProtoVector {
  required int64 dim = 1;
  repeated double values = 2 [ packed = true ];
}

message ProtoMatrix {
  required int64 num_rows = 1;
  required int64 num_cols = 2;
  repeated double values = 3 [ packed = true ];
}

message Operation {
  required MatrixVectorOperation operation = 1;
  // vector handles created on the pserver
  repeated int64 pvectors = 2; // u, v, w
  // matrix handles created on the pserver
  repeated int64 pmatrices = 3;      // A, B, C
  repeated double scalars = 4;       // a, b, c
  repeated ProtoVector vectors = 5;  // x, y, z
  repeated ProtoMatrix matrices = 6; // X, Y, Z
}

message OperationResult {
  // error message. Empty if success
  optional string return_message = 1;
  //
  repeated double scalars = 2;       // d, e, f
  repeated ProtoVector vectors = 3;  // p, q, r
  repeated ProtoMatrix matrices = 4; // P, Q, R
}

message DoOperationRequest {
...
@@ -301,18 +278,14 @@ message DoOperationResponse {
  required bool pass_finish = 3;
}

-message LoadValueRequest {
-  required string dir_name = 1;
-}
+message LoadValueRequest { required string dir_name = 1; }

message LoadValueResponse {
  // error message. Empty if success
  optional string return_message = 1;
}

-message SaveValueRequest {
-  required string dir_name = 1;
-}
+message SaveValueRequest { required string dir_name = 1; }

message SaveValueResponse {
  // error message. Empty if success
...
@@ -331,11 +304,11 @@ enum DataUpdateMode {
  // Client send it's own ref label to pserver
  DATA_UPDATE_MODE_SET_REF_LABEL = 4;
  // Client get all ref labels from all pservers
  DATA_UPDATE_MODE_GET_REF_LABEL = 5;
  // Client send it's own ref grad to pserver
  DATA_UPDATE_MODE_SET_REF_GRAD = 6;
  // Client get all ref grad from all pservers
  DATA_UPDATE_MODE_GET_REF_GRAD = 7;
}

enum SendDataType {
...
@@ -360,7 +333,7 @@ message DataBlock {
  // byte size of one data type
  required int32 data_size = 2;
  // data_type
  optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ];
}

message SendDataRequest {
...
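The Operation / OperationResult pair above is a small calculator protocol: handles refer to vectors and matrices already created on the pserver, while literal scalars, vectors, and matrices ride along in the request. A minimal sketch of filling an Operation follows; the generated module name ParameterService_pb2 is an assumption, and the concrete MatrixVectorOperation values are elided from this hunk, so none is named here.

# Sketch only: touches messages whose full definitions appear in the hunk above.
import ParameterService_pb2 as ps_pb2

# An inline vector argument (one of the "x, y, z" slots of Operation.vectors).
vec = ps_pb2.ProtoVector(dim=3, values=[1.0, 2.0, 3.0])

# An inline 2x2 column-major matrix (one of the "X, Y, Z" slots of Operation.matrices).
mat = ps_pb2.ProtoMatrix(num_rows=2, num_cols=2, values=[1.0, 0.0, 0.0, 1.0])

op = ps_pb2.Operation()
op.pvectors.extend([101])        # u: a handle returned in CreateVectorResponse.handle
op.scalars.append(0.5)           # a: a literal scalar operand
op.vectors.add().CopyFrom(vec)   # x
op.matrices.add().CopyFrom(mat)  # X
# Operation.operation (a MatrixVectorOperation value) must still be chosen;
# its members are not shown in this diff, so it is left unset in this sketch.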
proto/TrainerConfig.proto
...
@@ -20,14 +20,14 @@ package paddle;
message OptimizationConfig {
  required int32 batch_size = 3;
  required string algorithm = 4 [ default = "async_sgd" ];
  optional int32 num_batches_per_send_parameter = 5 [ default = 1 ];
  optional int32 num_batches_per_get_parameter = 6 [ default = 1 ];

  required double learning_rate = 7;
  optional double learning_rate_decay_a = 8 [ default = 0 ];
  optional double learning_rate_decay_b = 9 [ default = 0 ];
  optional string learning_rate_schedule = 27 [ default = "constant" ];
  // learning rate will be scaled according to learning_rate_schedule
  // 1), constant:
  // lr = learning_rate
...
@@ -49,88 +49,92 @@ message OptimizationConfig {
  // owlqn related
  // L1-regularization
  optional double l1weight = 10 [ default = 0.1 ];
  // L2-regularization
  optional double l2weight = 11 [ default = 0 ];
  // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
  // then accept the step
  optional double c1 = 12 [ default = 0.0001 ];
  // multiply the step with "backoff", when wolfe condition doesn't satisfy
  optional double backoff = 13 [ default = 0.5 ];
  // how many "s"s and "y"s are kept in owlqn
  optional int32 owlqn_steps = 14 [ default = 10 ];
  // accept the step if encountered "max_backoff" times of "reduce the step"
  optional int32 max_backoff = 15 [ default = 5 ];
  // L2-regularization coefficient is reduced linearly from iteration 0 to
  // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter"
  // iterations. set "l2weight_zero_iter" to 0 to disable this strategy.
  optional int32 l2weight_zero_iter = 17 [ default = 0 ];

  // averaged sgd
  // About average_window * numBatchProcessed parameter are used
  // for average. To be accurate, between average_window * numBatchProcessed
  // and 2 * average_window * numBatchProcessed parameters are used for
  // average.
  optional double average_window = 18 [ default = 0 ];
  optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ];

  //////////////////////////
  // Options Adaptive SGD //
  //////////////////////////

- // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", "rmsprop"
- // default learning method("momentum") use global decayed learning rate with momentum.
+ // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta",
+ // "rmsprop"
+ // default learning method("momentum") use global decayed learning rate with
+ // momentum.
  // "adagrad", "adadelta" and "rmsprop" can set momentum too.
  optional string learning_method = 23 [ default = "momentum" ];
  optional double ada_epsilon = 24 [ default = 1e-6 ];
  optional double ada_rou = 26 [ default = 0.95 ];

  // Force to do average in cpu in order to save gpu memory usage
  optional bool do_average_in_cpu = 25 [ default = false ];

  // delta add rate in pserver, used while num_batches_per_send_parameter>1
  // will be divided by #machines automatically.
  optional double delta_add_rate = 28 [ default = 1.0 ];

  // We split a large size into smaller mini-batches, whose sizes are
  // determined by mini_batch_size. It only takes effect when there is
  // an ExternalMachine.
  optional int32 mini_batch_size = 29 [ default = 128 ];

  // automatically set if any one of parameters set sparse remote update flag
  optional bool use_sparse_remote_updater = 30 [ default = false ];

  // how to update center parameter and feedback to local parameter,
  // when use local sgd update in cluster training.
- // A option is elastic_average, proposed by the paper: Deep learning with elastic averaging SGD.
- // If use elastic_average method, every trainer node should sample from whole data sets.
+ // A option is elastic_average, proposed by the paper: Deep learning with
+ // elastic averaging SGD.
+ // If use elastic_average method, every trainer node should sample from whole
+ // data sets.
  optional string center_parameter_update_method = 31 [ default = "average" ];

  // shrink sparse parameter value
  // only works if parameter is remote sparse update and has L1 decay rate
  optional double shrink_parameter_value = 32 [ default = 0 ];

  ////////////////////////////
  // Options Adam Optimizer //
  ////////////////////////////
  optional double adam_beta1 = 33 [ default = 0.9 ];
  optional double adam_beta2 = 34 [ default = 0.999 ];
  optional double adam_epsilon = 35 [ default = 1e-8 ];

  // arguments for learning rate scheduler
  // Format: num1:rate1,num2:rate2,...,numK:rateK
  // For learning_rate_schedule="manual", num is the number of samples,
  // For learning_rate_schedule="pass_manual",
  // num is the number of passes (starting from 0)
  optional string learning_rate_args = 36 [ default = "" ];

  // for async sgd gradient commit control.
  // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
  // current async gradient will be discard silently.
  optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ];

  // global threshold for gradient clipping
  optional double gradient_clipping_threshold = 38 [ default = 0.0 ];
};

message TrainerConfig {
...
@@ -141,7 +145,7 @@ message TrainerConfig {
  repeated string config_files = 5;

  // the directory to save/load model files for each training path
  optional string save_dir = 6 [ default = "./output/model" ];

  // Path of the initial model parameters.
  // If it was set, start_pass will be ignored.
...
@@ -149,7 +153,7 @@ message TrainerConfig {
  // Start training from this pass.
  // Will load parameter from the previous pass.
  optional int32 start_pass = 8 [ default = 0 ];

  // file path to the trainer config file
  optional string config_file = 9;
...
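The learning_rate_args comment above specifies a "num1:rate1,num2:rate2,..." schedule string. Below is an illustrative Python sketch of parsing such a string and looking up the scale for a given pass under learning_rate_schedule="pass_manual"; the helper functions are made up for the example, and the exact boundary semantics of the real scheduler are not shown in this diff.

# Illustrative only: parse a "num:rate" schedule and apply it to a pass index.
def parse_lr_args(learning_rate_args):
    pairs = [item.split(":") for item in learning_rate_args.split(",") if item]
    return [(int(num), float(rate)) for num, rate in pairs]

def scheduled_rate(schedule, pass_id):
    # Use the rate of the first segment whose pass threshold has not been exceeded.
    for threshold, rate in schedule:
        if pass_id <= threshold:
            return rate
    return schedule[-1][1]

schedule = parse_lr_args("2:1.0,5:0.5,8:0.1")
base_lr = 0.01
print(base_lr * scheduled_rate(schedule, pass_id=0))  # 0.01
print(base_lr * scheduled_rate(schedule, pass_id=6))  # 0.001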
python/paddle/v2/framework/create_op_creation_methods.py
import paddle.v2.framework.core as core
import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
-import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
+import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2
import cStringIO
...
@@ -57,7 +57,7 @@ class OpDescCreationMethod(object):
            op_desc.attrs.extend([out_format])
        if len(tmp_index) != 0:
            tmp_index_attr = op_desc.attrs.add()
-           tmp_index_attr.type = attr_type_pb2.INTS
+           tmp_index_attr.type = attribute_pb2.INTS
            tmp_index_attr.name = "temporary_index"
            tmp_index_attr.ints.extend(tmp_index)
...
@@ -73,17 +73,17 @@ class OpDescCreationMethod(object):
                new_attr = op_desc.attrs.add()
                new_attr.name = attr.name
                new_attr.type = attr.type
-               if attr.type == attr_type_pb2.INT:
+               if attr.type == attribute_pb2.INT:
                    new_attr.i = user_defined_attr
-               elif attr.type == attr_type_pb2.FLOAT:
+               elif attr.type == attribute_pb2.FLOAT:
                    new_attr.f = user_defined_attr
-               elif attr.type == attr_type_pb2.STRING:
+               elif attr.type == attribute_pb2.STRING:
                    new_attr.s = user_defined_attr
-               elif attr.type == attr_type_pb2.INTS:
+               elif attr.type == attribute_pb2.INTS:
                    new_attr.ints.extend(user_defined_attr)
-               elif attr.type == attr_type_pb2.FLOATS:
+               elif attr.type == attribute_pb2.FLOATS:
                    new_attr.floats.extend(user_defined_attr)
-               elif attr.type == attr_type_pb2.STRINGS:
+               elif attr.type == attribute_pb2.STRINGS:
                    new_attr.strings.extend(user_defined_attr)
                else:
                    raise NotImplementedError("Not support attribute type " +
...
@@ -109,7 +109,7 @@ class OpDescCreationMethod(object):
        retv = []
        if multiple:
            var_format = op_desc_pb2.AttrDesc()
-           var_format.type = attr_type_pb2.INTS
+           var_format.type = attribute_pb2.INTS
            var_format.name = "%s_format" % in_out
            var_format.ints.append(0)
...
@@ -185,17 +185,17 @@ def get_docstring_from_op_proto(op_proto):
    for attr in op_proto.attrs:
        attr_type = None
-       if attr.type == attr_type_pb2.INT:
+       if attr.type == attribute_pb2.INT:
            attr_type = "int"
-       elif attr.type == attr_type_pb2.FLOAT:
+       elif attr.type == attribute_pb2.FLOAT:
            attr_type = "float"
-       elif attr.type == attr_type_pb2.STRING:
+       elif attr.type == attribute_pb2.STRING:
            attr_type = "basestr"
-       elif attr.type == attr_type_pb2.INTS:
+       elif attr.type == attribute_pb2.INTS:
            attr_type = "list of int"
-       elif attr.type == attr_type_pb2.FLOATS:
+       elif attr.type == attribute_pb2.FLOATS:
            attr_type = "list of float"
-       elif attr.type == attr_type_pb2.STRINGS:
+       elif attr.type == attribute_pb2.STRINGS:
            attr_type = "list of basestr"
        if attr_type is None:
...
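The renamed attribute_pb2 constants drive the if/elif dispatch above when user-supplied attribute values are copied into an op description. A minimal sketch of that dispatch, using only names that appear in this file (the "scale" attribute itself is hypothetical):

# Sketch: fill one AttrDesc according to its declared type, mirroring
# OpDescCreationMethod above.
import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2

new_attr = op_desc_pb2.AttrDesc()
new_attr.name = "scale"                 # hypothetical attribute name
new_attr.type = attribute_pb2.FLOAT
if new_attr.type == attribute_pb2.FLOAT:
    new_attr.f = 0.5                    # scalar floats go into the .f slot
elif new_attr.type == attribute_pb2.INTS:
    new_attr.ints.extend([0, 1, 2])     # repeated ints use .extend

print(new_attr)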
python/paddle/v2/framework/tests/CMakeLists.txt
-add_python_test(test_framework
-    test_protobuf.py test_scope.py
-    test_default_scope_funcs.py test_op_creation_methods.py
-    test_net.py test_tensor.py
-    test_fc_op.py test_add_two_op.py
-    test_sgd_op.py test_mul_op.py
-    test_mean_op.py test_sigmoid_op.py
-    test_softmax_op.py test_rowwise_add_op.py
-    test_network.py gradient_checker.py)
+py_test(test_net SRCS test_net.py)
+py_test(test_fc_op SRCS test_fc_op.py)
+py_test(test_scope SRCS test_scope.py)
+py_test(test_tensor SRCS test_tensor.py)
+py_test(test_mul_op SRCS test_mul_op.py)
+py_test(test_network SRCS test_network.py)
+py_test(test_mean_op SRCS test_mean_op.py)
+py_test(test_protobuf SRCS test_protobuf.py)
+py_test(test_add_two_op SRCS test_add_two_op.py)
+py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
+py_test(test_softmax_op SRCS test_softmax_op.py)
+py_test(gradient_checker SRCS gradient_checker.py)
+py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
+py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
+py_test(test_op_creation_methods SRCS test_op_creation_methods.py)
python/paddle/v2/framework/tests/op_test_util.py
...
@@ -33,23 +33,28 @@ class OpTestMeta(type):
            for place in places:
                for in_name in func.all_input_args:
-                   if hasattr(self, in_name):
+                   if hasattr(self, "inputs") and in_name in self.inputs:
                        kwargs[in_name] = in_name
                        var = scope.new_var(in_name).get_tensor()
-                       arr = getattr(self, in_name)
+                       arr = self.inputs[in_name]
                        var.set_dims(arr.shape)
                        var.set(arr, place)
                    else:
                        kwargs[in_name] = "@EMPTY@"

                for out_name in func.all_output_args:
-                   if hasattr(self, out_name):
-                       kwargs[out_name] = out_name
-                       scope.new_var(out_name).get_tensor()
+                   if not hasattr(self, "outputs"):
+                       raise ValueError(
+                           "The test op must set self.outputs dict.")
+                   if out_name not in self.outputs:
+                       raise ValueError("The %s is not in self.outputs dict." %
+                                        (out_name))
+                   kwargs[out_name] = out_name
+                   scope.new_var(out_name).get_tensor()

                for attr_name in func.all_attr_args:
-                   if hasattr(self, attr_name):
-                       kwargs[attr_name] = getattr(self, attr_name)
+                   if hasattr(self, "attrs") and attr_name in self.attrs:
+                       kwargs[attr_name] = self.attrs[attr_name]

                op = func(**kwargs)
...
@@ -60,11 +65,8 @@ class OpTestMeta(type):
                for out_name in func.all_output_args:
                    actual = numpy.array(scope.find_var(out_name).get_tensor())
-                   expect = getattr(self, out_name)
-                   numpy.isclose(actual, expect)
+                   expect = self.outputs[out_name]
+                   # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul
+                   # has some diff, and could not pass unittest. So I set decimal 3 here.
+                   # And I will check this in future.
+                   numpy.testing.assert_almost_equal(actual, expect, decimal=3)

        obj.test_all = test_all
        return obj
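With this change, a test driven by OpTestMeta supplies its data through plain dicts named self.inputs, self.outputs and, optionally, self.attrs. A minimal sketch of the new convention follows; the "scale" op and its arguments are made up for illustration, and the import line mirrors the layout of the other tests in this directory (real examples appear right below, e.g. test_add_two_op.py and test_sgd_op.py).

# Sketch of the new test convention with a hypothetical "scale" op.
import unittest
import numpy as np
from op_test_util import OpTestMeta


class TestScaleOp(unittest.TestCase):
    __metaclass__ = OpTestMeta

    def setUp(self):
        self.type = "scale"                      # hypothetical op name
        self.inputs = {'X': np.random.random((4, 8)).astype("float32")}
        self.attrs = {'scale': 2.0}
        self.outputs = {'Out': self.inputs['X'] * self.attrs['scale']}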
python/paddle/v2/framework/tests/test_add_two_op.py
...
@@ -12,9 +12,11 @@ class TestAddOp(unittest.TestCase):
    def setUp(self):
        self.type = "add_two"
-       self.X = numpy.random.random((102, 105)).astype("float32")
-       self.Y = numpy.random.random((102, 105)).astype("float32")
-       self.Out = self.X + self.Y
+       self.inputs = {
+           'X': numpy.random.random((102, 105)).astype("float32"),
+           'Y': numpy.random.random((102, 105)).astype("float32")
+       }
+       self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}


class TestAddGradOp(unittest.TestCase):
...
python/paddle/v2/framework/tests/test_cross_entropy_op.py
...
@@ -7,16 +7,20 @@ class TestSGD(unittest.TestCase):
    __metaclass__ = OpTestMeta

    def setUp(self):
-       # TODO this unit test is not passed
        self.type = "onehot_cross_entropy"
        batch_size = 100
        class_num = 10
-       self.X = numpy.random.random((batch_size, class_num)).astype("float32")
-       self.label = 5 * numpy.ones(batch_size).astype("int32")
+       X = numpy.random.random((batch_size, class_num)).astype("float32")
+       label = 5 * numpy.ones(batch_size).astype("int32")
+       self.inputs = {'X': X, 'label': label}
        Y = []
        for i in range(0, batch_size):
-           Y.append(-numpy.log(self.X[i][self.label[i]]))
-       self.Y = numpy.array(Y).astype("float32")
+           Y.append(-numpy.log(X[i][label[i]]))
+       self.outputs = {'Y': numpy.array(Y).astype("float32")}
+       # TODO(superjom) add gradient check


if __name__ == "__main__":
    unittest.main()
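The expected output above is the one-hot cross entropy Y[i] = -log(X[i][label[i]]). A tiny numeric check, with values chosen only for illustration:

# Tiny illustration of Y[i] = -log(X[i][label[i]]) as computed in the test above.
import numpy

X = numpy.array([[0.1, 0.2, 0.7]], dtype="float32")  # one sample, 3 classes
label = numpy.array([2], dtype="int32")               # true class index

Y = -numpy.log(X[0][label[0]])
print(Y)  # ~0.3567, i.e. -log(0.7)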
python/paddle/v2/framework/tests/test_mean_op.py
...
@@ -8,8 +8,8 @@ class TestMeanOp(unittest.TestCase):
    def setUp(self):
        self.type = "mean"
-       self.X = np.random.random((32, 784)).astype("float32")
-       self.Out = np.mean(self.X)
+       self.inputs = {'X': np.random.random((32, 784)).astype("float32")}
+       self.outputs = {'Out': np.mean(self.inputs['X'])}


if __name__ == '__main__':
...
python/paddle/v2/framework/tests/test_mul_op.py
...
@@ -8,9 +8,11 @@ class TestMulOp(unittest.TestCase):
    def setUp(self):
        self.type = "mul"
-       self.X = np.random.random((32, 84)).astype("float32")
-       self.Y = np.random.random((84, 100)).astype("float32")
-       self.Out = np.dot(self.X, self.Y)
+       self.inputs = {
+           'X': np.random.random((32, 84)).astype("float32"),
+           'Y': np.random.random((84, 100)).astype("float32")
+       }
+       self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}


if __name__ == '__main__':
...
python/paddle/v2/framework/tests/test_op_creation_methods.py
...
@@ -3,7 +3,7 @@ import paddle.v2.framework.create_op_creation_methods as creation
import paddle.v2.framework.core as core
import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
-import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
+import paddle.v2.framework.proto.attribute_pb2 as attribute_pb2


class TestGetAllProtos(unittest.TestCase):
...
@@ -76,7 +76,7 @@ class TestOpDescCreationMethod(unittest.TestCase):
        expected1.type = 'fc'
        attr = expected1.attrs.add()
        attr.name = 'input_format'
-       attr.type = attr_type_pb2.INTS
+       attr.type = attribute_pb2.INTS
        attr.ints.extend([0, 1, 2, 3])
        self.assertEqual(expected1, generated1)
...
@@ -88,7 +88,7 @@ class TestOpDescCreationMethod(unittest.TestCase):
        expected2.type = 'fc'
        attr = expected2.attrs.add()
        attr.name = 'input_format'
-       attr.type = attr_type_pb2.INTS
+       attr.type = attribute_pb2.INTS
        attr.ints.extend([0, 3, 6, 7])
        self.assertEqual(expected2, generated2)
...
@@ -105,12 +105,12 @@ class TestOpDescCreationMethod(unittest.TestCase):
            attr.comment = ""
            attr.type = type

-       __add_attr__("int_attr", attr_type_pb2.INT)
-       __add_attr__("float_attr", attr_type_pb2.FLOAT)
-       __add_attr__("string_attr", attr_type_pb2.STRING)
-       __add_attr__("ints_attr", attr_type_pb2.INTS)
-       __add_attr__("floats_attr", attr_type_pb2.FLOATS)
-       __add_attr__("strings_attr", attr_type_pb2.STRINGS)
+       __add_attr__("int_attr", attribute_pb2.INT)
+       __add_attr__("float_attr", attribute_pb2.FLOAT)
+       __add_attr__("string_attr", attribute_pb2.STRING)
+       __add_attr__("ints_attr", attribute_pb2.INTS)
+       __add_attr__("floats_attr", attribute_pb2.FLOATS)
+       __add_attr__("strings_attr", attribute_pb2.STRINGS)

        op.comment = ""
        self.assertTrue(op.IsInitialized())
...
@@ -131,32 +131,32 @@ class TestOpDescCreationMethod(unittest.TestCase):
        expected.inputs.extend(['a'])
        attr = expected.attrs.add()
        attr.name = "int_attr"
-       attr.type = attr_type_pb2.INT
+       attr.type = attribute_pb2.INT
        attr.i = 10

        attr = expected.attrs.add()
        attr.name = "float_attr"
-       attr.type = attr_type_pb2.FLOAT
+       attr.type = attribute_pb2.FLOAT
        attr.f = 3.2

        attr = expected.attrs.add()
        attr.name = "string_attr"
-       attr.type = attr_type_pb2.STRING
+       attr.type = attribute_pb2.STRING
        attr.s = "test_str"

        attr = expected.attrs.add()
        attr.name = "ints_attr"
-       attr.type = attr_type_pb2.INTS
+       attr.type = attribute_pb2.INTS
        attr.ints.extend([0, 1, 2, 3, 4])

        attr = expected.attrs.add()
        attr.name = 'floats_attr'
-       attr.type = attr_type_pb2.FLOATS
+       attr.type = attribute_pb2.FLOATS
        attr.floats.extend([0.2, 3.2, 4.5])

        attr = expected.attrs.add()
        attr.name = 'strings_attr'
-       attr.type = attr_type_pb2.STRINGS
+       attr.type = attribute_pb2.STRINGS
        attr.strings.extend(['a', 'b', 'c'])

        self.assertEqual(expected, generated)
...
@@ -185,7 +185,7 @@ class TestOpDescCreationMethod(unittest.TestCase):
        desc.type = "test"
        attr = desc.attrs.add()
        attr.name = "temporary_index"
-       attr.type = attr_type_pb2.INTS
+       attr.type = attribute_pb2.INTS
        attr.ints.append(2)
        self.assertEqual(generated, desc)
...
@@ -219,7 +219,7 @@ This op is used for unit test, not a real op.
        test_str = op.attrs.add()
        test_str.name = "str_attr"
-       test_str.type = attr_type_pb2.STRING
+       test_str.type = attribute_pb2.STRING
        test_str.comment = "A string attribute for test op"

        actual = creation.get_docstring_from_op_proto(op)
...
python/paddle/v2/framework/tests/test_protobuf.py
-import paddle.v2.framework.proto.op_proto_pb2
-import paddle.v2.framework.proto.attr_type_pb2
+import paddle.v2.framework.proto.op_proto_pb2 as op_proto_lib
+import paddle.v2.framework.proto.attribute_pb2 as attr_type_lib
import unittest


class TestFrameworkProto(unittest.TestCase):
    def test_all(self):
-       op_proto_lib = paddle.v2.framework.proto.op_proto_pb2
-       attr_type_lib = paddle.v2.framework.proto.attr_type_pb2
        op_proto = op_proto_lib.OpProto()
        ipt0 = op_proto.inputs.add()
        ipt0.name = "a"
...
python/paddle/v2/framework/tests/test_recurrent_op.py
+import logging
import paddle.v2.framework.core as core
import unittest
import numpy as np
...
@@ -7,10 +8,9 @@ ops = creation.op_creations
def create_tensor(scope, name, shape):
-   tensor = scope.create_var(name).get_tensor()
+   tensor = scope.new_var(name).get_tensor()
    tensor.set_dims(shape)
-   tensor.alloc_float()
-   tensor.set(np.random.random(shape))
+   tensor.set(np.random.random(shape), core.CPUPlace())
    return tensor
...
@@ -31,40 +31,36 @@ class TestRNN(unittest.TestCase):
    - h
    '''

+   input_dim = 30
+   batch_size = 50
+   weight_dim = 15
+   sent_len = 11
+
    def init(self):
-       input_dim = 30
-       batch_size = 50
-       weight_dim = 15
-
-       self.scope = core.Scope(None)
-
-       # create vars
-       create_tensor(self.scope, "x", [batch_size, input_dim])
-       create_tensor(self.scope, "W", [input_dim, weight_dim])
-       create_tensor(self.scope, "U", [weight_dim, weight_dim])
-       create_tensor(self.scope, "h_boot", [batch_size, weight_dim])
-
-       x_alias = "x@alias"
-       y_alias = "y@alias"
-       memory = "h@alias"
-       prememory = "h@pre"
-       output = "rnn_out"
-       output_alias = "rnn_out@alias"
-
-       # create step net
-       stepnet_var = self.scope.create_var("stepnet")
-       stepnet = stepnet_var.get_net()
-       # stepnet = core.Net.create()
-       x_fc_op = ops.fc(X=x_alias, W="W", Y="Wx")
-       h_fc_op = ops.fc(X=prememory, W="U", Y="Uh")
-       sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum")
-       sig_op = ops.sigmoid(X="sum", Y=memory)
-       stepnet.add_op(x_fc_op)
-       stepnet.add_op(h_fc_op)
-       stepnet.add_op(sum_op)
-       stepnet.add_op(sig_op)
-       stepnet.complete_add_op(True)
+       self.scope = core.Scope()
+
+       self.create_global_variables()
+       self.create_step_net()
+       rnn_op = self.create_rnn_op()
+       ctx = core.DeviceContext.create(core.CPUPlace())
+       print 'infer_shape'
+       rnn_op.infer_shape(self.scope)
+       rnn_op.run(self.scope, ctx)
+
+   def create_global_variables(self):
+       # create inlink
+       create_tensor(self.scope, "x",
+                     [self.sent_len, self.batch_size, self.input_dim])
+       create_tensor(self.scope, "W", [self.input_dim, self.input_dim])
+       create_tensor(self.scope, "U", [self.input_dim, self.input_dim])
+       create_tensor(self.scope, "h_boot", [self.batch_size, self.input_dim])
+       self.scope.new_var("step_scopes")
+       self.scope.new_var("h@alias")
+       self.scope.new_var("h")
+
+   def create_rnn_op(self):
        # create RNNOp
        rnnop = ops.recurrent_op(
            # inputs
...
@@ -72,17 +68,27 @@ class TestRNN(unittest.TestCase):
            boot_memories=["h_boot"],
            step_net="stepnet",
            # outputs
-           outlinks=[output],
+           outlinks=["h"],
            step_scopes="step_scopes",
            # attributes
            inlink_alias=["x@alias"],
-           outlink_alias=[output_alias],
-           pre_memories=[prememory],
-           memories=[memory])
-
-       ctx = core.DeviceContext.cpu_context()
-       rnnop.infer_shape(self.scope)
-       rnnop.run(self.scope, ctx)
+           outlink_alias=["h@alias"],
+           pre_memories=["h@pre"],
+           memories=["h@alias"])
+       return rnnop
+
+   def create_step_net(self):
+       var = self.scope.new_var("stepnet")
+       stepnet = var.get_net()
+
+       x_fc_op = ops.fc(X="x@alias", W="W", Y="Wx")
+       h_fc_op = ops.fc(X="h@pre", W="U", Y="Uh")
+       sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum")
+       sig_op = ops.sigmoid(X="sum", Y="h@alias")
+
+       for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
+           stepnet.add_op(op)
+       stepnet.complete_add_op(True)

    def test_recurrent(self):
        self.init()
...
python/paddle/v2/framework/tests/test_rowwise_add_op.py
...
@@ -8,9 +8,11 @@ class TestRowwiseAddOp(unittest.TestCase):
    def setUp(self):
        self.type = "rowwise_add"
-       self.X = np.random.random((32, 84)).astype("float32")
-       self.b = np.random.random(84).astype("float32")
-       self.Out = np.add(self.X, self.b)
+       self.inputs = {
+           'X': np.random.random((32, 84)).astype("float32"),
+           'b': np.random.random(84).astype("float32")
+       }
+       self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}


if __name__ == '__main__':
...
python/paddle/v2/framework/tests/test_sgd_op.py
...
@@ -8,10 +8,13 @@ class TestSGD(unittest.TestCase):
    def setUp(self):
        self.type = "sgd"
-       self.param = numpy.random.random((102, 105)).astype("float32")
-       self.grad = numpy.random.random((102, 105)).astype("float32")
-       self.learning_rate = 0.1
-       self.param_out = self.param - self.learning_rate * self.grad
+       w = numpy.random.random((102, 105)).astype("float32")
+       g = numpy.random.random((102, 105)).astype("float32")
+       lr = 0.1
+       self.inputs = {'param': w, 'grad': g}
+       self.attrs = {'learning_rate': lr}
+       self.outputs = {'param_out': w - lr * g}


if __name__ == "__main__":
...
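The expected output above is the plain SGD update param_out = param - learning_rate * grad. A scalar-sized check, with numbers chosen only for illustration:

# Tiny illustration of param_out = param - learning_rate * grad.
import numpy

w = numpy.array([0.5, -1.0], dtype="float32")
g = numpy.array([0.2, 0.4], dtype="float32")
lr = 0.1

print(w - lr * g)  # approximately [0.48, -1.04]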
python/paddle/v2/framework/tests/test_sigmoid_op.py
...
@@ -8,9 +8,12 @@ class TestSigmoidOp(unittest.TestCase):
    def setUp(self):
        self.type = "sigmoid"
-       self.X = np.random.random((32, 100)).astype("float32")
-       self.Y = 1 / (1 + np.exp(-self.X))
+       self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
+       self.outputs = {'Y': 1 / (1 + np.exp(-self.inputs['X']))}


+#class TestSigmoidGradOp(unittest.TestCase):
+#TODO(qingqing) add unit test

if __name__ == '__main__':
    unittest.main()
python/paddle/v2/framework/tests/test_softmax_op.py
...
@@ -19,8 +19,10 @@ class TestSoftmaxOp(unittest.TestCase):
    def setUp(self):
        self.type = "softmax"
-       self.X = np.random.random((32, 100)).astype("float32")
-       self.Y = np.apply_along_axis(stable_softmax, 1, self.X)
+       self.inputs = {'X': np.random.random((32, 100)).astype("float32")}
+       self.outputs = {
+           'Y': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+       }


class TestSoftmaxGradOp(unittest.TestCase):
...
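The helper stable_softmax referenced above is defined earlier in this file, outside the hunk shown. For context, a common formulation of a numerically stable softmax is sketched below; it is given here as an assumption about what such a helper typically does, not as the file's exact definition.

# Sketch of a numerically stable softmax: shifting by the row maximum leaves
# the result unchanged but avoids overflow in exp.
import numpy as np

def stable_softmax(x):
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

row = np.array([1.0, 2.0, 3.0], dtype="float32")
print(stable_softmax(row))  # ~[0.0900, 0.2447, 0.6652], sums to 1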
python/paddle/v2/plot/tests/CMakeLists.txt
if (NOT APPLE)
  # The Mac OS X backend will not be able to function correctly if Python is
  # not installed as a framework.
-  add_python_test(test_ploter test_ploter.py)
+  py_test(test_ploter SRCS test_ploter.py)
endif()
python/paddle/v2/reader/tests/CMakeLists.txt
-add_python_test(reader_tests creator_test.py decorator_test.py)
+py_test(creator_test SRCS creator_test.py)
+py_test(decorator_test SRCS decorator_test.py)
python/paddle/v2/tests/CMakeLists.txt
-add_python_test(test_v2_api test_data_feeder.py test_op.py test_parameters.py
-    test_layer.py test_rnn_layer.py test_topology.py test_image.py)
+py_test(test_op SRCS test_op.py)
+py_test(test_image SRCS test_image.py)
+py_test(test_layer SRCS test_layer.py)
+py_test(test_topology SRCS test_topology.py)
+py_test(test_rnn_layer SRCS test_rnn_layer.py)
+py_test(test_parameters SRCS test_parameters.py)
+py_test(test_data_feeder SRCS test_data_feeder.py)
python/setup.py.in
...
@@ -14,7 +14,7 @@ packages=['paddle',
          'paddle.v2.framework.proto']

setup_requires=["requests",
-               "numpy",
+               "numpy>=1.12",
                "protobuf==3.1",
                "recordio",
                "matplotlib",
...