Commit 2447c34a authored by D dongzhihong

merge origin/develop

......@@ -17,10 +17,14 @@
- id: detect-private-key
files: (?!.*third_party)^.*$ | (?!.*book)^.*$
- id: end-of-file-fixer
- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
- repo: local
hooks:
- id: clang-formater
- id: clang-format
name: clang-format
description: Format files with ClangFormat.
entry: clang-format -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang
sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
hooks:
......
......@@ -27,13 +27,16 @@ RUN apt-get update && \
git python-pip python-dev openssh-server bison \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-numpy python-matplotlib gcc g++ \
python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format-3.8 swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools && \
apt-get clean -y
# paddle uses numpy.flip, which was introduced in numpy 1.12.0
RUN pip --no-cache-dir install 'numpy>=1.12.0'
# Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \
......
......@@ -72,7 +72,7 @@ We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
You might want to start from the this online interactive book that can run in Jupyter Notebook.
You might want to start from this online interactive book that can run in Jupyter Notebook.
- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
......
......@@ -56,11 +56,14 @@ macro(add_style_check_target TARGET_NAME)
# cpplint code style
get_filename_component(base_filename ${filename} NAME)
set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
add_custom_command(OUTPUT ${CUR_GEN} PRE_BUILD
COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
"--filter=${STYLE_FILTER}"
"--write-success=${CUR_GEN}" ${filename}
DEPENDS ${filename} ${PROJ_ROOT}/paddle/scripts/cpplint.py
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${base_filename}.cpplint DEPENDS ${CUR_GEN})
add_dependencies(${TARGET_NAME} ${base_filename}.cpplint)
endif()
endforeach()
endif()
......
......@@ -20,34 +20,30 @@ INCLUDE(ExternalProject)
SET(MKLDNN_PROJECT "extern_mkldnn")
SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn)
SET(MKLDNN_INSTALL_ROOT ${CMAKE_INSTALL_PREFIX})
IF(NOT "$ENV{HOME}" STREQUAL "/root")
SET(MKLDNN_INSTALL_ROOT "$ENV{HOME}")
ENDIF()
SET(MKLDNN_INSTALL_DIR "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
IF(WIN32)
MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet."
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with MKLDNN in Paddle yet."
"Force WITH_MKLDNN=OFF")
SET(WITH_MKLDNN OFF)
SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
return()
ELSE(WIN32)
SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
#SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
ENDIF(WIN32)
ENDIF()
SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
SET(MKLDNN_MKLROOT ${MKLML_ROOT})
SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB})
SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR})
MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
ENDIF()
ExternalProject_Add(
......@@ -57,16 +53,15 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "v0.9"
PREFIX ${MKLDNN_SOURCES_DIR}
CONFIGURE_COMMAND mkdir -p <SOURCE_DIR>/build
BUILD_COMMAND cd <SOURCE_DIR>/build
&& cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
&& $(MAKE)
INSTALL_COMMAND cd <SOURCE_DIR>/build && $(MAKE) install
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLDNN_MKLROOT}
)
ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
LIST(APPEND external_project_dependencies mkldnn)
......@@ -16,19 +16,23 @@ IF(NOT ${WITH_MKLML})
return()
ENDIF(NOT ${WITH_MKLML})
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with MKLML in Paddle yet."
"Force WITH_MKLML=OFF")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.20170425")
SET(MKLML_VER "mklml_lnx_2018.0.20170720")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "opt/paddle/third_party/mklml")
SET(MKLML_INSTALL_ROOT "${CMAKE_INSTALL_PREFIX}")
IF(NOT "$ENV{HOME}" STREQUAL "/root")
SET(MKLML_INSTALL_ROOT "$ENV{HOME}")
ENDIF()
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
......
......@@ -24,7 +24,6 @@ IF(WITH_PYTHON)
ENDIF(WITH_PYTHON)
SET(py_env "")
SET(USE_VIRTUALENV_FOR_TEST 1)
IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED)
find_python_module(numpy REQUIRED)
......
......@@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag)
if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
endif()
# TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
# Use Debug mode instead for now.
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
# cmake >= 3.0 reports compiler id "AppleClang" on Mac OS X, otherwise "Clang"
# Apple Clang is a different compiler from upstream Clang and has different version numbers.
......
......@@ -187,7 +187,13 @@ function(cc_library TARGET_NAME)
endif()
# cpplint code style
add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
foreach(source_file ${cc_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
else(cc_library_SRCS)
if (cc_library_DEPS)
......@@ -239,6 +245,14 @@ function(nv_library TARGET_NAME)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${nv_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
else(nv_library_SRCS)
if (nv_library_DEPS)
merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
......
......@@ -118,7 +118,6 @@ endfunction()
macro(add_unittest_without_exec TARGET_NAME)
add_executable(${TARGET_NAME} ${ARGN})
link_paddle_test(${TARGET_NAME})
add_style_check_target(${TARGET_NAME} ${ARGN})
endmacro()
# add_unittest
......@@ -150,9 +149,12 @@ endfunction()
# Create a python unittest using run_python_tests.sh,
# which takes care of making correct running environment
function(add_python_test TEST_NAME)
add_test(NAME ${TEST_NAME}
COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR}
bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN}
foreach(arg ${ARGN})
get_filename_component(py_fn ${arg} NAME_WE)
set(TRG_NAME ${TEST_NAME}_${py_fn})
add_test(NAME ${TRG_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
python2 ${arg}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endforeach()
endfunction()
......@@ -105,6 +105,11 @@ cross_channel_norm
.. autoclass:: paddle.v2.layer.cross_channel_norm
:noindex:
row_l2_norm
-----------
.. autoclass:: paddle.v2.layer.row_l2_norm
:noindex:
Recurrent Layers
================
......@@ -198,6 +203,10 @@ identity_projection
.. autoclass:: paddle.v2.layer.identity_projection
:noindex:
slice_projection
-------------------
.. autoclass:: paddle.v2.layer.slice_projection
:noindex:
table_projection
----------------
......@@ -316,6 +325,11 @@ scaling
.. autoclass:: paddle.v2.layer.scaling
:noindex:
clip
----
.. autoclass:: paddle.v2.layer.clip
:noindex:
slope_intercept
---------------
.. autoclass:: paddle.v2.layer.slope_intercept
......
......@@ -37,8 +37,8 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
```cpp
class Scope {
public:
Variable* CreateVariable(const std::string& name);
const Variable* GetVariable(const std::string& name) const;
Variable* NewVar(const std::string& name);
const Variable* FindVar(const std::string& name) const;
private:
std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
......@@ -58,12 +58,12 @@ class Scope {
public:
Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
Variable* GetVariable(const std::string& name) const {
Variable* FindVar(const std::string& name) const {
auto it = vars_.find(name);
if (it != vars_.end()) {
return it->second.get();
} else if (parent_ != nullptr) {
return parent_->GetVariable(name);
return parent_->FindVar(name);
} else {
return nullptr;
}
......@@ -95,10 +95,10 @@ class Scope {
static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
// return nullptr if not found.
Variable* GetVariable(const std::string& name) const;
Variable* FindVar(const std::string& name) const;
// return if already contains same name variable.
Variable* CreateVariable(const std::string& name);
Variable* NewVar(const std::string& name);
private:
std::shared_ptr<Scope> parent_;
......@@ -107,11 +107,11 @@ class Scope {
```
## Only scope can create a variable
To ensure that `only scope can create a variable`, we should make `Variable`'s constructor a private member function and make `Scope` a friend class of `Variable`; then only `CreateVariable` can construct a `Variable`.
To ensure that `only scope can create a variable`, we should make `Variable`'s constructor a private member function and make `Scope` a friend class of `Variable`; then only `NewVar` can construct a `Variable`.
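A minimal sketch of this friend-class restriction (a hand-written illustration, not the actual PaddlePaddle source; the name-conflict handling described elsewhere in this doc is elided, so this `NewVar` simply returns an existing variable):

```cpp
#include <memory>
#include <string>
#include <unordered_map>

class Variable {
 private:
  Variable() = default;  // private: only friends may construct a Variable
  friend class Scope;    // Scope::NewVar is the single creation path
};

class Scope {
 public:
  Variable* NewVar(const std::string& name) {
    auto& slot = vars_[name];
    // std::make_unique cannot reach the private constructor, so use `new`,
    // which is legal here because Scope is a friend of Variable.
    if (!slot) slot.reset(new Variable());
    return slot.get();
  }

 private:
  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
};
```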
## When scope destroyed, all variables inside this scope should be destroyed together
The scope holds unique pointers to all of its variables. A user can `GetVariable` from a scope but should not keep the returned pointer as a member variable, because when the scope is destroyed, all variables inside it are destroyed together.
The scope holds unique pointers to all of its variables. A user can `FindVar` from a scope but should not keep the returned pointer as a member variable, because when the scope is destroyed, all variables inside it are destroyed together.
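A minimal usage sketch of this ownership rule, assuming the interface proposed above:

```cpp
{
  auto scope = Scope::Create();      // the scope owns its variables
  Variable* w = scope->NewVar("w");  // `w` is borrowed from `scope`
  // ... use `w` only while `scope` is alive ...
}  // `scope` destroyed here: every Variable it owns is freed, so any
   // cached copy of `w` would now be a dangling pointer.
```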
## Sharing a parent scope
......@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar
## Orthogonal interface
`GetVariable` returns `nullptr` when `name` is not found, so it can also serve as a `Contains` method. `CreateVariable` returns an `Error` when there is a local name conflict. Combining `GetVariable` and `CreateVariable`, we can implement `CreateOrGetVariable` easily.
`FindVar` returns `nullptr` when `name` is not found, so it can also serve as a `Contains` method. `NewVar` returns an `Error` when there is a local name conflict. Combining `FindVar` and `NewVar`, we can implement a find-or-create variant easily.
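A sketch of that combination; the helper name `FindOrNewVar` is hypothetical and not part of the proposed interface:

```cpp
// Hypothetical helper built from the two orthogonal primitives.
Variable* FindOrNewVar(Scope* scope, const std::string& name) {
  if (Variable* v = scope->FindVar(name)) {
    return v;  // found locally or in an ancestor scope
  }
  return scope->NewVar(name);  // otherwise create it in the local scope
}
```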
......@@ -3,24 +3,11 @@ import paddle.v2.dataset.uci_housing as uci_housing
import paddle.v2.master as master
import os
import cPickle as pickle
from paddle.v2.reader.creator import cloud_reader
etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
etcd_endpoint = "http://" + etcd_ip + ":2379"
print "connecting to master, etcd endpoints: ", etcd_endpoint
master_client = master.client(etcd_endpoint, 5, 64)
def cloud_reader():
global master_client
master_client.set_dataset(
["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*"], passes=30)
while 1:
r, e = master_client.next_record()
if not r:
if e != -2: # other errors
print "get record error:", e
break
yield pickle.loads(r)
etcd_endpoints = "http://" + etcd_ip + ":2379"
print "etcd endpoints: ", etcd_endpoints
def main():
......@@ -49,7 +36,7 @@ def main():
parameters=parameters,
update_equation=optimizer,
is_local=False,
pserver_spec=etcd_endpoint,
pserver_spec=etcd_endpoints,
use_etcd=True)
# event_handler to print training and testing info
......@@ -75,7 +62,11 @@ def main():
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
cloud_reader, buf_size=500), batch_size=2),
cloud_reader(
["/pfs/dlnel/public/dataset/uci_housing/uci_housing*"],
etcd_endpoints),
buf_size=500),
batch_size=2),
feeding={'x': 0,
'y': 1},
event_handler=event_handler,
......
......@@ -21,22 +21,15 @@
#
# It is the same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
#
if ! python -c "import paddle" >/dev/null 2>/dev/null; then
PYPATH=""
set -x
while getopts "d:" opt; do
PYPATH=""
set -x
while getopts "d:" opt; do
case $opt in
d)
PYPATH=$OPTARG
;;
esac
done
shift $(($OPTIND - 1))
export PYTHONPATH=$PYPATH:$PYTHONPATH
$@
else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
exit 1
fi
done
shift $(($OPTIND - 1))
export PYTHONPATH=$PYPATH:$PYTHONPATH
$@
......@@ -15,7 +15,6 @@ if(Boost_FOUND)
add_subdirectory(platform)
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
endif()
if(WITH_C_API)
......
......@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_batch_transpose.h"
#include "hl_base.h"
#include "hl_batch_transpose.h"
const int TILE_DIM = 64;
const int BLOCK_ROWS = 16;
// No bank-conflict transpose for a batch of data.
__global__ void batchTransposeNoBankConflicts(real* odata,
const real* idata,
int numSamples, int width,
int height) {
__global__ void batchTransposeNoBankConflicts(
real* odata, const real* idata, int numSamples, int width, int height) {
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
const int x = blockIdx.x * TILE_DIM + threadIdx.x;
......@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
newX] = tile[threadIdx.x][j];
}
void batchTranspose(const real* input, real* output, int width, int height,
int batchSize) {
void batchTranspose(
const real* input, real* output, int width, int height, int batchSize) {
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
(output, input, batchSize, width, height);
batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
output, input, batchSize, width, height);
CHECK_SYNC("batchTranspose failed!");
}
......@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_aggregate.h"
#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_cuda.ph"
#include "hl_aggregate.h"
#include "hl_thread.ph"
#include "hl_matrix_base.cuh"
#include "hl_thread.ph"
#include "paddle/utils/Logging.h"
/**
* @brief matrix row operator.
*/
template<class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg,
real *E,
real *Sum,
int dimN) {
template <class Agg, int blockSize>
__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
__shared__ real sum_s[blockSize];
int cnt = (dimN + blockSize -1) / blockSize;
int rowId = blockIdx.x + blockIdx.y*gridDim.x;
int index = rowId*dimN;
int cnt = (dimN + blockSize - 1) / blockSize;
int rowId = blockIdx.x + blockIdx.y * gridDim.x;
int index = rowId * dimN;
int tid = threadIdx.x;
int lmt = tid;
......@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
sum_s[tid] = tmp;
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) {
sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
}
......@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
}
template <class Agg>
void hl_matrix_row_op(Agg agg,
real *A_d,
real *C_d,
int dimM,
int dimN) {
void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
int blocksX = dimM;
int blocksY = 1;
dim3 threads(128, 1);
dim3 grid(blocksX, blocksY);
KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
(agg, A_d, C_d, dimN);
KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
agg, A_d, C_d, dimN);
}
void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::sum(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_row_sum failed");
}
......@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::max(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_row_max failed");
}
......@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_row_op(aggregate::min(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_row_min failed");
}
/**
* @brief matrix column operator.
*/
template<class Agg>
__global__ void KeMatrixColumnOp(Agg agg,
real *E,
real *Sum,
int dimM,
int dimN) {
template <class Agg>
__global__ void KeMatrixColumnOp(
Agg agg, real *E, real *Sum, int dimM, int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
real tmp = agg.init();
if (rowIdx < dimN) {
......@@ -127,13 +104,10 @@ __global__ void KeMatrixColumnOp(Agg agg,
}
}
template<class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(Agg agg,
real *E,
real *Sum,
int dimM,
int dimN) {
__shared__ real _sum[blockDimX*blockDimY];
template <class Agg, int blockDimX, int blockDimY>
__global__ void KeMatrixColumnOp_S(
Agg agg, real *E, real *Sum, int dimM, int dimN) {
__shared__ real _sum[blockDimX * blockDimY];
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int index = threadIdx.y;
......@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
index += blockDimY;
}
}
_sum[threadIdx.x + threadIdx.y*blockDimX] = tmp;
_sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
__syncthreads();
if (rowIdx < dimN) {
if (threadIdx.y ==0) {
if (threadIdx.y == 0) {
real tmp = agg.init();
for (int i=0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]);
for (int i = 0; i < blockDimY; i++) {
tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
}
Sum[rowIdx] = tmp;
}
......@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
}
template <class Agg>
void hl_matrix_column_op(Agg agg,
real *A_d,
real *C_d,
int dimM,
int dimN) {
void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
if (dimN >= 8192) {
int blocksX = (dimN + 128 -1) / 128;
int blocksX = (dimN + 128 - 1) / 128;
int blocksY = 1;
dim3 threads(128, 1);
dim3 grid(blocksX, blocksY);
KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>>
(agg, A_d, C_d, dimM, dimN);
KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
agg, A_d, C_d, dimM, dimN);
} else {
int blocksX = (dimN + 32 -1) / 32;
int blocksX = (dimN + 32 - 1) / 32;
int blocksY = 1;
dim3 threads(32, 32);
dim3 grid(blocksX, blocksY);
KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>>
(agg, A_d, C_d, dimM, dimN);
KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
agg, A_d, C_d, dimM, dimN);
}
return;
......@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::sum(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_column_sum failed");
}
......@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::max(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_column_max failed");
}
......@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
hl_matrix_column_op(aggregate::min(),
A_d,
C_d,
dimM,
dimN);
hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
CHECK_SYNC("hl_matrix_column_min failed");
}
......@@ -226,16 +184,16 @@ template <int blockSize>
__global__ void KeVectorSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize];
int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x;
int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f;
while (index < dimM) {
sum_s[tid] += E[index];
index += blockDim.x*gridDim.y;
index += blockDim.x * gridDim.y;
}
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) {
sum_s[tid] += sum_s[tid + stride];
}
......@@ -261,36 +219,37 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {}
while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
(A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
(t_resource.gpu_mem, t_resource.cpu_mem, 128);
KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
A_d, t_resource.gpu_mem, dimM);
KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err)
<< "CUDA error: " << hl_get_device_error_string((size_t)err);
CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< hl_get_device_error_string((size_t)err);
}
template <int blockSize>
__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
__shared__ double sum_s[blockSize];
int tid = threadIdx.x;
int index = blockIdx.y*blockDim.x+threadIdx.x;
int index = blockIdx.y * blockDim.x + threadIdx.x;
sum_s[tid] = 0.0f;
while (index < dimM) {
sum_s[tid] += abs(E[index]);
index += blockDim.x*gridDim.y;
index += blockDim.x * gridDim.y;
}
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
if (tid < stride) {
sum_s[tid] += sum_s[tid + stride];
}
......@@ -316,18 +275,19 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
hl_event_t hl_event = &hl_event_st;
while (!hl_cuda_event_is_ready(hl_event)) {}
while (!hl_cuda_event_is_ready(hl_event)) {
}
KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
(A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
(t_resource.gpu_mem, t_resource.cpu_mem, 128);
KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
A_d, t_resource.gpu_mem, dimM);
KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
t_resource.gpu_mem, t_resource.cpu_mem, 128);
hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
hl_stream_synchronize(HPPL_STREAM_DEFAULT);
cudaError_t err = (cudaError_t)hl_get_device_last_error();
CHECK_EQ(cudaSuccess, err)
<< "CUDA error: " << hl_get_device_error_string((size_t)err);
CHECK_EQ(cudaSuccess, err) << "CUDA error: "
<< hl_get_device_error_string((size_t)err);
}
This diff has been collapsed.
......@@ -1022,6 +1022,15 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
real alpha = 1.0f;
real beta = 1.0f;
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
int batch_size = ((cudnn_tensor_descriptor)inputDesc)->batch_size;
if (batch_size > 1024 && g_cudnn_lib_version < 6000) {
LOG(INFO) << " To process current batch data with size " << batch_size
<< " (>1024), cudnnBatchNorm requires cuDNN version >= 6000."
<< " If there is an error complaining CUDNN_STATUS_NOT_SUPPORTED,"
<< " just recompile PaddlePaddle with cuDNN >= 6000, replacing"
<< " current version " << g_cudnn_lib_version;
}
CHECK_CUDNN(
dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
mode,
......
This diff has been collapsed.
......@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
#include "hl_matrix.h"
#include "hl_matrix_ops.cuh"
#include "hl_matrix_apply.cuh"
#include "hl_matrix_ops.cuh"
#include "hl_sequence.h"
#include "hl_sparse.ph"
#include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
void hl_matrix_add(real *A_d,
real *B_d,
real *C_d,
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
void hl_matrix_add(real* A_d,
real* B_d,
real* C_d,
int dimM,
int dimN,
real alpha,
......@@ -36,8 +35,8 @@ void hl_matrix_add(real *A_d,
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
hl_gpu_apply_ternary_op
<real, ternary::_add<real>, 0, 0>(ternary::_add<real>(alpha, beta),
hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
ternary::_add<real>(alpha, beta),
A_d,
B_d,
C_d,
......@@ -50,12 +49,11 @@ void hl_matrix_add(real *A_d,
}
#ifdef PADDLE_TYPE_DOUBLE
#define THRESHOLD 128
#define THRESHOLD 128
#else
#define THRESHOLD 64
#define THRESHOLD 64
#endif
__device__ __forceinline__
void findMax(real* I,
__device__ __forceinline__ void findMax(real* I,
real* dfMax_s,
int blockSize,
int base,
......@@ -89,8 +87,7 @@ void findMax(real* I,
__syncthreads();
}
__device__ __forceinline__
void subMaxAndExp(real* I,
__device__ __forceinline__ void subMaxAndExp(real* I,
real* O,
int curIdx,
int nextIdx,
......@@ -115,8 +112,7 @@ void subMaxAndExp(real* I,
__syncthreads();
}
__device__ __forceinline__
void valueSum(real* O,
__device__ __forceinline__ void valueSum(real* O,
real* dfMax_s,
int blockSize,
int base,
......@@ -141,13 +137,8 @@ void valueSum(real* O,
__syncthreads();
}
__device__ __forceinline__
void divSum(real* O,
real sum,
int curIdx,
int nextIdx,
int blockSize,
int dimN) {
__device__ __forceinline__ void divSum(
real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
while (curIdx < dimN) {
O[nextIdx] /= sum;
nextIdx += blockSize;
......@@ -155,8 +146,7 @@ void divSum(real* O,
}
}
__device__ __forceinline__
void softmax(real* I,
__device__ __forceinline__ void softmax(real* I,
real* O,
real* dfMax_s,
int blockSize,
......@@ -167,8 +157,7 @@ void softmax(real* I,
__shared__ real max;
// find the max number
findMax(I, dfMax_s, blockSize, base, curIdx,
nextIdx, dimN, &max);
findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
// sub max Value and do Exp operation
subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
......@@ -181,8 +170,8 @@ void softmax(real* I,
divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
}
template<int blockSize>
__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
template <int blockSize>
__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
int base = threadIdx.x;
__shared__ real dfMax_s[blockSize];
int nextIdx = blockIdx.x * dimN + base;
......@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}
void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {
void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
dim3 block(512, 1);
dim3 grid(dimM, 1);
KeMatrixSoftMax<512>
<<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
CHECK_SYNC("hl_matrix_softmax failed");
}
template<int blockSize>
__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
template <int blockSize>
__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
int base = threadIdx.x;
int bid = blockIdx.x;
__shared__ real dfMax_s[blockSize];
......@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
}
void hl_sequence_softmax_forward(real *A_d,
real *C_d,
void hl_sequence_softmax_forward(real* A_d,
real* C_d,
const int* index,
int numSequence) {
CHECK_NOTNULL(A_d);
......@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
dim3 block(512, 1);
dim3 grid(numSequence, 1);
KeSequenceSoftMax<512>
<<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
CHECK_SYNC("hl_sequence_softmax_forward failed");
}
__global__ void KeMatrixDerivative(real *grad_d,
real *output_d,
real *sftmaxSum_d,
int dimM,
int dimN) {
int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
__global__ void KeMatrixDerivative(
real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
int index;
if (rowIdx < dimM && colIdx < dimN) {
index = rowIdx*dimN + colIdx;
index = rowIdx * dimN + colIdx;
grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
}
}
void hl_matrix_softmax_derivative(real *grad_d,
real *output_d,
real *sftmaxSum_d,
int dimM,
int dimN) {
void hl_matrix_softmax_derivative(
real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
CHECK_NOTNULL(grad_d);
CHECK_NOTNULL(output_d);
CHECK_NOTNULL(sftmaxSum_d);
int blocksX = (dimM + 0) / 1;
int blocksY = (dimN + 1024 -1) / 1024;
int blocksY = (dimN + 1024 - 1) / 1024;
dim3 threads(1, 1024);
dim3 grid(blocksX, blocksY);
KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>>
(grad_d, output_d, sftmaxSum_d, dimM, dimN);
KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
grad_d, output_d, sftmaxSum_d, dimM, dimN);
CHECK_SYNC("hl_matrix_softmax_derivative failed");
}
__global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
real* entropy,
int* row,
int* col,
int dimM,
int dimN) {
__global__ void KeMatrixMultiBinaryCrossEntropy(
real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < dimM) {
for (int i = 0; i < dimN; i ++) {
for (int i = 0; i < dimN; i++) {
entropy[index] -= log(1 - output[index * dimN + i]);
}
int *row_col = col + row[index];
int* row_col = col + row[index];
int col_num = row[index + 1] - row[index];
for (int i = 0; i < col_num; i ++) {
for (int i = 0; i < col_num; i++) {
real o = output[index * dimN + row_col[i]];
entropy[index] -= log(o / (1 - o));
}
......@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
dim3 threads(n_threads);
dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
(output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
}
__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output,
real* grad,
int* row,
int* col,
int dimM,
int dimN) {
__global__ void KeMatrixMultiBinaryCrossEntropyBp(
real* output, real* grad, int* row, int* col, int dimM, int dimN) {
int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (row_idx < dimM) {
for (int i = 0; i < dimN; i ++) {
for (int i = 0; i < dimN; i++) {
int index = row_idx * dimN + i;
grad[index] += 1.0 / (1 - output[index]);
}
int col_num = row[row_idx + 1] - row[row_idx];
int *row_col = col + row[row_idx];
for (int i = 0; i < col_num; i ++) {
int* row_col = col + row[row_idx];
for (int i = 0; i < col_num; i++) {
int index = row_idx * dimN + row_col[i];
grad[index] -= 1.0 / (output[index] * (1 - output[index]));
}
}
}
void hl_matrix_multi_binary_cross_entropy_bp(real* output,
real* grad,
hl_sparse_matrix_s csr_mat,
int dimM,
int dimN) {
void hl_matrix_multi_binary_cross_entropy_bp(
real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
CHECK_NOTNULL(output);
CHECK_NOTNULL(grad);
CHECK_NOTNULL(csr_mat);
......@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
dim3 threads(n_threads);
dim3 grid(blocks);
hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
(output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
}
__global__ void KeMatrixCrossEntropy(real* O,
real* E,
int* label,
int dimM,
int dimN) {
__global__ void KeMatrixCrossEntropy(
real* O, real* E, int* label, int dimM, int dimN) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int newBase;
if (index < dimM) {
......@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
}
}
void hl_matrix_cross_entropy(real* A_d,
real* C_d,
int* label_d,
int dimM,
int dimN) {
void hl_matrix_cross_entropy(
real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
int blocks = (dimM + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
(A_d, C_d, label_d, dimM, dimN);
KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
A_d, C_d, label_d, dimM, dimN);
CHECK_SYNC("hl_matrix_cross_entropy failed");
}
__global__ void KeMatrixCrossEntropyBp(real* grad_d,
real* output_d,
int* label_d,
int dimM,
int dimN) {
int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
__global__ void KeMatrixCrossEntropyBp(
real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
int index;
if (rowIdx < dimM && colIdx < dimN) {
index = rowIdx*dimN + colIdx;
index = rowIdx * dimN + colIdx;
if (label_d[rowIdx] == colIdx) {
grad_d[index] -= 1.0f / output_d[index];
}
}
}
void hl_matrix_cross_entropy_bp(real* grad_d,
real* output_d,
int* label_d,
int dimM,
int dimN) {
void hl_matrix_cross_entropy_bp(
real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
CHECK_NOTNULL(grad_d);
CHECK_NOTNULL(output_d);
CHECK_NOTNULL(label_d);
int blocksX = (dimM + 0)/1;
int blocksY = (dimN + 1024 -1) / 1024;
int blocksX = (dimM + 0) / 1;
int blocksY = (dimN + 1024 - 1) / 1024;
dim3 threads(1, 1024);
dim3 grid(blocksX, blocksY);
KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
(grad_d, output_d, label_d, dimM, dimN);
KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
grad_d, output_d, label_d, dimM, dimN);
CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
}
void hl_matrix_zero_mem(real* data, int num) {
hl_gpu_apply_unary_op(
unary::Zero<real>(), data, 1, num, num);
hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
}
__global__ void KeParamReluForward(real* output,
......@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
int ty = blockIdx.y * blockDim.y + threadIdx.y;
if (tx < width && ty < height) {
int index = ty * width + tx;
output[index] = input[index] > 0 ? input[index] :
input[index] * w[tx / partial_sum];
output[index] =
input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
}
}
......@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
CHECK_NOTNULL(w);
dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16;
int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY);
KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>
(output, input, w, width, height, partial_sum);
KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
output, input, w, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_forward failed");
}
template<int blockSize>
template <int blockSize>
__global__ void KeParamReluBackWardW(real* grad_w,
real* grad_o,
real* input,
......@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
int grid_num = width / partial_sum;
dim3 threads(blockSize, 1);
dim3 grid(grid_num, 1);
KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
(grad_w, grad_o, input, width, height, partial_sum);
KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
grad_w, grad_o, input, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_w failed");
}
......@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
CHECK_NOTNULL(diff);
dim3 threads(16, 16);
int blockX = (width + 16 - 1) / 16;
int blockY = (height + 16 -1) / 16;
int blockY = (height + 16 - 1) / 16;
dim3 grid(blockX, blockY);
KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>
(grad_o, data, w, diff, width, height, partial_sum);
KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
grad_o, data, w, diff, width, height, partial_sum);
CHECK_SYNC("hl_param_relu_backward_diff failed");
}
__global__ void KeMatrixAddSharedBias(real* A,
real* B,
const int channel,
const int M,
const int N,
real scale) {
__global__ void KeMatrixAddSharedBias(
real* A, real* B, const int channel, const int M, const int N, real scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int dim = N / channel;
if (index < M * N) {
......@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
real scale) {
const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
(A_d, B_d, channel, dimM, dimN, scale);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed");
}
template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B,
real *A,
__global__ void KeMatrixCollectSharedBias(real* B,
real* A,
const int channel,
const int M,
const int N,
......@@ -611,14 +563,13 @@ void hl_matrix_collect_shared_bias(real* B_d,
const int limit = 64;
int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
KeMatrixCollectSharedBias<blocks>
<<< grids, blocks, 0, STREAM_DEFAULT>>>
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
__global__ void keMatrixRotate(real* mat, real* matRot,
int dimM, int dimN, bool clockWise) {
__global__ void keMatrixRotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < dimM * dimN) {
int i = idx / dimN;
......@@ -631,13 +582,13 @@ __global__ void keMatrixRotate(real* mat, real* matRot,
}
}
void hl_matrix_rotate(real *mat, real* matRot,
int dimM, int dimN, bool clockWise) {
void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
CHECK_NOTNULL(mat);
CHECK_NOTNULL(matRot);
const int threads = 512;
const int blocks = DIVUP(dimM * dimN, threads);
keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>
(mat, matRot, dimM, dimN, clockWise);
keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
mat, matRot, dimM, dimN, clockWise);
CHECK_SYNC("hl_matrix_rotate failed");
}
......@@ -16,36 +16,36 @@ limitations under the License. */
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
__global__ void KeMaxSequenceForward(real *input,
const int *sequence,
__global__ void KeMaxSequenceForward(real* input,
const int* sequence,
real* output,
int *index,
int* index,
int numSequences,
int dim) {
int dimIdx = threadIdx.x;
int sequenceId = blockIdx.x;
if (sequenceId >= numSequences) return;
int start = sequence[sequenceId];
int end = sequence[sequenceId+1];
int end = sequence[sequenceId + 1];
for (int i = dimIdx; i < dim; i += blockDim.x) {
real tmp = -HL_FLOAT_MAX;
int tmpId = -1;
for (int insId = start; insId < end; insId++) {
if (tmp < input[insId*dim + i]) {
tmp = input[insId*dim + i];
if (tmp < input[insId * dim + i]) {
tmp = input[insId * dim + i];
tmpId = insId;
}
}
output[sequenceId*dim + i] = tmp;
index[sequenceId*dim + i] = tmpId;
output[sequenceId * dim + i] = tmp;
index[sequenceId * dim + i] = tmpId;
}
}
void hl_max_sequence_forward(real* input,
const int* sequence,
real* output,
int *index,
int* index,
int numSequences,
int dim) {
CHECK_NOTNULL(input);
......@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
dim3 threads(256, 1);
dim3 grid(numSequences, 1);
KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, output, index, numSequences, dim);
KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
input, sequence, output, index, numSequences, dim);
CHECK_SYNC("hl_max_sequence_forward failed");
}
__global__ void KeMaxSequenceBackward(real *outputGrad,
int *index,
real* inputGrad,
int numSequences,
int dim) {
__global__ void KeMaxSequenceBackward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int colIdx = idx % dim;
if (idx < numSequences*dim) {
if (idx < numSequences * dim) {
int insId = index[idx];
inputGrad[insId * dim + colIdx] += outputGrad[idx];
}
}
void hl_max_sequence_backward(real* outputGrad,
int *index,
real* inputGrad,
int numSequences,
int dim) {
void hl_max_sequence_backward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(index);
CHECK_NOTNULL(inputGrad);
......@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
dim3 threads(128, 1);
dim3 grid(blocks, 1);
KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>>
(outputGrad, index, inputGrad, numSequences, dim);
KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
outputGrad, index, inputGrad, numSequences, dim);
CHECK_SYNC("hl_max_sequence_backward failed");
}
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
real* table,
int* ids,
......@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
while (sampleId < numSamples) {
int tableId = ids[sampleId];
if ((0 <= tableId) && (tableId < tableSize)) {
real *outputData = output + sampleId * dim;
real *tableData = table + tableId * dim;
real* outputData = output + sampleId * dim;
real* tableData = table + tableId * dim;
for (int i = idx; i < dim; i += blockDimX) {
if (AddRow == 0) {
outputData[i] += tableData[i];
......@@ -114,15 +108,18 @@ __global__ void KeMatrixAddRows(real* output,
}
}
}
sampleId += blockDimY*gridDimX;
sampleId += blockDimY * gridDimX;
}
}
template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
__global__
void KeSequence2Batch(real *batch,
real *sequence,
const int *batchIndex,
template <int blockDimX,
int blockDimY,
int gridDimX,
bool seq2batch,
bool isAdd>
__global__ void KeSequence2Batch(real* batch,
real* sequence,
const int* batchIndex,
int seqWidth,
int batchCount) {
int idx = threadIdx.x;
......@@ -130,8 +127,8 @@ void KeSequence2Batch(real *batch,
int id = blockIdx.x + idy * gridDimX;
while (id < batchCount) {
int seqId = batchIndex[id];
real* batchData = batch + id*seqWidth;
real* seqData = sequence + seqId*seqWidth;
real* batchData = batch + id * seqWidth;
real* seqData = sequence + seqId * seqWidth;
for (int i = idx; i < seqWidth; i += blockDimX) {
if (seq2batch) {
if (isAdd) {
......@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
}
}
}
id += blockDimY*gridDimX;
id += blockDimY * gridDimX;
}
}
void hl_sequence2batch_copy(real *batch,
real *sequence,
const int *batchIndex,
void hl_sequence2batch_copy(real* batch,
real* sequence,
const int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {
......@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
dim3 threads(128, 8);
dim3 grid(8, 1);
if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
} else {
KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
}
CHECK_SYNC("hl_sequence2batch_copy failed");
}
void hl_sequence2batch_add(real *batch,
real *sequence,
int *batchIndex,
void hl_sequence2batch_add(real* batch,
real* sequence,
int* batchIndex,
int seqWidth,
int batchCount,
bool seq2batch) {
......@@ -186,18 +183,17 @@ void hl_sequence2batch_add(real *batch,
dim3 threads(128, 8);
dim3 grid(8, 1);
if (seq2batch) {
KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
} else {
KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
(batch, sequence, batchIndex, seqWidth, batchCount);
KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch, sequence, batchIndex, seqWidth, batchCount);
}
CHECK_SYNC("hl_sequence2batch_add failed");
}
template<bool normByTimes, bool seq2batch>
__global__
void KeSequence2BatchPadding(real* batch,
template <bool normByTimes, bool seq2batch>
__global__ void KeSequence2BatchPadding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
......@@ -269,45 +265,56 @@ void hl_sequence2batch_copy_padding(real* batch,
int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
dim3 threads(blockDimX, blockDimY);
int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
CUDA_BLOCK_SIZE;
int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY;
int gridDimY = numSequences;
dim3 grid(gridDimX, gridDimY);
if (seq2batch) {
/* sequence -> batch */
if (normByTimes) {
KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else {
KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
}
} else {
/* batch -> sequence */
if (normByTimes) {
KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
} else {
KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
batch,
sequence,
sequenceStartPositions,
sequenceWidth,
maxSequenceLength,
numSequences);
}
}
CHECK_SYNC("hl_sequence2batch_copy_padding failed");
}
__device__ inline float my_rsqrt(float x) {
return rsqrtf(x);
}
__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
__device__ inline double my_rsqrt(double x) {
return rsqrt(x);
}
__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
__global__ void KeSequenceAvgForward(real* dst,
real* src,
......@@ -328,8 +335,8 @@ __global__ void KeSequenceAvgForward(real* dst,
for (int i = start; i < end; i++) {
sum += src[i * width + col];
}
sum = mode == 1 ? sum :
(mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
: sum * my_rsqrt((real)seqLength));
dst[gid] += sum;
}
}
......@@ -350,8 +357,8 @@ void hl_sequence_avg_forward(real* dst,
CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_forward!";
KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>>
(dst, src, starts, height, width, mode);
KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_forward failed");
}
......@@ -371,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
int seqLength = end - start;
if (seqLength == 0) return;
real grad = src[gid];
grad = mode == 1 ? grad :
(mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
: grad * my_rsqrt((real)seqLength));
for (int i = start; i < end; i++) {
dst[i * width + col] += grad;
}
......@@ -395,7 +402,7 @@ void hl_sequence_avg_backward(real* dst,
CHECK(mode == 0 || mode == 1 || mode == 2)
<< "mode error in hl_sequence_avg_backward!";
KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
(dst, src, starts, height, width, mode);
KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
dst, src, starts, height, width, mode);
CHECK_SYNC("hl_sequence_avg_backward failed");
}
This diff has been collapsed.
......@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <stdlib.h>
#include "hl_cuda.h"
#include "hl_time.h"
#include <cmath>
#include "hl_base.h"
#include "hl_cuda.h"
#include "hl_perturbation_util.cuh"
#include "hl_time.h"
#define _USE_MATH_DEFINES
......@@ -30,10 +29,16 @@ limitations under the License. */
* centerX, centerY: translation.
* sourceX, sourceY: output coordinates in the original image.
*/
__device__ void getTranformCoord(int x, int y, real theta, real scale,
real tgtCenter, real imgCenter,
real centerR, real centerC,
int* sourceX, int* sourceY) {
__device__ void getTranformCoord(int x,
int y,
real theta,
real scale,
real tgtCenter,
real imgCenter,
real centerR,
real centerC,
int* sourceX,
int* sourceY) {
real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
// compute coornidates in the rotated and scaled image
......@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
* created by Wei Xu (genome), converted by Jiang Wang
*/
__global__ void kSamplingPatches(const real* imgs, real* targets,
int imgSize, int tgtSize, const int channels,
int samplingRate, const real* thetas,
const real* scales, const int* centerRs,
const int* centerCs, const real padValue,
__global__ void kSamplingPatches(const real* imgs,
real* targets,
int imgSize,
int tgtSize,
const int channels,
int samplingRate,
const real* thetas,
const real* scales,
const int* centerRs,
const int* centerCs,
const real padValue,
const int numImages) {
const int caseIdx = blockIdx.x * 4 + threadIdx.x;
const int pxIdx = blockIdx.y * 128 + threadIdx.y;
......@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
const int pxY = pxIdx / tgtSize;
int srcPxX, srcPxY;
getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter,
imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX,
getTranformCoord(pxX,
pxY,
thetas[imgIdx],
scales[imgIdx],
tgtCenter,
imgCenter,
centerCs[caseIdx],
centerRs[caseIdx],
&srcPxX,
&srcPxY);
imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
......@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
*
* created by Wei Xu
*/
void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
int*& gpuCenterR, int*& gpuCenterC,
int numImages, int imgSize, real rotateAngle,
real scaleRatio, int samplingRate,
void hl_generate_disturb_params(real*& gpuAngle,
real*& gpuScaleRatio,
int*& gpuCenterR,
int*& gpuCenterC,
int numImages,
int imgSize,
real rotateAngle,
real scaleRatio,
int samplingRate,
bool isTrain) {
// The number of output samples.
int numPatches = numImages * samplingRate;
......@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
for (int i = 0; i < numImages; i++) {
r_angle[i] =
(rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0) // NOLINT
- 0.5);
-
0.5);
s_ratio[i] =
1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio; // NOLINT
}
......@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
int pxY =
(int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0)); // NOLINT
const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]),
sin(-r_angle[i]), cos(-r_angle[i])};
const real H[4] = {cos(-r_angle[i]),
-sin(-r_angle[i]),
sin(-r_angle[i]),
cos(-r_angle[i])};
real x = pxX - imgCenter;
real y = pxY - imgCenter;
real xx = H[0] * x + H[1] * y;
......@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
delete[] center_c;
}
void hl_conv_random_disturb_with_params(const real* images, int imgSize,
int tgtSize, int channels,
int numImages, int samplingRate,
void hl_conv_random_disturb_with_params(const real* images,
int imgSize,
int tgtSize,
int channels,
int numImages,
int samplingRate,
const real* gpuRotationAngle,
const real* gpuScaleRatio,
const int* gpuCenterR,
......@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
dim3 threadsPerBlock(4, 128);
dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
kSamplingPatches <<<numBlocks, threadsPerBlock>>>
(images, target, imgSize, tgtSize, channels, samplingRate,
gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC,
paddingValue, numImages);
kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
target,
imgSize,
tgtSize,
channels,
samplingRate,
gpuRotationAngle,
gpuScaleRatio,
gpuCenterR,
gpuCenterC,
paddingValue,
numImages);
hl_device_synchronize();
}
void hl_conv_random_disturb(const real* images, int imgSize,
int tgtSize, int channels, int numImages,
real scaleRatio, real rotateAngle,
int samplingRate, real* gpu_r_angle,
real* gpu_s_ratio, int* gpu_center_r,
int* gpu_center_c, int paddingValue,
bool isTrain, real* targets) {
void hl_conv_random_disturb(const real* images,
int imgSize,
int tgtSize,
int channels,
int numImages,
real scaleRatio,
real rotateAngle,
int samplingRate,
real* gpu_r_angle,
real* gpu_s_ratio,
int* gpu_center_r,
int* gpu_center_c,
int paddingValue,
bool isTrain,
real* targets) {
// generate the random disturbance sequence and the sampling locations
hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r,
gpu_center_c, numImages, imgSize, rotateAngle,
scaleRatio, samplingRate, isTrain);
hl_conv_random_disturb_with_params(
images, imgSize, tgtSize, channels, numImages,
samplingRate, gpu_r_angle, gpu_s_ratio,
gpu_center_r, gpu_center_r, paddingValue,
hl_generate_disturb_params(gpu_r_angle,
gpu_s_ratio,
gpu_center_r,
gpu_center_c,
numImages,
imgSize,
rotateAngle,
scaleRatio,
samplingRate,
isTrain);
hl_conv_random_disturb_with_params(images,
imgSize,
tgtSize,
channels,
numImages,
samplingRate,
gpu_r_angle,
gpu_s_ratio,
gpu_center_r,
gpu_center_c,
paddingValue,
targets);
}
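A minimal host-side sketch of driving the wrapper above. The sizes and the plain cudaMalloc calls are assumptions for illustration (not Paddle's own allocators); the angle/scale buffers hold one entry per image and the center buffers one entry per output patch, matching the kernel's thetas[imgIdx] and centerRs[caseIdx] indexing. real is Paddle's float/double typedef.

// Sketch only: buffer sizes are assumptions derived from the kernel indexing.
int numImages = 32, imgSize = 64, tgtSize = 32, channels = 3;
int samplingRate = 4;
int numPatches = numImages * samplingRate;
real *d_images, *d_targets, *d_angle, *d_ratio;
int *d_centerR, *d_centerC;
cudaMalloc(&d_images, sizeof(real) * numImages * imgSize * imgSize * channels);
cudaMalloc(&d_targets, sizeof(real) * numPatches * tgtSize * tgtSize * channels);
cudaMalloc(&d_angle, sizeof(real) * numImages);
cudaMalloc(&d_ratio, sizeof(real) * numImages);
cudaMalloc(&d_centerR, sizeof(int) * numPatches);
cudaMalloc(&d_centerC, sizeof(int) * numPatches);
// ... upload input images to d_images ...
hl_conv_random_disturb(d_images, imgSize, tgtSize, channels, numImages,
                       /*scaleRatio=*/0.1, /*rotateAngle=*/15.0, samplingRate,
                       d_angle, d_ratio, d_centerR, d_centerC,
                       /*paddingValue=*/0, /*isTrain=*/true, d_targets);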
......@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_device_functions.cuh"
#include "hl_cuda.h"
#include "hl_device_functions.cuh"
#include "paddle/utils/Logging.h"
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output, int ldo,
real* table, int ldt,
template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
int ldo,
real* table,
int ldt,
int* ids,
int numSamples,
int tableSize,
......@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
while (idy < numSamples) {
int tableId = ids[idy];
if ((0 <= tableId) && (tableId < tableSize)) {
real *out = output + idy * ldo;
real *tab = table + tableId * ldt;
real* out = output + idy * ldo;
real* tab = table + tableId * ldt;
for (int i = idx; i < dim; i += blockDimX) {
if (AddRow) {
paddle::paddleAtomicAdd(&tab[i], out[i]);
......@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
}
}
void hl_matrix_select_rows(real* output, int ldo,
real* table, int ldt,
void hl_matrix_select_rows(real* output,
int ldo,
real* table,
int ldt,
int* ids,
int numSamples,
int tableSize,
......@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
dim3 threads(128, 8);
dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
(output, ldo, table, ldt, ids, numSamples, tableSize, dim);
KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
output, ldo, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_select_rows failed");
}
void hl_matrix_add_to_rows(real* table, int ldt,
real* input, int ldi,
void hl_matrix_add_to_rows(real* table,
int ldt,
real* input,
int ldi,
int* ids,
int numSamples,
int tableSize,
......@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
dim3 threads(128, 8);
dim3 grid(8, 1);
KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, ldi, table, ldt, ids, numSamples, tableSize, dim);
KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
input, ldi, table, ldt, ids, numSamples, tableSize, dim);
CHECK_SYNC("hl_matrix_add_to_rows failed");
}
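The AddRow template flag above selects between a row gather (copy table rows into the output) and an atomic scatter-add (accumulate rows back into the table via paddleAtomicAdd), the usual forward/backward pair for a table lookup. A hedged sketch follows, assuming row-major device buffers whose leading dimension equals dim, and that the trailing parameter of both entry points is dim, as in the kernel:

// Sketch only: allocation and sizes are assumptions for illustration.
int numSamples = 1024, tableSize = 50000, dim = 256;
real *d_out, *d_table;
int *d_ids;
cudaMalloc(&d_out, sizeof(real) * numSamples * dim);
cudaMalloc(&d_table, sizeof(real) * tableSize * dim);
cudaMalloc(&d_ids, sizeof(int) * numSamples);
// forward-style gather: out[i] = table[ids[i]]
hl_matrix_select_rows(d_out, /*ldo=*/dim, d_table, /*ldt=*/dim,
                      d_ids, numSamples, tableSize, dim);
// backward-style scatter-add: table[ids[i]] += out[i]
hl_matrix_add_to_rows(d_table, /*ldt=*/dim, d_out, /*ldi=*/dim,
                      d_ids, numSamples, tableSize, dim);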
template<class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(T* dst, int sized,
const T* src, int sizes,
const int* ids, int sizei) {
template <class T, int blockDimX, int gridDimX>
__global__ void KeVectorSelect(
T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
int idx = threadIdx.x + blockDimX * blockIdx.x;
while (idx < sizei) {
int index = ids[idx];
......@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
}
template <class T>
void hl_vector_select_from(T* dst, int sized,
const T* src, int sizes,
const int* ids, int sizei) {
void hl_vector_select_from(
T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
CHECK_NOTNULL(dst);
CHECK_NOTNULL(src);
CHECK_NOTNULL(ids);
......@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
dim3 threads(512, 1);
dim3 grid(8, 1);
KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>>
(dst, sized, src, sizes, ids, sizei);
KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
dst, sized, src, sizes, ids, sizei);
CHECK_SYNC("hl_vector_select_from failed");
}
template
void hl_vector_select_from(real* dst, int sized,
const real* src, int sizes,
const int* ids, int sizei);
template
void hl_vector_select_from(int* dst, int sized,
const int* src, int sizes,
const int* ids, int sizei);
template void hl_vector_select_from(real* dst,
int sized,
const real* src,
int sizes,
const int* ids,
int sizei);
template void hl_vector_select_from(
int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
......@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "hl_top_k.h"
#include "hl_sparse.ph"
#include "hl_top_k.h"
#include "paddle/utils/Logging.h"
// using namespace hppl;
struct Pair {
__device__ __forceinline__
Pair() {}
__device__ __forceinline__ Pair() {}
__device__ __forceinline__
Pair(real value, int id) : v_(value), id_(id) {}
__device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}
__device__ __forceinline__
void set(real value, int id) {
__device__ __forceinline__ void set(real value, int id) {
v_ = value;
id_ = id;
}
__device__ __forceinline__
void operator=(const Pair& in) {
__device__ __forceinline__ void operator=(const Pair& in) {
v_ = in.v_;
id_ = in.id_;
}
__device__ __forceinline__
bool operator<(const real value) const {
__device__ __forceinline__ bool operator<(const real value) const {
return (v_ < value);
}
__device__ __forceinline__
bool operator<(const Pair& in) const {
__device__ __forceinline__ bool operator<(const Pair& in) const {
return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
}
__device__ __forceinline__
bool operator>(const Pair& in) const {
__device__ __forceinline__ bool operator>(const Pair& in) const {
return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
}
......@@ -58,8 +50,9 @@ struct Pair {
int id_;
};
__device__ __forceinline__
void addTo(Pair topK[], const Pair &p, int beamSize) {
__device__ __forceinline__ void addTo(Pair topK[],
const Pair& p,
int beamSize) {
for (int k = beamSize - 2; k >= 0; k--) {
if (topK[k] < p) {
topK[k + 1] = topK[k];
......@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
topK[0] = p;
}
template<int beamSize>
__device__ __forceinline__
void addTo(Pair topK[], const Pair &p) {
template <int beamSize>
__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
for (int k = beamSize - 2; k >= 0; k--) {
if (topK[k] < p) {
topK[k + 1] = topK[k];
......@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
topK[0] = p;
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(
Pair topK[], real* src, int idx, int dim, int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < src[idx]) {
Pair tmp(src[idx], idx);
......@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
}
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *src, int idx, int dim,
const Pair& max, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(
Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < src[idx]) {
Pair tmp(src[idx], idx);
......@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
}
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *val, int *col,
int idx, int dim, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(
Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < val[idx]) {
Pair tmp(val[idx], col[idx]);
......@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
}
}
template<int blockSize>
__device__ __forceinline__
void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
const Pair& max, int beamSize) {
template <int blockSize>
__device__ __forceinline__ void getTopK(Pair topK[],
real* val,
int* col,
int idx,
int dim,
const Pair& max,
int beamSize) {
while (idx < dim) {
if (topK[beamSize - 1] < val[idx]) {
Pair tmp(val[idx], col[idx]);
......@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
}
}
template<int maxLength, int blockSize>
__device__ __forceinline__
void threadGetTopK(Pair topK[], int& beam, int beamSize,
template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
int& beam,
int beamSize,
real* src,
bool& firstStep, bool& isEmpty, Pair& max,
int dim, const int tid) {
bool& firstStep,
bool& isEmpty,
Pair& max,
int dim,
const int tid) {
if (beam > 0) {
int length = beam < beamSize ? beam : beamSize;
if (firstStep) {
......@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
if (!isEmpty) {
getTopK<blockSize>(topK + maxLength - beam, src, tid, dim,
max, length);
getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
}
}
......@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
template<int maxLength, int blockSize>
__device__ __forceinline__
void threadGetTopK(Pair topK[], int& beam, int beamSize,
real* val, int* col,
bool& firstStep, bool& isEmpty, Pair& max,
int dim, const int tid) {
template <int maxLength, int blockSize>
__device__ __forceinline__ void threadGetTopK(Pair topK[],
int& beam,
int beamSize,
real* val,
int* col,
bool& firstStep,
bool& isEmpty,
Pair& max,
int dim,
const int tid) {
if (beam > 0) {
int length = beam < beamSize ? beam : beamSize;
if (firstStep) {
......@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
if (!isEmpty) {
getTopK<blockSize>(topK + maxLength - beam, val, col, tid, dim,
max, length);
getTopK<blockSize>(
topK + maxLength - beam, val, col, tid, dim, max, length);
}
}
......@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
}
}
template<int maxLength, int blockSize>
__device__ __forceinline__
void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
real** topVal, int** topIds,
int& beam, int& beamSize,
const int tid, const int warp) {
template <int maxLength, int blockSize>
__device__ __forceinline__ void blockReduce(Pair* shTopK,
int* maxId,
Pair topK[],
real** topVal,
int** topIds,
int& beam,
int& beamSize,
const int tid,
const int warp) {
while (true) {
__syncthreads();
if (tid < blockSize / 2) {
......@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
}
}
__syncthreads();
for (int stride = blockSize / 4; stride > 0; stride = stride/2) {
for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
if (tid < stride) {
if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
maxId[tid] = maxId[tid + stride];
......@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until the topK value is obtained.
*/
template<int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal, int ldv,
int * topIds,
real* src, int lds,
template <int maxLength, int blockSize>
__global__ void KeMatrixTopK(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int beamSize) {
__shared__ Pair shTopK[blockSize];
......@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
topK[k].set(-HL_FLOAT_MAX, -1);
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
threadGetTopK<maxLength, blockSize>(
topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
blockReduce<maxLength, blockSize>(
shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
}
template<int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal, int ldv,
int * topIds,
template <int maxLength, int blockSize>
__global__ void KeSMatrixTopK(real* topVal,
int ldv,
int* topIds,
real* val,
int* row,
int* col,
......@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
topK[k].set(-HL_FLOAT_MAX, -1);
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
threadGetTopK<maxLength, blockSize>(
topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
blockReduce<maxLength, blockSize>(
shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
}
void hl_matrix_top_k(real* topVal, int ldv,
int * topIds,
real* src, int lds,
void hl_matrix_top_k(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int beamSize,
int numSamples) {
......@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, src, lds, dim, beamSize);
KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
topVal, ldv, topIds, src, lds, dim, beamSize);
CHECK_SYNC("hl_matrix_top_k failed");
}
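A usage sketch for the dense entry point above: one CUDA block processes one row of the numSamples x dim source matrix, and beamSize is the k. The sizes and row-major layout here are assumptions consistent with the leading-dimension arguments:

// Sketch only: per-row top-k over a dense matrix.
int numSamples = 128, dim = 10000, k = 5;
real *d_src, *d_topVal;
int *d_topIds;
cudaMalloc(&d_src, sizeof(real) * numSamples * dim);
cudaMalloc(&d_topVal, sizeof(real) * numSamples * k);
cudaMalloc(&d_topIds, sizeof(int) * numSamples * k);
hl_matrix_top_k(d_topVal, /*ldv=*/k, d_topIds, d_src, /*lds=*/dim,
                dim, /*beamSize=*/k, numSamples);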
void hl_sparse_matrix_top_k(real* topVal, int ldv,
int * topIds,
void hl_sparse_matrix_top_k(real* topVal,
int ldv,
int* topIds,
hl_sparse_matrix_s src,
int beamSize,
int numSamples) {
CHECK_NOTNULL(topVal);
CHECK_NOTNULL(topIds);
CHECK_NOTNULL(src);
CHECK_EQ(src->format, HL_SPARSE_CSR)
<<"sparse matrix format error!";
CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";
hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
if (csr->csr_val == NULL || csr->csr_row == NULL ||
csr->csr_col == NULL) {
if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
LOG(FATAL) << "parameter src is null!";
}
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeSMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
......@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go to the first step, until the topK value is obtained.
*/
template<int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
int * topIds,
real* src, int lds,
template <int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int beamSize,
int* label,
......@@ -420,12 +432,12 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
threadGetTopK<maxLength, blockSize>(
topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
blockReduce<maxLength, blockSize>(
shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
__syncthreads();
......@@ -440,9 +452,11 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
}
}
void hl_matrix_classification_error(real* topVal, int ldv,
void hl_matrix_classification_error(real* topVal,
int ldv,
int* topIds,
real* src, int lds,
real* src,
int lds,
int dim,
int topkSize,
int numSamples,
......@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeMatrixTopKClassificationError<5, 256>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
CHECK_SYNC("hl_matrix_top_k classification error failed");
}
......@@ -8,15 +8,19 @@ cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_test(variable_test SRCS variable_test.cc)
cc_test(scope_test SRCS scope_test.cc)
proto_library(attr_type SRCS attr_type.proto)
proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
cc_library(scope SRCS scope.cc)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
proto_library(attribute_proto SRCS attribute.proto)
proto_library(op_proto SRCS op_proto.proto DEPS attribute_proto)
proto_library(op_desc SRCS op_desc.proto DEPS attribute_proto)
cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
cc_library(attribute SRCS attribute.cc DEPS op_desc op_proto)
cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope attribute)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
......@@ -24,10 +28,19 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.proto)
# Generate an empty __init__.py to make framework_py_proto a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
cc_library(net SRCS net.cc DEPS op_registry)
cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op)
cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward)
cc_library(paddle_pybind SHARED
SRCS pybind.cc
DEPS pybind python backward
fc_op
sgd_op
add_op
mean_op
cross_entropy_op
recurrent_op)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/attribute.h"
#include <vector>
namespace paddle {
namespace framework {
template <>
AttrType AttrTypeID<int>() {
return INT;
}
template <>
AttrType AttrTypeID<float>() {
return FLOAT;
}
template <>
AttrType AttrTypeID<std::string>() {
return STRING;
}
template <>
AttrType AttrTypeID<std::vector<int>>() {
return INTS;
}
template <>
AttrType AttrTypeID<std::vector<float>>() {
return FLOATS;
}
template <>
AttrType AttrTypeID<std::vector<std::string>>() {
return STRINGS;
}
Attribute GetAttrValue(const AttrDesc& attr_desc) {
switch (attr_desc.type()) {
case paddle::framework::AttrType::INT: {
return attr_desc.i();
}
case paddle::framework::AttrType::FLOAT: {
return attr_desc.f();
}
case paddle::framework::AttrType::STRING: {
return attr_desc.s();
}
case paddle::framework::AttrType::INTS: {
std::vector<int> val(attr_desc.ints_size());
for (int i = 0; i < attr_desc.ints_size(); ++i) {
val[i] = attr_desc.ints(i);
}
return val;
}
case paddle::framework::AttrType::FLOATS: {
std::vector<float> val(attr_desc.floats_size());
for (int i = 0; i < attr_desc.floats_size(); ++i) {
val[i] = attr_desc.floats(i);
}
return val;
}
case paddle::framework::AttrType::STRINGS: {
std::vector<std::string> val(attr_desc.strings_size());
for (int i = 0; i < attr_desc.strings_size(); ++i) {
val[i] = attr_desc.strings(i);
}
return val;
}
}
PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
return boost::blank();
}
} // namespace framework
} // namespace paddle
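A small round-trip sketch for GetAttrValue. The setters used here (set_type, add_ints) are the protobuf-generated counterparts of the accessors read in the switch above, and are assumptions to that extent:

// Sketch only: build an AttrDesc and read it back through GetAttrValue.
paddle::framework::AttrDesc desc;
desc.set_type(paddle::framework::AttrType::INTS);
desc.add_ints(1);
desc.add_ints(2);
desc.add_ints(3);
// GetAttrValue returns the boost::variant Attribute; unwrap the alternative.
paddle::framework::Attribute attr = paddle::framework::GetAttrValue(desc);
std::vector<int> dims = boost::get<std::vector<int>>(attr);  // {1, 2, 3}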
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <boost/variant.hpp>
......@@ -6,6 +20,9 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/framework/attribute.pb.h"
#include "paddle/framework/op_desc.pb.h"
#include "paddle/platform/enforce.h"
namespace paddle {
......@@ -14,13 +31,19 @@ namespace framework {
typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>>
Attribute;
typedef std::unordered_map<std::string, Attribute> AttributeMap;
template <typename T>
AttrType AttrTypeID();
Attribute GetAttrValue(const AttrDesc& attr_desc);
// Check whether a value (attribute) fits a certain limit.
template <typename T>
class LargerThanChecker {
public:
LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
}
......@@ -35,7 +58,8 @@ class LargerThanChecker {
template <typename T>
class DefaultValueSetter {
public:
DefaultValueSetter(T default_value) : default_value_(default_value) {}
explicit DefaultValueSetter(T default_value)
: default_value_(default_value) {}
void operator()(T& value) const { value = default_value_; }
private:
......@@ -78,7 +102,8 @@ class TypedAttrChecker {
typedef std::function<void(T&)> ValueChecker;
public:
TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {}
explicit TypedAttrChecker(const std::string& attr_name)
: attr_name_(attr_name) {}
TypedAttrChecker& InEnum(const std::unordered_set<T>& range) {
value_checkers_.push_back(EnumInContainer<T>(range));
......
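To show how the checker pieces above compose, here is a hedged sketch; only InEnum is visible in this hunk, so the bound and default checkers are invoked directly as functors rather than through any (collapsed) TypedAttrChecker chaining methods:

// Sketch only: exercising the visible checker building blocks.
paddle::framework::TypedAttrChecker<int> checker("num_classes");
checker.InEnum({2, 10, 100});  // value must be one of these

// The functor-style checkers also run standalone:
paddle::framework::LargerThanChecker<float> positive(0.0f);
float lr = 0.01f;
positive(lr);  // PADDLE_ENFORCE fails unless lr > 0

paddle::framework::DefaultValueSetter<float> set_default(0.5f);
float dropout;          // left uninitialized on purpose
set_default(dropout);   // dropout == 0.5f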
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax="proto2";
syntax = "proto2";
package paddle.framework;
// Attribute Type for paddle's Op.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/backward.h"
#include <list>
#include "paddle/framework/op_registry.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace framework {
static bool AllInSet(const std::vector<std::string>& names,
const std::string& suffix,
const std::unordered_set<std::string>& set) {
for (auto& name : names) {
if (set.find(name + suffix) == set.end()) {
return false;
}
}
return true;
}
static std::shared_ptr<OperatorBase> NOP() {
auto net_op = std::make_shared<operators::NetOp>();
net_op->type_ = "@NOP@";
net_op->CompleteAddOp();
return net_op;
}
// Get the backward operator from a forward operator; a recursive
// implementation.
//
// no_grad_names contains the gradient variable names that need no gradient
// calculation.
//
// uniq_id is a unique index used inside the recursive calls of
// BackwardRecursive. Use `uid = uniq_id++;` to get the unique index, and
// pass `uniq_id` through the recursive calls.
//
// Returns the backward operator. For a simple situation it is a plain
// operator; for a complex situation it is a NetOp.
//
// See Backward.h for details.
static std::shared_ptr<OperatorBase> BackwardRecursive(
const OperatorBase& forwardOp,
std::unordered_set<std::string>& no_grad_names, size_t& uniq_id);
std::shared_ptr<OperatorBase> BackwardRecursive(
const OperatorBase& forwardOp,
std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
  // If none of the input gradients of the forward operator need to be
  // calculated, just return a NOP. We do not return a null pointer, because
  // running a NOP costs little and it keeps the surrounding logic simple.
if (AllInSet(forwardOp.inputs_, kGradVarSuffix, no_grad_names)) {
return NOP();
}
  // If none of the output gradients of the forward operator need to be
  // calculated, then no input gradient can be computed either; put them all
  // into the `no_grad_names` set and return a NOP.
if (AllInSet(forwardOp.outputs_, kGradVarSuffix, no_grad_names)) {
for (auto& name : forwardOp.inputs_) {
// Mark all inputs as not needed.
no_grad_names.insert(name + kGradVarSuffix);
}
return NOP();
}
// Returned gradient network
auto net = std::make_shared<operators::NetOp>();
if (forwardOp.IsNetOp()) {
// Because forwardOp is a net op, a static_cast is safe here.
auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
    // Map from an output gradient variable name to the indices, in the
    // backward net, of the operators that generate that variable.
std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
size_t local_op_id = 0;
// Traverse forwardNet in reverse order.
for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
++it, ++local_op_id) {
auto fwd = *it;
auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id);
net->AddOp(bwd);
for (auto& out : bwd->outputs_) {
dup_output_ops[out].emplace_back(local_op_id);
}
}
// Get unique ID for this method.
auto uid = uniq_id++;
// TODO(dzh): more comment
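    // When several ops in the backward net write the same output gradient,
    // rename each duplicate to `<name>@RENAME@<uid>@<i>` and insert an "add"
    // op after the last writer, summing the renamed copies back into `name`.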
using Pos = std::pair<size_t, std::shared_ptr<OperatorBase>>;
std::list<Pos> insert_position;
for (auto& dup_output_op : dup_output_ops) {
const std::string& name = dup_output_op.first;
auto& dup_op = dup_output_op.second;
if (dup_op.size() == 1) continue;
std::vector<std::string> dup_outputs;
for (size_t i = 0; i < dup_op.size(); ++i) {
auto op_offset = dup_op[i];
dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
std::to_string(i));
net->ops_[op_offset]->Rename(name, dup_outputs.back());
}
insert_position.push_back(
{dup_op.back(),
OpRegistry::CreateOp(
"add", {dup_outputs}, {name},
{{"input_format",
std::vector<int>{0, static_cast<int>(dup_outputs.size())}}})});
}
insert_position.sort(
[](const Pos& l, const Pos& r) { return l.first > r.first; });
for (auto& pos : insert_position) {
net->InsertOp(pos.first + 1, pos.second);
}
} else {
std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
for (std::string& grad_input : grad_op->inputs_) {
if (no_grad_names.count(grad_input)) {
std::string prefix =
grad_input.substr(0, grad_input.size() - kGradVarSuffix.size());
grad_input = prefix + kZeroVarSuffix;
        // If part of this operator's input gradient is not to be calculated,
        // feed a zero-filled variable in for that input gradient.
net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {prefix},
{grad_input}, {}));
}
}
for (std::string& grad_output : grad_op->outputs_) {
if (no_grad_names.count(grad_output)) {
grad_output = kEmptyVarName;
}
}
    if (net->ops_.empty()) {  // no auxiliary op was added to the network
return grad_op;
}
net->AddOp(grad_op);
}
net->type_ = "@GENERATED_BACKWARD@";
net->CompleteAddOp();
return net;
}
// See header for comments
std::shared_ptr<OperatorBase> Backward(
const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string> no_grad_names;
no_grad_names.reserve(no_grad_vars.size());
no_grad_names.insert(kEmptyVarName + kGradVarSuffix);
for (auto& name : no_grad_vars) {
no_grad_names.insert(name + kGradVarSuffix);
}
size_t uid = 0;
return BackwardRecursive(forwardOp, no_grad_names, uid);
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <unordered_set>
#include "operator.h"
namespace paddle {
namespace framework {
// Create the backward operator from a forward operator.
// TODO(yuyang18): Add more API reference comment.
extern std::shared_ptr<OperatorBase> Backward(
const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars);
} // namespace framework
} // namespace paddle
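A hedged usage sketch of the API above. The "add" op with inputs {"X", "Y"} and output {"Out"} is an assumed example, not taken from this commit; CreateOp's (type, inputs, outputs, attrs) shape follows its uses in backward.cc:

// Sketch only: the op type and variable names are illustrative assumptions.
auto fwd = paddle::framework::OpRegistry::CreateOp(
    "add", {"X", "Y"}, {"Out"}, {});
auto bwd = paddle::framework::Backward(*fwd, /*no_grad_vars=*/{"Y"});
// For a plain op, bwd is a single grad op; for a composite op it is a
// NetOp. The gradient slot for "Y" is dropped to kEmptyVarName per the
// no_grad handling in BackwardRecursive.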
......@@ -25,18 +25,15 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace {
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
Dim<8>, Dim<9>>
DDimVar;
}
/**
* \brief A dynamically sized dimension.
*
* The number of dimensions must be between [1, 9].
*/
struct DDim {
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
Dim<8>, Dim<9>>
DDimVar;
DDimVar var;
DDim() : var(Dim<1>()) {}
......
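A short sketch of the variant-backed DDim above; it uses only what this hunk shows (the public var member, the Dim<N> alternatives, the Dim<1> default), plus the assumption that Dim<2> is default-constructible like Dim<1>:

// Sketch only: re-seating and reading the DDim variant.
paddle::framework::DDim d;            // default-constructed: var holds Dim<1>()
d.var = paddle::framework::Dim<2>();  // re-seat the variant as a 2-D dim
// boost::get pulls out the concrete alternative (throws on a mismatch):
auto& d2 = boost::get<paddle::framework::Dim<2>>(d.var);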
......@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax="proto2";
syntax = "proto2";
package paddle.framework;
import "attr_type.proto";
import "attribute.proto";
// AttrDesc is used to describe Attributes of an Operator. It contains the
// name, type, and value of an Attribute.
......